def test_gen_data(self):
    TeachAIKB().construct_kb()
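
# Usage sketch (an assumption, not code from this file): build_artificial_dataset() below is driven by
# an argparse-style `args` object; the only attributes the builders read are `variant`, `split`, and
# (in the counting builder) `experiment_version`. Something like:
#
#   import argparse
#   args = argparse.Namespace(variant='training_mix', split='train', experiment_version=None)
#   artiset.build_artificial_dataset(args)   # `artiset` is a hypothetical builder instance
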
def build_artificial_dataset(self, args):
    # examples_meta is a pandas DataFrame that contains all examples with additional metadata for
    # the task, and will be automatically saved as a "..._meta.jsonl" file with the artiset files
    self.examples_meta = []
    random.seed(17)
    logger.info("building examples")

    # Find good hypernym candidates (fixed list for now)
    hypernyms = {}
    dev_objects = ['tree', 'flower', 'fruit', 'music', 'bird', 'alcohol', 'plant']
    # Sampling from hypernyms in our dataset that are true
    hypernyms['dev'] = TeachAIKB().sample({'predicate': ['hypernym'],
                                           'source_not_in': ['wikidata'],
                                           'object': dev_objects,
                                           'validity': ['always true']}, tar_tag='implicit_rule')
    hypernyms['train'] = TeachAIKB().sample({'predicate': ['hypernym'],
                                             'source_not_in': ['wikidata'],
                                             'object_not_in': dev_objects,
                                             'validity': ['always true']}, tar_tag='implicit_rule')

    for split, bridge_rules in hypernyms.items():
        if args.split is not None and split != args.split:
            continue
        logger.info(f'--------------- {split} ------------------')

        # To have only true rules, "never true" is expressed with negative language: "never true" + negative = "always true"
        # Adding a property for each hypernym object ("animal") --> "animals are capable of living"
        hypernym_property_positive = TeachAIKB().connect(connect_to=bridge_rules, max_to_connect=5,
                                                         constraints={'validity': 'always true',
                                                                      'predicate_not_in': ['hypernym']},
                                                         src_tags=['implicit_rule'],
                                                         connection_point=[{'object': 'subject'}],
                                                         tar_tag='property')

        # now that we have positive examples we will try to create negative examples that mimic the
        # distribution of the positive ones.
        hypernym_property_negative = self.self_negative_subject_sample(hypernym_property_positive,
                                                                       sample_on='property',
                                                                       avoid_mixing='hyponyms')

        # creating a statement by applying downward monotonicity to the hypernym and its property.
        examples = TeachAIKB().connect_downward_monotone(connect_to=hypernym_property_positive + hypernym_property_negative,
                                                         scope='implicit_rule', property='property', tar_tag='statement')

        # Sampling distractors. Each entry is
        # (src_tag, src_fields, num_to_sample, exactly_sample_num, fields_to_take, balance_with_statement)
        self.sample_distractors(examples, tar_tag='distractors', sample=[
            ('property', ['predicate', 'object'], 2, True, ['implicit_rule', 'property'],
             True if args.variant != 'statement_only' else False),
            ('statement', ['predicate'], 2, False, ['statement'], False),
            ('statement', ['subject'], 2, False, ['statement'], False)])

        if True:  # Change condition to config flag if need be
            # Change the implicit distractor to be the negative statement for the main subject.
            # E.g., instead of the "salmon is fish" distractor for "whale is mammal", we make it "whale is not fish"
            for e in examples:
                dist = copy.deepcopy(e['distractors']['implicit_rule'][0])
                dist['subject'] = e['implicit_rule']['subject']
                dist['validity'] = 'never true'
                e['distractors']['implicit_rule'] = [dist]

        # mixing 10% of the implicit rules in as statements (5% positive, 5% negative), in the training mix only.
        if args.variant == 'training_mix' and split == 'train':
            examples += [{'statement': e['implicit_rule']}
                         for e in random.sample(bridge_rules, int(0.05 * float(len(examples))))]
            negative_bridge = self.self_negative_subject_sample(bridge_rules, sample_on='implicit_rule',
                                                                avoid_mixing=['hyponyms'])
            examples += [{'statement': e['implicit_rule']}
                         for e in random.sample(negative_bridge, int(0.05 * float(len(examples))))]

        # for each variation, the proportion in which each rule type will be filtered.
        ablations = {
            'training_mix': [(['implicit_rule'], 0.5), (['property'], 0), (['distractors'], 0.2)],
            'statement_only': [(['implicit_rule'], 1), (['property'], 1)],
            'explicit_only': [(['implicit_rule'], 0), (['property'], 0)],
            'corrected_only': [(['implicit_rule'], 0), (['property'], 0)],
            'corrected_inv_only': [(['implicit_rule'], 0), (['property'], 0)],
            'implicit_only': [(['implicit_rule'], 1), (['property'], 0)],
            'statement_only_no_context': [(['implicit_rule'], 1), (['property'], 1), (['distractors'], 1)],
            'statement_subject_lang_selectivity': [(['implicit_rule'], 1), (['property'], 0)],
            'implicit_knowledge_test': [(['statement'], 1), (['implicit_rule'], 0), (['property'], 1), (['distractors'], 1)],
            'implicit_knowledge_distractor_test': [(['statement'], 1), (['implicit_rule'], 1), (['property'], 1), (['distractors'], 1)],
        }

        if args.variant == "corrected_only" or args.variant == "corrected_inv_only":
            # In these variants we update the examples to delete implicit_rule edges according to their
            # absence ("corrected_only") or presence ("corrected_inv_only") in self._incorrect_beliefs.
            # So "corrected_only" means that the context includes only implicit rules that a model got wrong.
            def edge_tuple(e):
                return (e['subject'], e['predicate'], e['object'], e['validity'])

            incorrect_beliefs_set = {edge_tuple(e) for e in self._incorrect_beliefs}

            def delete_edge(e):
                edge = edge_tuple(e)
                if args.variant == "corrected_only":
                    return edge not in incorrect_beliefs_set
                else:
                    return edge in incorrect_beliefs_set

            for e in examples:
                if delete_edge(e['implicit_rule']):
                    del e['implicit_rule']
                dist = e['distractors']
                if delete_edge(dist['implicit_rule'][0]):
                    del dist['implicit_rule']

        if args.variant == 'implicit_knowledge_test':
            self.build_statement_rule_property_examples(examples, split=split, statement_tag='implicit_rule',
                                                        rule_tags=[], distractor_tags=[])
        elif args.variant == 'implicit_knowledge_distractor_test':
            for e in examples:
                e['distractor_implicit_rule'] = copy.deepcopy(e['implicit_rule'])
                e['distractor_implicit_rule']['object'] = e['distractors']['implicit_rule'][0]['object']
                e['distractor_implicit_rule']['validity'] = 'never true'
            self.build_statement_rule_property_examples(examples, split=split, statement_tag='distractor_implicit_rule',
                                                        rule_tags=[], distractor_tags=[])

        # Actively splitting between test and dev (50/50)
        if split == 'dev':
            # making sure that for the same number of examples the split will always be the same.
            random.seed(17)
            all_inds = [i for i in range(len(examples))]
            dev_inds = random.sample(all_inds, int(len(all_inds) / 2))
            test_inds = list(set(all_inds) - set(dev_inds))
            splits = [('dev', [examples[i] for i in dev_inds]),
                      ('test', [examples[i] for i in test_inds])]
        else:
            splits = [('train', examples)]

        for final_split, final_examples in splits:
            if args.variant == 'implicit_knowledge_test':
                self.build_statement_rule_property_examples(final_examples, split=final_split,
                                                            statement_tag='implicit_rule',
                                                            rule_tags=[], distractor_tags=[])
            elif args.variant == 'implicit_knowledge_distractor_test':
                for e in final_examples:
                    e['distractor_implicit_rule'] = copy.deepcopy(e['implicit_rule'])
                    e['distractor_implicit_rule']['object'] = e['distractors']['implicit_rule'][0]['object']
                    e['distractor_implicit_rule']['validity'] = 'never true'
                self.build_statement_rule_property_examples(final_examples, split=final_split,
                                                            statement_tag='distractor_implicit_rule',
                                                            rule_tags=[], distractor_tags=[])
            else:
                self.build_statement_rule_property_examples(final_examples, split=final_split,
                                                            ablation_list=ablations[args.variant])

    self.print_examples(20)
    self.print_stats()
    self.examples_meta = pd.DataFrame(self.examples_meta)
    self.save_dataset()
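
# Shape of one example as it flows through the hypernym pipeline above (illustrative values, not taken
# from the KB): each tagged edge is a dict with 'subject', 'predicate', 'object' and 'validity'
# ('always true' / 'never true'), and 'distractors' maps a tag name to a list of such edges.
#
#   example = {
#       'implicit_rule': {'subject': 'whale', 'predicate': 'hypernym', 'object': 'mammal', 'validity': 'always true'},
#       'property': {'subject': 'mammal', 'predicate': 'capable of', 'object': 'living', 'validity': 'always true'},
#       'statement': {'subject': 'whale', 'predicate': 'capable of', 'object': 'living', 'validity': 'always true'},
#       'distractors': {'implicit_rule': [...], 'property': [...], 'statement': [...]},
#   }
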
def connect_negative_shuffle_subject(self, shuffle, shuffle_on, tar_tag, avoid_mixing=None):
    logger.info(f'connect_negative_shuffle_subject {tar_tag}')
    # We assume shuffle_on is only one field (usually predicate or object).
    # Finding "clusters" that may not be shuffled internally when producing negative examples
    # (because they have downward monotone relations)
    connect_to = deepcopy(shuffle)
    triplets_to_shuffle_df = pd.DataFrame([e[shuffle_on] for e in shuffle])
    field_to_shuffle_counts = triplets_to_shuffle_df['subject'].value_counts()
    subjects_to_shuffle = set(triplets_to_shuffle_df['subject'])
    remaining_inds_to_choose = set(triplets_to_shuffle_df.index)
    for curr_subject, size in field_to_shuffle_counts.items():
        potential_target_inds = deepcopy(remaining_inds_to_choose)
        tar_subjects = subjects_to_shuffle - {curr_subject}
        tar_subjects -= {e['subject'] for e in TeachAIKB().sample({'predicate': 'hypernym',
                                                                   'object': curr_subject})}
        if avoid_mixing is not None and 'co-hyponyms' in avoid_mixing:
            subject_is_hyponym_of = {e['object'] for e in TeachAIKB().sample({'subject': curr_subject,
                                                                              'predicate': 'hypernym'})}
            tar_subjects -= {e['subject'] for e in TeachAIKB().sample({'predicate': 'hypernym',
                                                                       'object': list(subject_is_hyponym_of)})}
        if avoid_mixing is not None and 'co-meronyms' in avoid_mixing:
            subject_is_meronym_of = {e['subject'] for e in self.sample({'predicate': 'meronym',
                                                                        'object': curr_subject})}
            tar_subjects -= {e['object'] for e in self.sample({'predicate': 'meronym',
                                                               'subject': list(subject_is_meronym_of)})}
        potential_target_inds &= set(triplets_to_shuffle_df[triplets_to_shuffle_df['subject'].isin(tar_subjects)].index)

        targets = [e for e in connect_to if e[shuffle_on]['subject'] == curr_subject]
        selected_inds = []
        for i in random.sample(list(potential_target_inds), len(potential_target_inds)):
            new_edge = {'subject': curr_subject,
                        'predicate': triplets_to_shuffle_df.loc[i, 'predicate'],
                        'object': triplets_to_shuffle_df.loc[i, 'object']}
            # checking that there is no triplet in the KB that is true with the same values:
            matching_edges_in_kb = self.lookup(new_edge)
            if len(matching_edges_in_kb) == 0:
                targets[len(selected_inds)][tar_tag] = new_edge
                targets[len(selected_inds)][tar_tag].update({'validity': 'never true'})
                selected_inds.append(i)
            if len(selected_inds) >= len(targets):
                break
        if len(selected_inds) < len(targets):
            logger.debug(f'did not find enough for {curr_subject}: {len(selected_inds)} found, {len(targets)} required')
        else:
            logger.debug(f'{curr_subject}: {len(selected_inds)} found.')
        remaining_inds_to_choose -= set(selected_inds)
    return connect_to
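
# Sketch of the intended call (it mirrors the commented-out call in the meronym builder below): `shuffle`
# is a list of tagged examples, `shuffle_on` names the tag whose (predicate, object) pairs are reassigned
# to other subjects, and the resulting negatives are written back under `tar_tag` with validity 'never true'.
#
#   negatives = TeachAIKB().connect_negative_shuffle_subject(shuffle=meronym_property_positive,
#                                                            shuffle_on='property', tar_tag='property',
#                                                            avoid_mixing=['co-meronyms'])
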
def create_subject_filter_lookup(self, examples, sample_on=None, avoid_mixing=None):
    if sample_on is not None:
        triplets_to_sample_on = [e[sample_on] for e in examples]
    else:
        triplets_to_sample_on = examples

    # building subject filter lookup:
    subject_filter_lookup = {}
    rules_to_sample_df = pd.DataFrame(triplets_to_sample_on)
    for curr_subject, matching_records in tqdm(rules_to_sample_df.groupby('subject')):
        subject_to_filter = {curr_subject}
        if avoid_mixing is not None and 'predicates' in avoid_mixing:
            subject_to_filter |= set(rules_to_sample_df[~rules_to_sample_df['predicate'].isin(
                set(matching_records['predicate']))]['subject'])
        if avoid_mixing is not None and 'hyponyms' in avoid_mixing:
            subject_to_filter |= {e['subject'] for e in TeachAIKB().sample({'predicate': 'hypernym',
                                                                            'object': curr_subject})}
        if avoid_mixing is not None and 'co-hyponyms' in avoid_mixing:
            subject_is_hyponym_of = {e['object'] for e in TeachAIKB().sample({'subject': curr_subject,
                                                                              'predicate': 'hypernym'})}
            subject_to_filter |= {e['subject'] for e in TeachAIKB().sample({'predicate': 'hypernym',
                                                                            'object': list(subject_is_hyponym_of)})}
        if avoid_mixing is not None and 'co-meronyms' in avoid_mixing:
            subject_is_meronym_of = {e['subject'] for e in TeachAIKB().sample({'predicate': 'meronym',
                                                                               'object': curr_subject})}
            subject_to_filter |= {e['object'] for e in TeachAIKB().sample({'predicate': 'meronym',
                                                                           'subject': list(subject_is_meronym_of)})}
        subject_filter_lookup[curr_subject] = subject_to_filter
    return subject_filter_lookup
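
# Return-value sketch (subjects are illustrative): create_subject_filter_lookup() maps every subject in
# `examples` to the set of subjects that must NOT be used as its replacement during negative sampling,
# i.e. itself plus, depending on `avoid_mixing`, its hyponyms, co-hyponyms and co-meronyms.
#
#   {'whale': {'whale', 'blue whale', ...}, 'door': {'door', ...}, ...}
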
def build_statement_rule_property_examples(self, examples, split, statement_tag='statement',
                                           ablate_same_distractor_fields=1.0,
                                           rule_tags=['implicit_rule', 'property'],
                                           distractor_tags=['distractors'],
                                           ablation_list=[], use_shorthand=False,
                                           nlg_sampling=False, reverse_validity_frac=0):
    # computing the ID before ablations, on the statement and rule tags:
    for i, example in enumerate(examples):
        m = hashlib.md5()
        # note that the tags for ID creation are always the same!
        for tag in [statement_tag] + rule_tags:
            if tag in example:
                if type(example[tag]) == list:
                    for e in example[tag]:
                        m.update(e['subject'].encode())
                        m.update(e['predicate'].encode())
                        m.update(e['object'].encode())
                        m.update(e['validity'].encode())
                else:
                    m.update(example[tag]['subject'].encode())
                    m.update(example[tag]['predicate'].encode())
                    m.update(example[tag]['object'].encode())
                    m.update(example[tag]['validity'].encode())
        example['id'] = m.hexdigest()

    # Ablations
    # now that all the examples are ready, we can ablate as needed:
    random.seed(17)
    for ablation in ablation_list:
        if len(ablation) == 3:
            fields, fraction, condition = ablation
            examples_cands = [e for e in examples if e[condition[0]] in condition[1]]
        else:
            fields, fraction = ablation
            examples_cands = examples
        example_to_ablate = random.sample(examples_cands, int(fraction * float(len(examples))))
        for e in example_to_ablate:
            for field in fields:
                if field in e:
                    del e[field]
                    # for every field we ablate we must ablate the same field from the distractors!
                    if random.random() < ablate_same_distractor_fields:
                        for distractor_tag in distractor_tags:
                            if distractor_tag in e:
                                if field in e[distractor_tag]:
                                    del e[distractor_tag][field]

    random.seed(17)
    for i, example in enumerate(examples):
        context_rules = []
        # adding actual rules
        for rule_tag in rule_tags:
            if rule_tag in example:
                rules = example[rule_tag]
                if not type(rules) == list:
                    rules = [rules]
                for rule in rules:
                    reverse_validity = not rule['validity'] == 'always true'
                    context_rules.append(TeachAIKB().to_pseudo_language(rule, is_rule=True,
                                                                        reverse_validity=reverse_validity,
                                                                        use_shorthand=use_shorthand,
                                                                        nlg_sampling=nlg_sampling))
        # adding distractors
        for rule_tag in distractor_tags:
            if rule_tag in example:
                for field, tag_distractors in example[rule_tag].items():
                    for rule in tag_distractors:
                        rule_list = rule
                        if not type(rule_list) == list:
                            rule_list = [rule_list]
                        for r in rule_list:
                            reverse_validity = not r['validity'] == 'always true'
                            context_rules.append(TeachAIKB().to_pseudo_language(r, is_rule=True,
                                                                                reverse_validity=reverse_validity,
                                                                                use_shorthand=use_shorthand,
                                                                                nlg_sampling=nlg_sampling))

        use_hypothetical_statement = False
        if 'is_hypothetical_statement' in example and example['is_hypothetical_statement']:
            use_hypothetical_statement = True

        answer = 1 if example[statement_tag]['validity'] == 'always true' else 0
        if self.variant != 'statement_subject_lang_selectivity':
            if random.random() < reverse_validity_frac:
                answer = 1 - answer
                reverse_validity = True
            else:
                reverse_validity = False
            phrase = TeachAIKB().to_pseudo_language(example[statement_tag], is_rule=False,
                                                    use_shorthand=use_shorthand,
                                                    use_hypothetical_statement=use_hypothetical_statement,
                                                    nlg_sampling=nlg_sampling,
                                                    reverse_validity=reverse_validity)
        else:
            statement_dict = deepcopy(example[statement_tag])
            statement_dict['subject'] = random.sample(['foo', 'blah', 'ya', 'qux', 'aranglopa', 'foltopia',
                                                       'cakophon', 'baz', 'garply'], 1)[0]
            phrase = TeachAIKB().to_pseudo_language(statement_dict, is_rule=False,
                                                    use_shorthand=use_shorthand,
                                                    use_hypothetical_statement=use_hypothetical_statement,
                                                    nlg_sampling=nlg_sampling)

        # creating a unique set of rules that does not include the statement.
        context_rules = list(set(context_rules))
        # set order is random, so we sort first to get a replicable order before shuffling.
        context_rules = sorted(context_rules)
        random.shuffle(context_rules)

        example.update({'phrase': phrase,
                        'answer': answer,
                        'context': ' '.join(context_rules),
                        'split': split,
                        'rules': context_rules})

        # append_teachyourai_format_example() is a method implemented in the ArtiSet class; it takes an example dict
        # (which must contain "phrase" and "answer") and converts it to BooleanQA format
        self.append_teachyourai_format_example(example, do_print=False)
        self.examples_meta.append(deepcopy(example))
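
# The ablation_list entries consumed by build_statement_rule_property_examples() are (fields, fraction)
# or (fields, fraction, condition) tuples, for example (values illustrative):
#
#   (['implicit_rule'], 0.5)              # delete 'implicit_rule' from 50% of the examples
#   (['distractors'], 1)                  # delete 'distractors' from every example
#   (['property'], 0.3, (key, values))    # conditional form: only examples with e[key] in values are candidates
#
# Note that in the conditional form the number of ablated examples is still fraction * len(examples),
# not a fraction of the candidate subset; the ablation dicts in this file only use the two-tuple form.
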
def build_artificial_dataset(self, args):
    # examples_meta is a pandas DataFrame that contains all examples with additional metadata for
    # the task, and will be automatically saved as a "..._meta.jsonl" file with the artiset files
    self.examples_meta = []
    random.seed(17)
    logger.info("building examples")

    # Find good meronym candidates (fixed list for now)
    meronyms = {}
    # devset_subjects = [e['subject'] for e in TeachAIKB().sample({'predicate': ['hypernym'],
    #                                                              'object': ['food', 'vehicle', 'road', 'clothing',
    #                                                                         'instrument', 'commodity', 'activity'],
    #                                                              'validity': ['always true']})]
    # Sampling from meronyms in our dataset that are true
    meronyms['dev'] = TeachAIKB().sample({'predicate': ['meronym'],
                                          'object_not_in': ['head', 'storey', 'sauce', 'face', 'room', 'blossom',
                                                            'bedroom', 'sandwich', 'skull', 'doorway', 'hull'],
                                          'validity': ['always true']},
                                         sample_limit=('object', 70), tar_tag='implicit_rule')
    # meronyms['train'] = TeachAIKB().sample({'predicate': ['meronym', 'part of'],
    #                                         # 'subject_not_in': devset_subjects,
    #                                         'validity': ['always true']},
    #                                        sample_limit=('object', 70), tar_tag='implicit_rule')

    for split, bridge_rules in meronyms.items():
        if args.split is not None and split != args.split:
            continue
        logger.info(f'--------------- {split} ------------------')

        # To have only true rules, "never true" is expressed with negative language: "never true" + negative = "always true"
        # Adding a property for each meronym object ("animal") --> "animals are capable of living"
        meronym_property_positive = TeachAIKB().connect(connect_to=bridge_rules, max_to_connect=5,
                                                        constraints={'validity': 'always true',
                                                                     'predicate': ['meronym', 'part of']},
                                                        src_tags=['implicit_rule'],
                                                        connection_point=[{'object': 'subject'}],
                                                        tar_tag='property')

        # now that we have positive examples we will try to create negative examples that mimic the
        # distribution of the positive ones.
        # meronym_property_negative = TeachAIKB().connect_negative_shuffle_subject(shuffle=meronym_property_positive,
        #                                                                          shuffle_on='property', tar_tag='property',
        #                                                                          avoid_mixing=['co-meronyms'])
        meronym_property_negative = self.self_negative_subject_sample(meronym_property_positive, sample_on='property',
                                                                      avoid_mixing=['co-meronyms', 'hyponyms'])

        # creating a statement by applying downward monotonicity to the meronym and its property.
        examples = TeachAIKB().connect_downward_monotone(connect_to=meronym_property_positive + meronym_property_negative,
                                                          scope='implicit_rule', property='property', tar_tag='statement')

        # Sampling distractors. "('statement', ['predicate', 'object'], 2)" means: for each example, sample at most
        # two statements with the same ['predicate', 'object'] as the example statement and add them to distractors.
        self.sample_distractors(examples, sample=[
            ('property', ['predicate', 'object'], 2, True, ['implicit_rule', 'property'],
             True if args.variant != 'statement_only' else False),
            ('statement', ['subject'], 2, False, ['statement'], False),
            ('statement', ['predicate'], 2, False, ['statement'], False)], tar_tag='distractors')

        if True:  # Change condition to config flag if need be
            # Change the implicit distractor to be the negative statement for the main subject.
            # E.g., instead of the "salmon has eye" distractor for "house has door", we make it "house does not have eye"
            for e in examples:
                dist = copy.deepcopy(e['distractors']['implicit_rule'][0])
                dist['subject'] = e['implicit_rule']['subject']
                dist['validity'] = 'never true'
                e['distractors']['implicit_rule'] = [dist]

        # mixing 10% of the implicit rules in as statements (5% positive, 5% negative), in the training mix only.
        if args.variant == 'training_mix':
            examples += [{'statement': e['implicit_rule']}
                         for e in random.sample(bridge_rules, int(0.05 * float(len(examples))))]
            negative_bridge = self.self_negative_subject_sample(bridge_rules, sample_on='implicit_rule',
                                                                avoid_mixing=['co-meronyms'])
            examples += [{'statement': e['implicit_rule']}
                         for e in random.sample(negative_bridge, int(0.05 * float(len(examples))))]

        # for each variation, the proportion in which each rule type will be filtered.
        ablations = {
            'training_mix': [(['implicit_rule'], 0.5), (['property'], 0), (['distractors'], 0.2)],
            'statement_only': [(['implicit_rule'], 1), (['property'], 1)],
            'explicit_only': [(['implicit_rule'], 0), (['property'], 0)],
            'corrected_only': [(['implicit_rule'], 0), (['property'], 0)],
            'corrected_inv_only': [(['implicit_rule'], 0), (['property'], 0)],
            'implicit_only': [(['implicit_rule'], 1), (['property'], 0)],
            'statement_only_no_context': [(['implicit_rule'], 1), (['property'], 1), (['distractors'], 1)],
            'statement_subject_lang_selectivity': [(['implicit_rule'], 1), (['property'], 0)],
            'implicit_knowledge_test': [(['statement'], 1), (['implicit_rule'], 0), (['property'], 1), (['distractors'], 1)],
            'implicit_knowledge_distractor_test': [(['statement'], 1), (['implicit_rule'], 1), (['property'], 1), (['distractors'], 1)],
        }

        if args.variant == "corrected_only" or args.variant == "corrected_inv_only":
            # In these variants we update the examples to delete implicit_rule edges according to their
            # absence ("corrected_only") or presence ("corrected_inv_only") in self._incorrect_beliefs.
            # So "corrected_only" means that the context includes only implicit rules that a model got wrong.
            def edge_tuple(e):
                return (e['subject'], e['predicate'], e['object'], e['validity'])

            incorrect_beliefs_set = {edge_tuple(e) for e in self._incorrect_beliefs}

            def delete_edge(e):
                edge = edge_tuple(e)
                if args.variant == "corrected_only":
                    return edge not in incorrect_beliefs_set
                else:
                    return edge in incorrect_beliefs_set

            for e in examples:
                if delete_edge(e['implicit_rule']):
                    del e['implicit_rule']
                dist = e['distractors']
                if delete_edge(dist['implicit_rule'][0]):
                    del dist['implicit_rule']

        if args.variant == 'implicit_knowledge_test':
            self.build_statement_rule_property_examples(examples, split=split, statement_tag='implicit_rule',
                                                        rule_tags=[], distractor_tags=[])
        elif args.variant == 'implicit_knowledge_distractor_test':
            for e in examples:
                e['distractor_implicit_rule'] = copy.deepcopy(e['implicit_rule'])
                e['distractor_implicit_rule']['object'] = e['distractors']['implicit_rule'][0]['object']
                e['distractor_implicit_rule']['validity'] = 'never true'
            self.build_statement_rule_property_examples(examples, split=split, statement_tag='distractor_implicit_rule',
                                                        rule_tags=[], distractor_tags=[])
        else:
            self.build_statement_rule_property_examples(examples, split=split, ablation_list=ablations[args.variant])

    self.print_examples(20)
    self.print_stats()
    self.examples_meta = pd.DataFrame(self.examples_meta)
    self.save_dataset()
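
# For the "corrected_only" / "corrected_inv_only" variants above, self._incorrect_beliefs is expected to
# be a collection of edge dicts in the same triplet format as the KB (the values below are illustrative):
#
#   self._incorrect_beliefs = [{'subject': 'house', 'predicate': 'meronym', 'object': 'eye', 'validity': 'never true'}, ...]
#
# edge_tuple() reduces each edge to a hashable (subject, predicate, object, validity) tuple so that
# membership in incorrect_beliefs_set can be checked per example.
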
def build_artificial_dataset(self, args):
    # examples_meta is a pandas DataFrame that contains all examples with additional metadata for
    # the task, and will be automatically saved as a "..._meta.jsonl" file with the artiset files
    self.examples_meta = []
    random.seed(17)
    logger.info("building examples")

    # Querying all functional relations from the KB.
    func_rel_triplets = TeachAIKB().sample({'predicate': ['super bowl loser', 'super bowl winner', 'band member',
                                                          'capital', 'director', 'release year', 'founder',
                                                          'headquarter', 'child', 'spouse', 'CEO'],
                                            'source': ['wikidata']},
                                           select=['subject', 'predicate', 'object', 'validity'])

    # split dev and train
    function_relations_split = {'dev': [], 'train': []}
    for e in func_rel_triplets:
        if sum(bytearray(e['object'].encode())) % 7 == 0 or \
                e['object'] in ['Germany', 'United States of America', 'Israel']:
            function_relations_split['dev'].append(e)
        else:
            function_relations_split['train'].append(e)

    for split, function_relations in function_relations_split.items():
        if args.split is not None and split != args.split:
            continue
        logger.info(f'--------------- {split} ------------------')

        function_relations_neg = self.self_negative_subject_sample(function_relations, avoid_mixing='predicates',
                                                                   over_sample=2)

        # In counting we have "counted instances", which are the actual instances to be counted;
        # these are collected by aggregating on predicate and object.
        counted_instances = pd.DataFrame(function_relations).groupby(['predicate', 'object']).apply(
            lambda x: (x.to_dict(orient='records'), len(x)))
        false_statements = pd.DataFrame(function_relations_neg).groupby(['predicate', 'object']).apply(
            lambda x: x.to_dict(orient='records'))

        examples = self.building_balanced_counting_examples(counted_instances, false_statements, split)

        # creating the statement
        # creating implicit_rule (implicit rules in counting are "entity is a company")
        # examples = TeachAIKB().connect(connect_to=statements_with_instances,
        #                                constraints={'validity': 'always true', 'predicate': ['hypernym']},
        #                                src_tags=['statement'], connection_point=[{'object': 'subject'}],
        #                                tar_tag='implicit_rule')
        # creating the hypernym count property (every company has 2 founders)
        # for e in examples:
        #     e['hypernym_count_property'] = {'subject': e['implicit_rule']['object'], 'predicate': 'has 1',
        #                                     'object': e['statement']['predicate'], 'validity': 'always true'}

        if args.variant in ['training_mix_lv_oversample_neg', 'training_mix_lv_oversample', 'training_mix_oversample']:
            examples = pd.Series(examples).sample(frac=1.5, random_state=17, replace=True).to_list()

        # Sampling distractors. Each entry is
        # (src_tag, src_fields, num_to_sample, exactly_sample_num, fields_to_take, balance_with_statement)
        if args.variant in ['training_mix_lv', 'training_mix_lv_oversample', 'training_mix_lv_oversample_neg',
                            'training_mix_oversample']:
            self.sample_distractors(examples, tar_tag='distractors', sample=[
                ('statement', ['predicate'], 1, True, ['counted_instances', 'total_count_rule'], None),
                ('statement', [], 2, False, ['counted_instances', 'total_count_rule'], None)])
        else:
            self.sample_distractors(examples, tar_tag='distractors', sample=[
                ('statement', ['predicate'], 2, True, ['counted_instances', 'total_count_rule'], False),
                ('statement', [], 2, False, ['counted_instances', 'total_count_rule'], False)])
            # ('implicit_rule', ['predicate'], 2, False, ['implicit_rule'], False)])

        # Hypothetical statements vs real fact statements
        hypothetical_portion = {'training_mix': 0, 'training_mix_with_hypothetical': 0.3, 'hypothetical_only': 1,
                                'hypothetical_true_label': 1}
        if args.variant not in hypothetical_portion:
            hypothetical_portion[args.variant] = 0
        # hypothetical_inds = random.sample(range(len(examples)), int(hypothetical_portion[args.variant] * len(examples)))
        hypothetical_inds = pd.Series(range(len(examples))).sample(frac=hypothetical_portion[args.variant],
                                                                   random_state=17).to_list()
        for i, e in enumerate(examples):
            if i in hypothetical_inds:
                e['is_hypothetical_statement'] = True
                # In the hypothetical case, when the instance count is strictly less than the total number of
                # instances, the hypothetical statement becomes true.
                if len(e['counted_instances']) < e['total_num_of_instances'] and not args.variant == 'hypothetical_true_label':
                    e['statement']['validity'] = 'always true'
            else:
                e['is_hypothetical_statement'] = False

        # mixing 10% of the statements with no context (5% negative, 5% positive), in the training mixes only.
        if args.variant in ['training_mix', 'training_mix_lv', 'training_mix_lv_oversample', 'training_mix_lv_oversample_neg',
                            'training_mix_with_hypothetical', 'training_mix_oversample']:
            examples += [{'statement': e['statement']}
                         for e in random.sample([e for e in examples if e['statement']['validity'] == 'never true'],
                                                int(0.05 * float(len(examples))))]
            examples += [{'statement': e['statement']}
                         for e in random.sample([e for e in examples if e['statement']['validity'] == 'always true'],
                                                int(0.05 * float(len(examples))))]

        # for each variation, the proportion in which each rule type will be filtered.
        ablations = {
            'training_mix': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
            'training_mix_no_total': [(['distractors'], 0.2), (['total_count_rule'], 1)],
            'no_total': [(['total_count_rule'], 1)],
            'training_mix_lv': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
            'training_mix_oversample': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
            'training_mix_lv_oversample': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
            'training_mix_lv_oversample_neg': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
            'training_mix_with_hypothetical': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
            'statement_only': [(['counted_instances', 'total_count_rule'], 1), (['distractors'], 1)],
            'statement_only_with_distractors': [(['counted_instances', 'total_count_rule'], 1)],
        }
        if args.variant not in ablations:
            ablations[args.variant] = []

        if args.variant in ['no_total', 'training_mix_no_total']:
            for e in examples:
                del e['distractors']['total_count_rule']

        # Actively splitting between test and dev (50/50)
        if split == 'dev':
            # making sure that for the same number of examples the split will always be the same.
            random.seed(71)
            all_inds = [i for i in range(len(examples))]
            dev_inds = random.sample(all_inds, int(len(all_inds) / 2))
            test_inds = list(set(all_inds) - set(dev_inds))
            splits = [('dev', [examples[i] for i in dev_inds]),
                      ('test', [examples[i] for i in test_inds])]
        else:
            splits = [('train', examples)]

        for final_split, final_examples in splits:
            if args.experiment_version in ['count_reached', 'count_reached_mix']:
                final_examples = [e for e in final_examples if len(e['counted_instances']) == e['total_num_of_instances']]
            elif args.experiment_version in ['count_not_reached']:
                final_examples = [e for e in final_examples if len(e['counted_instances']) < e['total_num_of_instances']]
            elif args.experiment_version == 'between_one_and_full_counted_instances':
                final_examples = [e for e in final_examples
                                  if len(e['counted_instances']) > 0 and
                                  len(e['counted_instances']) < e['total_num_of_instances']]

            self.build_statement_rule_property_examples(final_examples, split=final_split,
                                                        rule_tags=['counted_instances', 'total_count_rule'],
                                                        ablate_same_distractor_fields=False,
                                                        nlg_sampling=True if args.variant in ['training_mix_lv',
                                                                                              'training_mix_lv_oversample',
                                                                                              'training_mix_lv_oversample_neg'] else False,
                                                        ablation_list=ablations[args.variant], use_shorthand=True,
                                                        reverse_validity_frac=0.3 if args.variant == 'training_mix_lv_oversample_neg' else 0)

    self.print_examples(30)
    self.print_stats()
    self.examples_meta = pd.DataFrame(self.examples_meta)
    self.save_dataset()
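
# Shape of one counting example as consumed above (illustrative): 'counted_instances' holds the KB triplets
# sharing the statement's (predicate, object), 'total_num_of_instances' is how many such triplets exist in
# total, and 'total_count_rule' is presumably the rule stating that total, so roughly:
#
#   {'statement': {...}, 'counted_instances': [{...}, {...}], 'total_num_of_instances': 2,
#    'total_count_rule': {...}, 'is_hypothetical_statement': False, 'distractors': {...}}
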