Example #1
    def test_gen_data(self):

        TeachAIKB().construct_kb()
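
The snippets in this listing share a few conventions: TeachAIKB() exposes the knowledge base as triplet-like edge dicts (subject, predicate, object, validity, source), sample() filters edges with a constraint dictionary, and tar_tag wraps each result under a tag name. As a rough mental model (a minimal sketch only; the real sample() also supports options such as select and sample_limit seen below), the constraint semantics are approximately:

    def sample(kb, constraints, tar_tag=None):
        """Sketch of the assumed TeachAIKB.sample() semantics: plain keys
        filter on membership, keys ending in '_not_in' exclude values."""
        results = []
        for edge in kb:
            keep = True
            for key, values in constraints.items():
                if isinstance(values, str):
                    values = [values]
                if key.endswith('_not_in'):
                    keep = keep and edge[key[:-len('_not_in')]] not in values
                else:
                    keep = keep and edge[key] in values
            if keep:
                # tar_tag wraps the matched edge, as the callers below expect
                results.append({tar_tag: dict(edge)} if tar_tag else dict(edge))
        return results

    kb = [{'subject': 'whale', 'predicate': 'hypernym', 'object': 'mammal',
           'source': 'conceptnet', 'validity': 'always true'}]
    print(sample(kb, {'predicate': ['hypernym'], 'source_not_in': ['wikidata']},
                 tar_tag='implicit_rule'))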
Example #2
    def build_artificial_dataset(self, args):
        # examples_meta collects all examples with additional metadata for the task; it is
        # converted to a pandas DataFrame and automatically saved as a "..._meta.jsonl" file alongside the artiset files
        self.examples_meta = []
        random.seed(17)

        logger.info("building examples")

        # Find good hypernym candidates (fixed list for now)
        hypernyms = {}
        dev_objects = [
            'tree', 'flower', 'fruit', 'music', 'bird', 'alcohol', 'plant'
        ]
        # Sampling from hypernyms in our dataset that are true
        hypernyms['dev'] = TeachAIKB().sample(
            {
                'predicate': ['hypernym'],
                'source_not_in': ['wikidata'],
                'object': dev_objects,
                'validity': ['always true']
            },
            tar_tag='implicit_rule')
        hypernyms['train'] = TeachAIKB().sample(
            {
                'predicate': ['hypernym'],
                'source_not_in': ['wikidata'],
                'object_not_in': dev_objects,
                'validity': ['always true']
            },
            tar_tag='implicit_rule')

        for split, bridge_rules in hypernyms.items():
            if args.split is not None and split != args.split:
                continue

            logger.info(f'---------------  {split} ------------------')
            # To keep only true rules, "never true" facts are phrased in negative language: "never true" + negation = "always true"
            # Adding a property for each hypernym object ("animal") --> "animals are capable of living"
            hypernym_property_positive = TeachAIKB().connect(
                connect_to=bridge_rules, max_to_connect=5,
                constraints={'validity': 'always true', 'predicate_not_in': ['hypernym']},
                src_tags=['implicit_rule'], connection_point=[{'object': 'subject'}],
                tar_tag='property')

            # Now that we have positive examples, we create negative examples that mimic the
            # distribution of the positive ones.
            hypernym_property_negative = self.self_negative_subject_sample(
                hypernym_property_positive, sample_on='property', avoid_mixing=['hyponyms'])

            # Creating a statement by applying downward monotonicity to the hypernym and its property.
            examples = TeachAIKB().connect_downward_monotone(
                connect_to=hypernym_property_positive +
                hypernym_property_negative,
                scope='implicit_rule',
                property='property',
                tar_tag='statement')

            # Sampling distractors. Each entry is a tuple:
            # (src_tag, src_fields, num_to_sample, exactly_sample_num, fields_to_take, balance_with_statement)
            self.sample_distractors(
                examples,
                tar_tag='distractors',
                sample=[
                    ('property', ['predicate', 'object'], 2, True,
                     ['implicit_rule', 'property'], args.variant != 'statement_only'),
                    ('statement', ['predicate'], 2, False, ['statement'], False),
                    ('statement', ['subject'], 2, False, ['statement'], False)
                ])
            if True:  # Change condition to config flag if need be
                # Change implicit distractor to be the negative statement for main subject
                # E.g., instead of "salmon is fish" distractor for "whale is mammal", we make it "whale is not fish"
                for e in examples:
                    dist = copy.deepcopy(e['distractors']['implicit_rule'][0])
                    dist['subject'] = e['implicit_rule']['subject']
                    dist['validity'] = 'never true'
                    e['distractors']['implicit_rule'] = [dist]

            # Mixing in implicit rules as bare statements (5% positive + 5% negative of the example count), in the training mix only.
            if args.variant == 'training_mix' and split == 'train':
                examples += [{
                    'statement': e['implicit_rule']
                } for e in random.sample(bridge_rules,
                                         int(0.05 * float(len(examples))))]
                negative_bridge = self.self_negative_subject_sample(
                    bridge_rules,
                    sample_on='implicit_rule',
                    avoid_mixing=['hyponyms'])
                examples += [{
                    'statement': e['implicit_rule']
                } for e in random.sample(negative_bridge,
                                         int(0.05 * float(len(examples))))]

            # For each variant, the proportion of examples from which each rule type will be removed (ablated).
            ablations = {
                'training_mix': [(['implicit_rule'], 0.5), (['property'], 0),
                                 (['distractors'], 0.2)],
                'statement_only': [(['implicit_rule'], 1), (['property'], 1)],
                'explicit_only': [(['implicit_rule'], 0), (['property'], 0)],
                'corrected_only': [(['implicit_rule'], 0), (['property'], 0)],
                'corrected_inv_only': [(['implicit_rule'], 0),
                                       (['property'], 0)],
                'implicit_only': [(['implicit_rule'], 1), (['property'], 0)],
                'statement_only_no_context': [(['implicit_rule'], 1),
                                              (['property'], 1),
                                              (['distractors'], 1)],
                'statement_subject_lang_selectivity': [(['implicit_rule'], 1),
                                                       (['property'], 0)],
                'implicit_knowledge_test': [(['statement'], 1),
                                            (['implicit_rule'], 0),
                                            (['property'], 1),
                                            (['distractors'], 1)],
                'implicit_knowledge_distractor_test': [(['statement'], 1),
                                                       (['implicit_rule'], 1),
                                                       (['property'], 1),
                                                       (['distractors'], 1)],
            }

            if args.variant == "corrected_only" or args.variant == "corrected_inv_only":
                # In these variants we update the examples to delete implicit_rule edges according to
                # absence ("corrected_only") or presence ("corrected_inv_only") in self._incorrect_beliefs.
                # So "corrected_only" means the context includes implicit rules that a model got wrong.
                def edge_tuple(e):
                    return (e['subject'], e['predicate'], e['object'],
                            e['validity'])

                incorrect_beliefs_set = {
                    edge_tuple(e)
                    for e in self._incorrect_beliefs
                }

                def delete_edge(e):
                    edge = edge_tuple(e)
                    if args.variant == "corrected_only":
                        return edge not in incorrect_beliefs_set
                    else:
                        return edge in incorrect_beliefs_set

                for e in examples:
                    if delete_edge(e['implicit_rule']):
                        del e['implicit_rule']
                    dist = e['distractors']
                    if delete_edge(dist['implicit_rule'][0]):
                        del dist['implicit_rule']

            # Actively splitting between test and dev (50/50)
            if split == 'dev':
                # Making sure that, for a given number of examples, the split is always the same.
                random.seed(17)
                all_inds = [i for i in range(len(examples))]
                dev_inds = random.sample(all_inds, int(len(all_inds) / 2))
                test_inds = list(set(all_inds) - set(dev_inds))
                splits = [('dev', [examples[i] for i in dev_inds]),
                          ('test', [examples[i] for i in test_inds])]
            else:
                splits = [('train', examples)]

            for final_split, final_examples in splits:
                if args.variant == 'implicit_knowledge_test':
                    self.build_statement_rule_property_examples(
                        final_examples,
                        split=final_split,
                        statement_tag='implicit_rule',
                        rule_tags=[],
                        distractor_tags=[])
                elif args.variant == 'implicit_knowledge_distractor_test':
                    for e in final_examples:
                        e['distractor_implicit_rule'] = copy.deepcopy(
                            e['implicit_rule'])
                        e['distractor_implicit_rule']['object'] = e[
                            'distractors']['implicit_rule'][0]['object']
                        e['distractor_implicit_rule'][
                            'validity'] = 'never true'
                    self.build_statement_rule_property_examples(
                        final_examples,
                        split=final_split,
                        statement_tag='distractor_implicit_rule',
                        rule_tags=[],
                        distractor_tags=[])
                else:
                    self.build_statement_rule_property_examples(
                        final_examples,
                        split=final_split,
                        ablation_list=ablations[args.variant])

        self.print_examples(20)
        self.print_stats()
        self.examples_meta = pd.DataFrame(self.examples_meta)
        self.save_dataset()
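
The ablation specs above, e.g. [(['implicit_rule'], 0.5), ...], are consumed by build_statement_rule_property_examples (Example #5). A minimal sketch of that consumption, ignoring the optional three-element (fields, fraction, condition) form and the distractor-field propagation:

    import random

    def apply_ablations(examples, ablation_list, seed=17):
        """For each (fields, fraction) pair, delete the named fields from a
        seeded random fraction of the examples (sketch of Example #5's loop)."""
        random.seed(seed)
        for fields, fraction in ablation_list:
            for e in random.sample(examples, int(fraction * len(examples))):
                for field in fields:
                    e.pop(field, None)
        return examples

    examples = [{'statement': i, 'implicit_rule': i, 'property': i} for i in range(10)]
    apply_ablations(examples, [(['implicit_rule'], 0.5), (['property'], 0)])
    print(sum('implicit_rule' in e for e in examples))  # -> 5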
Example #3
    def connect_negative_shuffle_subject(self,
                                         shuffle,
                                         shuffle_on,
                                         tar_tag,
                                         avoid_mixing=None):
        logger.info(f'connect_negative_shuffle_subject {tar_tag}')
        # We assume shuffle_on is only one field (usually predicate or object).
        # Finding "clusters" that may not be shuffled internally when producing negative examples
        # (because they have downward monotone relations).
        connect_to = deepcopy(shuffle)
        triplets_to_shuffle_df = pd.DataFrame(
            ([e[shuffle_on] for e in shuffle]))
        field_to_shuffle_counts = triplets_to_shuffle_df[
            'subject'].value_counts()
        subjects_to_shuffle = set(triplets_to_shuffle_df['subject'])
        remaining_inds_to_choose = set(triplets_to_shuffle_df.index)

        for curr_subject, size in field_to_shuffle_counts.items():
            potential_target_inds = deepcopy(remaining_inds_to_choose)
            tar_subjects = subjects_to_shuffle - {curr_subject}
            tar_subjects -= {
                e['subject']
                for e in TeachAIKB().sample({
                    'predicate': 'hypernym',
                    'object': curr_subject
                })
            }

            if avoid_mixing is not None and 'co-hyponyms' in avoid_mixing:
                subject_is_hyponym_of = {
                    e['object']
                    for e in TeachAIKB().sample({
                        'subject': curr_subject,
                        'predicate': 'hypernym'
                    })
                }
                tar_subjects -= {
                    e['subject']
                    for e in TeachAIKB().sample(
                        {
                            'predicate': 'hypernym',
                            'object': list(subject_is_hyponym_of)
                        })
                }

            if avoid_mixing is not None and 'co-meronyms' in avoid_mixing:
                subject_is_meronym_of = {
                    e['subject']
                    for e in self.sample({
                        'predicate': 'meronym',
                        'object': curr_subject
                    })
                }
                tar_subjects -= {
                    e['object']
                    for e in self.sample({
                        'predicate': 'meronym',
                        'subject': list(subject_is_meronym_of)
                    })
                }

            potential_target_inds &= set(triplets_to_shuffle_df[
                triplets_to_shuffle_df['subject'].isin(tar_subjects)].index)
            targets = [
                e for e in connect_to
                if e[shuffle_on]['subject'] == curr_subject
            ]
            selected_inds = []
            for i in random.sample(sorted(potential_target_inds),
                                   len(potential_target_inds)):
                new_edge = {
                    'subject': curr_subject,
                    'predicate': triplets_to_shuffle_df.loc[i, 'predicate'],
                    'object': triplets_to_shuffle_df.loc[i, 'object']
                }
                # Checking that no true triplet with the same values already exists in the KB:
                matching_edges_in_kb = self.lookup(new_edge)
                if len(matching_edges_in_kb) == 0:
                    targets[len(selected_inds)][tar_tag] = new_edge
                    targets[len(selected_inds)][tar_tag].update(
                        {'validity': 'never true'})
                    selected_inds.append(i)
                    if len(selected_inds) >= len(targets):
                        break
            if len(selected_inds) < len(targets):
                logger.debug(
                    f'did not find enough for {curr_subject}: {len(selected_inds)} found, {len(targets)} required'
                )
            else:
                logger.debug(f'{curr_subject}: {len(selected_inds)} found.')

            remaining_inds_to_choose -= set(selected_inds)

        return connect_to
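
For intuition, the core move of connect_negative_shuffle_subject is to give each subject a (predicate, object) borrowed from a different subject and label the result "never true". A toy sketch of just that move (the real method additionally excludes hyponym/co-hyponym/co-meronym subjects and verifies the shuffled edge is absent from the KB):

    import random

    def toy_negative_by_subject_shuffle(triplets, seed=17):
        """Pair each subject with a donor (predicate, object) from a different
        subject; mark the resulting edge 'never true'."""
        random.seed(seed)
        negatives = []
        for t in triplets:
            donors = [d for d in triplets if d['subject'] != t['subject']]
            donor = random.choice(donors)
            negatives.append({'subject': t['subject'],
                              'predicate': donor['predicate'],
                              'object': donor['object'],
                              'validity': 'never true'})
        return negatives

    triplets = [{'subject': 'whale', 'predicate': 'capable of', 'object': 'swimming'},
                {'subject': 'sparrow', 'predicate': 'capable of', 'object': 'flying'}]
    print(toy_negative_by_subject_shuffle(triplets))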
Example #4
    def create_subject_filter_lookup(self,
                                     examples,
                                     sample_on=None,
                                     avoid_mixing=None):
        if sample_on is not None:
            triplets_to_sample_on = [e[sample_on] for e in examples]
        else:
            triplets_to_sample_on = examples

        # building subject filter lookup:
        subject_filter_lookup = {}
        rules_to_sample_df = pd.DataFrame(triplets_to_sample_on)
        for curr_subject, matching_records in tqdm(
                rules_to_sample_df.groupby('subject')):
            subject_to_filter = {curr_subject}

            if avoid_mixing is not None and 'predicates' in avoid_mixing:
                subject_to_filter |= set(
                    rules_to_sample_df[~rules_to_sample_df['predicate'].isin(
                        set(matching_records['predicate']))]['subject'])

            if avoid_mixing is not None and 'hyponyms' in avoid_mixing:
                subject_to_filter |= {
                    e['subject']
                    for e in TeachAIKB().sample({
                        'predicate': 'hypernym',
                        'object': curr_subject
                    })
                }

            if avoid_mixing is not None and 'co-hyponyms' in avoid_mixing:
                subject_is_hyponym_of = {
                    e['object']
                    for e in TeachAIKB().sample({
                        'subject': curr_subject,
                        'predicate': 'hypernym'
                    })
                }
                subject_to_filter |= {
                    e['subject']
                    for e in TeachAIKB().sample(
                        {
                            'predicate': 'hypernym',
                            'object': list(subject_is_hyponym_of)
                        })
                }

            if avoid_mixing is not None and 'co-meronyms' in avoid_mixing:
                subject_is_meronym_of = {
                    e['subject']
                    for e in TeachAIKB().sample({
                        'predicate': 'meronym',
                        'object': curr_subject
                    })
                }
                subject_to_filter |= {
                    e['object']
                    for e in TeachAIKB().sample(
                        {
                            'predicate': 'meronym',
                            'subject': list(subject_is_meronym_of)
                        })
                }
            subject_filter_lookup[curr_subject] = subject_to_filter

        return subject_filter_lookup
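
The returned lookup maps each subject to the set of subjects it must not borrow facts from (itself, its hyponyms, and optionally co-hyponyms, co-meronyms, and predicate-disjoint subjects). A hypothetical consumer, just to show the intended direction of the check:

    def valid_negative_donor(src_subject, donor_subject, subject_filter_lookup):
        """A donor may lend its (predicate, object) to src_subject only if it
        lies outside src_subject's filter set (hypothetical helper)."""
        return donor_subject not in subject_filter_lookup[src_subject]

    lookup = {'mammal': {'mammal', 'whale', 'dog'}}  # illustrative output
    print(valid_negative_donor('mammal', 'sparrow', lookup))  # True
    print(valid_negative_donor('mammal', 'whale', lookup))    # False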
Example #5
    def build_statement_rule_property_examples(self, examples, split, statement_tag='statement',
                                               ablate_same_distractor_fields=1.0,
                                               rule_tags=['implicit_rule', 'property'],
                                               distractor_tags=['distractors'],
                                               ablation_list=[], use_shorthand=False,
                                               nlg_sampling=False, reverse_validity_frac=0):

        # Computing each example's ID from the statement and rule tags, before any ablations:
        for i, example in enumerate(examples):
            m = hashlib.md5()
            # note that the tags for ID creation are always the same!
            for tag in [statement_tag] + rule_tags:
                if tag in example:
                    if type(example[tag]) == list:
                        for e in example[tag]:
                            m.update(e['subject'].encode())
                            m.update(e['predicate'].encode())
                            m.update(e['object'].encode())
                            m.update(e['validity'].encode())
                    else:
                        m.update(example[tag]['subject'].encode())
                        m.update(example[tag]['predicate'].encode())
                        m.update(example[tag]['object'].encode())
                        m.update(example[tag]['validity'].encode())
            example['id'] = m.hexdigest()

        # Ablations
        # now that all the examples are ready, we can ablate as needed:
        random.seed(17)
        for ablation in ablation_list:
            if len(ablation) == 3:
                fields, fraction, condition = ablation
                examples_cands = [
                    e for e in examples if e[condition[0]] in condition[1]
                ]
            else:
                fields, fraction = ablation
                examples_cands = examples
            example_to_ablate = random.sample(
                examples_cands, int(fraction * float(len(examples))))
            for e in example_to_ablate:
                for field in fields:
                    if field in e:
                        del e[field]
                    # for every field we ablate we must ablate the same field from distractors!
                    if random.random() < ablate_same_distractor_fields:
                        for distractor_tag in distractor_tags:
                            if distractor_tag in e:
                                if field in e[distractor_tag]:
                                    del e[distractor_tag][field]

        random.seed(17)
        for i, example in enumerate(examples):
            context_rules = []
            # adding actual rules
            for rule_tag in rule_tags:
                if rule_tag in example:
                    rules = example[rule_tag]
                    if not type(rules) == list:
                        rules = [rules]
                    for rule in rules:
                        reverse_validity = not rule['validity'] == 'always true'
                        context_rules.append(TeachAIKB().to_pseudo_language(
                            rule,
                            is_rule=True,
                            reverse_validity=reverse_validity,
                            use_shorthand=use_shorthand,
                            nlg_sampling=nlg_sampling))
            # adding distractors
            for rule_tag in distractor_tags:
                if rule_tag in example:
                    for field, tag_distractors in example[rule_tag].items():
                        for rule in tag_distractors:
                            rule_list = rule
                            if not type(rule_list) == list:
                                rule_list = [rule_list]
                            for r in rule_list:
                                reverse_validity = not r[
                                    'validity'] == 'always true'
                                context_rules.append(
                                    TeachAIKB().to_pseudo_language(
                                        r,
                                        is_rule=True,
                                        reverse_validity=reverse_validity,
                                        use_shorthand=use_shorthand,
                                        nlg_sampling=nlg_sampling))

            use_hypothetical_statement = False
            if 'is_hypothetical_statement' in example and example[
                    'is_hypothetical_statement']:
                use_hypothetical_statement = True

            answer = 1 if example[statement_tag][
                'validity'] == 'always true' else 0

            if self.variant != 'statement_subject_lang_selectivity':

                if random.random() < reverse_validity_frac:
                    answer = 1 - answer
                    reverse_validity = True
                else:
                    reverse_validity = False
                phrase = TeachAIKB().to_pseudo_language(
                    example[statement_tag],
                    is_rule=False,
                    use_shorthand=use_shorthand,
                    use_hypothetical_statement=use_hypothetical_statement,
                    nlg_sampling=nlg_sampling,
                    reverse_validity=reverse_validity)
            else:
                statement_dict = deepcopy(example[statement_tag])
                statement_dict['subject'] = random.sample([
                    'foo', 'blah', 'ya', 'qux', 'aranglopa', 'foltopia',
                    'cakophon', 'baz', 'garply'
                ], 1)[0]
                phrase = TeachAIKB().to_pseudo_language(
                    statement_dict,
                    is_rule=False,
                    use_shorthand=use_shorthand,
                    use_hypothetical_statement=use_hypothetical_statement,
                    nlg_sampling=nlg_sampling)

            # creating a unique set of rules that does not include the statement.
            context_rules = list(set(context_rules))
            # Set iteration order is not reproducible across runs, so sort first and then
            # shuffle with the seeded RNG to get a replicable order.
            context_rules = sorted(context_rules)
            random.shuffle(context_rules)

            example.update({'phrase': phrase, \
                            'answer': answer,
                            'context': ' '.join(context_rules),
                            'split': split,
                            'rules': context_rules})

            # append_teachyourai_format_example() is a method implemented in the ArtiSet class; it takes
            # an example dict (which must contain "phrase" and "answer") and converts it to BooleanQA format
            self.append_teachyourai_format_example(example, do_print=False)
            self.examples_meta.append(deepcopy(example))
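
The sort-before-shuffle step near the end is worth isolating: set() deduplicates, but set iteration order varies across interpreter runs (hash randomization), so shuffling a set-derived list directly would not be reproducible even with a fixed seed. A standalone illustration:

    import random

    rules = ['b', 'a', 'c', 'a']
    unique_rules = sorted(set(rules))  # deduplicate, then impose a stable order
    random.seed(17)
    random.shuffle(unique_rules)       # seeded shuffle of a stable order is replicable
    print(unique_rules)                # identical output on every run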
Example #6
    def build_artificial_dataset(self, args):
        # examples_meta collects all examples with additional metadata for the task; it is
        # converted to a pandas DataFrame and automatically saved as a "..._meta.jsonl" file alongside the artiset files
        self.examples_meta = []
        random.seed(17)

        logger.info("building examples")

        # Find good meronym candidates (fixed list for now)
        meronyms = {}
        #devset_subjects = [e['subject'] for e in TeachAIKB().sample({'predicate': ['hypernym'], \
        #                    'object': ['food', 'vehicle', 'road', 'clothing', \
        #                               'instrument','commodity','activity'], 'validity': ['always true']})]

        # Sampling from meronyms in our dataset that are true
        meronyms['dev'] = TeachAIKB().sample({'predicate':['meronym'],
                                              'object_not_in': ['head','storey','sauce','face','room','blossom',\
                                                                'bedroom','sandwich','skull','doorway','hull'],
                                              'validity':['always true']}, sample_limit=('object', 70), tar_tag='implicit_rule')
        #meronyms['train'] = TeachAIKB().sample({'predicate': ['meronym', 'part of'],
        #                                        #'subject_not_in': devset_subjects,
        #                                        'validity': ['always true']}, sample_limit=('object', 70), tar_tag='implicit_rule')

        for split, bridge_rules in meronyms.items():
            if args.split is not None and split != args.split:
                continue

            logger.info(f'---------------  {split} ------------------')
            # To keep only true rules, "never true" facts are phrased in negative language: "never true" + negation = "always true"
            # Adding a property for each meronym object ("animal") --> "animals are capable of living"
            meronym_property_positive = TeachAIKB().connect(
                connect_to=bridge_rules, max_to_connect=5,
                constraints={'validity': 'always true', 'predicate': ['meronym', 'part of']},
                src_tags=['implicit_rule'], connection_point=[{'object': 'subject'}],
                tar_tag='property')

            # Now that we have positive examples, we create negative examples that mimic the
            # distribution of the positive ones.
            #meronym_property_negative = TeachAIKB().connect_negative_shuffle_subject(shuffle=meronym_property_positive, \
            #                        shuffle_on='property', tar_tag='property', avoid_mixing=['co-meronyms'])
            meronym_property_negative = self.self_negative_subject_sample(
                meronym_property_positive, sample_on='property',
                avoid_mixing=['co-meronyms', 'hyponyms'])

            # Creating a statement by applying downward monotonicity to the meronym and its property.
            examples = TeachAIKB().connect_downward_monotone(
                connect_to=meronym_property_positive +
                meronym_property_negative,
                scope='implicit_rule',
                property='property',
                tar_tag='statement')

            # Sampling distractors. "('statement', ['predicate', 'object'], 2)" means: for each example, sample at most
            # two statements with the same ['predicate', 'object'] as the example statement and add them to the distractors.
            self.sample_distractors(
                examples,
                tar_tag='distractors',
                sample=[
                    ('property', ['predicate', 'object'], 2, True,
                     ['implicit_rule', 'property'], args.variant != 'statement_only'),
                    ('statement', ['subject'], 2, False, ['statement'], False),
                    ('statement', ['predicate'], 2, False, ['statement'], False)
                ])
            if True:  # Change condition to config flag if need be
                # Change implicit distractor to be the negative statement for main subject
                # E.g., instead of "salmon has eye" distractor for "house has door", we make it "house does not have eye"
                for e in examples:
                    dist = copy.deepcopy(e['distractors']['implicit_rule'][0])
                    dist['subject'] = e['implicit_rule']['subject']
                    dist['validity'] = 'never true'
                    e['distractors']['implicit_rule'] = [dist]

            # Mixing in implicit rules as bare statements (5% positive + 5% negative of the example count), in the training mix only.
            if args.variant == 'training_mix':
                examples += [{
                    'statement': e['implicit_rule']
                } for e in random.sample(bridge_rules,
                                         int(0.05 * float(len(examples))))]
                negative_bridge = self.self_negative_subject_sample(
                    bridge_rules,
                    sample_on='implicit_rule',
                    avoid_mixing=['co-meronyms'])
                examples += [{
                    'statement': e['implicit_rule']
                } for e in random.sample(negative_bridge,
                                         int(0.05 * float(len(examples))))]

            # For each variant, the proportion of examples from which each rule type will be removed (ablated).
            ablations = {
                'training_mix': [(['implicit_rule'], 0.5), (['property'], 0),
                                 (['distractors'], 0.2)],
                'statement_only': [(['implicit_rule'], 1), (['property'], 1)],
                'explicit_only': [(['implicit_rule'], 0), (['property'], 0)],
                'corrected_only': [(['implicit_rule'], 0), (['property'], 0)],
                'corrected_inv_only': [(['implicit_rule'], 0),
                                       (['property'], 0)],
                'implicit_only': [(['implicit_rule'], 1), (['property'], 0)],
                'statement_only_no_context': [(['implicit_rule'], 1),
                                              (['property'], 1),
                                              (['distractors'], 1)],
                'statement_subject_lang_selectivity': [(['implicit_rule'], 1),
                                                       (['property'], 0)],
                'implicit_knowledge_test': [(['statement'], 1),
                                            (['implicit_rule'], 0),
                                            (['property'], 1),
                                            (['distractors'], 1)],
                'implicit_knowledge_distractor_test': [(['statement'], 1),
                                                       (['implicit_rule'], 1),
                                                       (['property'], 1),
                                                       (['distractors'], 1)],
            }

            if args.variant == "corrected_only" or args.variant == "corrected_inv_only":
                # In these variants we update the examples to delete implicit_rule edges according to
                # absence ("corrected_only") or presence ("corrected_inv_only") in self._incorrect_beliefs.
                # So "corrected_only" means the context includes implicit rules that a model got wrong.
                def edge_tuple(e):
                    return (e['subject'], e['predicate'], e['object'],
                            e['validity'])

                incorrect_beliefs_set = {
                    edge_tuple(e)
                    for e in self._incorrect_beliefs
                }

                def delete_edge(e):
                    edge = edge_tuple(e)
                    if args.variant == "corrected_only":
                        return edge not in incorrect_beliefs_set
                    else:
                        return edge in incorrect_beliefs_set

                for e in examples:
                    if delete_edge(e['implicit_rule']):
                        del e['implicit_rule']
                    dist = e['distractors']
                    if delete_edge(dist['implicit_rule'][0]):
                        del dist['implicit_rule']
            if args.variant == 'implicit_knowledge_test':
                self.build_statement_rule_property_examples(
                    examples,
                    split=split,
                    statement_tag='implicit_rule',
                    rule_tags=[],
                    distractor_tags=[])
            elif args.variant == 'implicit_knowledge_distractor_test':
                for e in examples:
                    e['distractor_implicit_rule'] = copy.deepcopy(
                        e['implicit_rule'])
                    e['distractor_implicit_rule']['object'] = e['distractors'][
                        'implicit_rule'][0]['object']
                    e['distractor_implicit_rule']['validity'] = 'never true'
                self.build_statement_rule_property_examples(
                    examples,
                    split=split,
                    statement_tag='distractor_implicit_rule',
                    rule_tags=[],
                    distractor_tags=[])
            else:
                self.build_statement_rule_property_examples(
                    examples,
                    split=split,
                    ablation_list=ablations[args.variant])

        self.print_examples(20)
        self.print_stats()
        self.examples_meta = pd.DataFrame(self.examples_meta)
        self.save_dataset()
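
A detail shared by Examples #2 and #6: example IDs are md5 digests over the statement and rule edges, computed before ablations (see Example #5), so the same underlying example keeps the same ID across variants. The scheme in isolation:

    import hashlib

    def example_id(edges):
        """Hash edges field by field; stable across runs and ablations."""
        m = hashlib.md5()
        for e in edges:
            for field in ('subject', 'predicate', 'object', 'validity'):
                m.update(e[field].encode())
        return m.hexdigest()

    print(example_id([{'subject': 'whale', 'predicate': 'hypernym',
                       'object': 'mammal', 'validity': 'always true'}]))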
Example #7
    def build_artificial_dataset(self, args):
        # examples_meta collects all examples with additional metadata for the task; it is
        # converted to a pandas DataFrame and automatically saved as a "..._meta.jsonl" file alongside the artiset files
        self.examples_meta = []
        random.seed(17)

        logger.info("building examples")

        # Querying all functional relations from KB.
        func_rel_triplets = TeachAIKB().sample(
            {'predicate': ['super bowl loser', 'super bowl winner', 'band member', 'capital',
                           'director', 'release year', 'founder', 'headquarter', 'child', 'spouse',
                           'CEO'],
             'source': ['wikidata']},
            select=['subject', 'predicate', 'object', 'validity'])

        # split dev and train
        function_relations_split = {'dev': [], 'train': []}
        for e in func_rel_triplets:
            if sum(bytearray(e['object'].encode())) % 7 == 0 or e['object'] in ['Germany', 'United States of America', 'Israel']:
                function_relations_split['dev'].append(e)
            else:
                function_relations_split['train'].append(e)

        for split, function_relations in function_relations_split.items():
            if args.split is not None and split != args.split:
                continue

            logger.info(f'---------------  {split} ------------------')

            function_relations_neg = self.self_negative_subject_sample(
                function_relations, avoid_mixing=['predicates'], over_sample=2)

            # In counting we have "counted instances", the actual instances to be counted;
            # they are collected by aggregating on predicate and object.
            counted_instances = pd.DataFrame(function_relations).groupby(['predicate', 'object']).apply(
                lambda x: (x.to_dict(orient='records'), len(x)))
            false_statements = pd.DataFrame(function_relations_neg).groupby(['predicate', 'object']).apply(
                lambda x: x.to_dict(orient='records'))

            examples = self.building_balanced_counting_examples(counted_instances, false_statements, split)

            # creating the statement
            # creating implicit_rule (implicit rules in counting are "entity is a company")
            # examples = TeachAIKB().connect(connect_to=statements_with_instances,
            #                                                 constraints={'validity': 'always true', 'predicate': ['hypernym']}, \
            #                                                 src_tags=['statement'], connection_point=[{'object': 'subject'}],
            #                                                 tar_tag='implicit_rule')

            # creating the hypernym count property (every company has 2 founders)
            # for e in examples:
            #    e['hypernym_count_property'] = {'subject': e['implicit_rule']['object'], 'predicate': 'has 1',
            #                         'object': e['statement']['predicate'], 'validity': 'always true'}

            if args.variant in ['training_mix_lv_oversample_neg','training_mix_lv_oversample','training_mix_oversample']:
                examples = pd.Series(examples).sample(frac=1.5, random_state=17, replace=True).to_list()


            # Sampling distractors. Each entry is a tuple:
            # (src_tag, src_fields, num_to_sample, exactly_sample_num, fields_to_take, balance_with_statement)
            if args.variant in ['training_mix_lv', 'training_mix_lv_oversample', 'training_mix_lv_oversample_neg','training_mix_oversample']:
                self.sample_distractors(examples, tar_tag='distractors', sample=[
                    ('statement', ['predicate'], 1, True, ['counted_instances', 'total_count_rule'], None),
                    ('statement', [], 2, False, ['counted_instances', 'total_count_rule'], None)])
            else:
                self.sample_distractors(examples, tar_tag='distractors', sample=[
                    ('statement', ['predicate'], 2, True, ['counted_instances', 'total_count_rule'], False),
                    ('statement', [], 2, False, ['counted_instances', 'total_count_rule'], False)])
            # ('implicit_rule', ['predicate'], 2, False, ['implicit_rule'], False)])

            # Hypothetical statements vs real fact statements
            hypothetical_portion = {'training_mix': 0, 'training_mix_with_hypothetical': 0.3, 'hypothetical_only': 1, \
                                    'hypothetical_true_label': 1}
            if args.variant not in hypothetical_portion:
                hypothetical_portion[args.variant] = 0
            # hypothetical_inds = random.sample(range(len(examples)), int(hypothetical_portion[args.variant] * len(examples)))
            hypothetical_inds = pd.Series(range(len(examples))).sample(frac=hypothetical_portion[args.variant], random_state=17).to_list()
            for i, e in enumerate(examples):
                if i in hypothetical_inds:
                    e['is_hypothetical_statement'] = True
                    # In the hypothetical case, when the instance count is strictly less than the total number of instances,
                    # the hypothetical_statement becomes true.
                    if len(e['counted_instances']) < e['total_num_of_instances'] and not args.variant == 'hypothetical_true_label':
                        e['statement']['validity'] = 'always true'
                else:
                    e['is_hypothetical_statement'] = False

            # Mixing in statements with no context (5% negative + 5% positive of the example count), in the training mix only.
            if args.variant in ['training_mix', 'training_mix_lv', 'training_mix_lv_oversample','training_mix_lv_oversample_neg',\
                                'training_mix_with_hypothetical','training_mix_oversample']:
                examples += [{'statement': e['statement']} for e in random.sample([e for e in examples \
                                if e['statement']['validity'] == 'never true'], int(0.05 * float(len(examples))))]
                examples += [{'statement': e['statement']} for e in random.sample([e for e in examples \
                                if e['statement']['validity'] == 'always true'], int(0.05 * float(len(examples))))]

            # For each variant, the proportion of examples from which each rule type will be removed (ablated).
            ablations = {
                'training_mix': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
                'training_mix_no_total': [(['distractors'], 0.2), (['total_count_rule'], 1)],
                'no_total': [(['total_count_rule'], 1)],
                'training_mix_lv': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
                'training_mix_oversample': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
                'training_mix_lv_oversample': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
                'training_mix_lv_oversample_neg': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
                'training_mix_with_hypothetical': [(['distractors'], 0.2), (['total_count_rule'], 0.05)],
                'statement_only': [(['counted_instances', 'total_count_rule'], 1), (['distractors'], 1)],
                'statement_only_with_distractors': [(['counted_instances', 'total_count_rule'], 1)],
            }
            if args.variant not in ablations:
                ablations[args.variant] = []

            if args.variant in ['no_total','training_mix_no_total']:
                for e in examples:
                    del e['distractors']['total_count_rule']

            # Actively splitting between test and dev (50/50)
            if split == 'dev':
                # Making sure that, for a given number of examples, the split is always the same.
                random.seed(71)
                all_inds = [i for i in range(len(examples))]
                dev_inds = random.sample(all_inds, int(len(all_inds) / 2))
                test_inds = list(set(all_inds) - set(dev_inds))
                splits = [('dev', [examples[i] for i in dev_inds]),
                          ('test', [examples[i] for i in test_inds])]
            else:
                splits = [('train', examples)]

            for final_split, final_examples in splits:
                if args.experiment_version in ['count_reached','count_reached_mix']:
                    final_examples = [e for e in final_examples if len(e['counted_instances']) == e['total_num_of_instances']]
                elif args.experiment_version in ['count_not_reached']:
                    final_examples = [e for e in final_examples if len(e['counted_instances']) < e['total_num_of_instances']]
                elif args.experiment_version == 'between_one_and_full_counted_instances':
                    final_examples = [e for e in final_examples
                                      if 0 < len(e['counted_instances']) < e['total_num_of_instances']]


                self.build_statement_rule_property_examples(
                    final_examples, split=final_split,
                    rule_tags=['counted_instances', 'total_count_rule'],
                    ablate_same_distractor_fields=False,
                    nlg_sampling=args.variant in ['training_mix_lv', 'training_mix_lv_oversample',
                                                  'training_mix_lv_oversample_neg'],
                    ablation_list=ablations[args.variant], use_shorthand=True,
                    reverse_validity_frac=0.3 if args.variant == 'training_mix_lv_oversample_neg' else 0)


        self.print_examples(30)
        self.print_stats()
        self.examples_meta = pd.DataFrame(self.examples_meta)
        self.save_dataset()
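
The dev/train assignment in this example hashes on the statement object rather than sampling, which keeps all triplets sharing an object in the same split and makes the assignment deterministic with no stored state. The rule in isolation:

    def is_dev(obj: str) -> bool:
        """An object lands in dev if the byte sum of its UTF-8 encoding is
        divisible by 7 (roughly 1/7 of objects) or it is one of a few
        hand-picked frequent objects."""
        return (sum(bytearray(obj.encode())) % 7 == 0
                or obj in ['Germany', 'United States of America', 'Israel'])

    print(is_dev('Germany'), is_dev('London'))  # True False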