def _create_filtered_index(self,
                               source=dir_path + '../data/character_index.csv',
                               destination=dir_path +
                               '../data/character_index_filtered.csv'):
        with io.open(source,
                     'rb') as fin_index, io.open(destination,
                                                 'w',
                                                 encoding='utf8') as fout:
            total_lines_relations = line_counting.cached_counter.count_lines(
                self.path_relations)
            self.logger.print_info('Collecting important entities...')
            important_articles = set()
            nt_reader = NTReader(self.path_relations)
            for subject, predicate, object in tqdm(
                    nt_reader.yield_cleaned_entry_names(),
                    total=total_lines_relations):
                important_articles.add(subject)

            total_lines_index = line_counting.cached_counter.count_lines(
                source)
            self.logger.print_info('Filtering important entities...')
            index_reader = csv.reader(fin_index,
                                      delimiter=self.delimiter,
                                      encoding='utf-8',
                                      quoting=csv.QUOTE_NONE)
            for line in tqdm(index_reader, total=total_lines_index):
                subject, character_offset = line
                if subject in important_articles:
                    fout.write(subject + self.delimiter + character_offset +
                               '\n')
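
The core of this helper is a plain membership filter: collect the subjects that occur in the relations file into a set, then keep only the index rows whose subject is in that set. A minimal self-contained sketch of that filtering step, with illustrative in-memory data standing in for the relations file and the character-index CSV:

important_articles = {'Alice', 'Bob'}  # subjects collected from the relations file (illustrative names)
index_rows = [('Alice', '123'), ('Carol', '456'), ('Bob', '789')]  # (subject, character offset)
delimiter = '#'  # illustrative; mirrors the '#' delimiter used elsewhere in these examples

filtered = [delimiter.join(row) for row in index_rows if row[0] in important_articles]
print('\n'.join(filtered))  # only the rows for Alice and Bob survive the filter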
Example #2
    def __init__(self,
                 articles_limit,
                 use_dump=False,
                 randomize=False,
                 match_threshold=0.005,
                 type_matching=True,
                 allow_unknown_entity_types=True,
                 print_interim_results=True,
                 threads=4,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
                 facts_output_path=dir_path + '../results/extracted_facts.nt',
                 extended_facts_output_path=dir_path +
                 '../results/extracted_facts_extended.txt'):
        super(FactExtractor, self).__init__(patterns_input_path)
        self.articles_limit = articles_limit
        self.use_dump = use_dump
        self.allow_unknown_entity_types = allow_unknown_entity_types
        self.match_threshold = match_threshold
        self.type_matching = type_matching
        self.nt_reader = NTReader(resources_path, randomize)
        self.wikipedia_connector = WikipediaConnector(self.use_dump)
        self.pattern_extractor = PatternExtractor()
        self.pattern_matcher = PatternMatcher()
        self.print_interim_results = print_interim_results
        self.discovery_resources = set()
        self.extracted_facts = []
        self.threads = threads
        self.nt_writer = NTWriter(facts_output_path)
        self.extended_facts_output_path = extended_facts_output_path

        # self._make_pattern_types_transitive()
        self._load_discovery_resources()
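
For orientation, a hypothetical driver for this constructor might look like the sketch below; all names and parameter values are illustrative, and it assumes the surrounding project, its config file and data files are in place, so it is not runnable on its own.

# Hypothetical usage sketch; FactExtractor and its data/config are assumed to exist.
extractor = FactExtractor(articles_limit=100, use_dump=True, threads=2)
extractor.extract_facts()            # defined in the full class listing further below
extractor.save_extracted_facts()     # writes the .nt facts plus the extended text dump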
Example #3
    def __init__(self,
                 dbpedia_facts_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 facts_input_path=dir_path + '../results/extracted_facts.nt',
                 facts_output_path=dir_path + '../results/new_facts.nt'):
        self.dbpedia_nt_reader = NTReader(dbpedia_facts_path)
        self.extracted_facts_nt_reader = NTReader(facts_input_path)
        self.nt_writer = NTWriter(facts_output_path)
Example #4
    def __init__(self,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 facts_limit=100000):
        # self.instance_types = EntityTypes(types_paths=["../data/types_en.csv"], types_index=False,
        #          types_indexed_file=False)
        self.instance_types = EntityTypes()
        self.resources_path = resources_path
        self.nt_reader = NTReader(resources_path, False)
        self.logger = Logger.from_config_file()
        self.delimiter = '#'
        self.predicates = dict()
        self.facts_limit = facts_limit
Example #5
    def __init__(self,
                 facts_path=dir_path + '../data/mappingbased_objects_en.ttl',
                 output_path=dir_path + '../data/type_patterns_raw.pkl',
                 facts_limit=False):
        super(TypeLearner, self).__init__(None, output_path)
        self.facts_path = facts_path
        self.output_path = output_path
        self.facts_limit = facts_limit if facts_limit > 0 else sys.maxint
        self.nt_reader = NTReader(facts_path)
        self.instance_types = EntityTypes()
        self.subjects = dict()
        self.objects = dict()
        self.type_patterns = dict()
Example #6
    def __init__(self,
                 facts_limit,
                 randomize=False,
                 ground_truth_path=dir_path +
                 '../pattern_testing/ground_truth.ttl'):
        self.facts_limit = facts_limit
        self.randomize = randomize
        self.nt_reader = NTReader(ground_truth_path, randomize)
        self.logger = Logger.from_config_file()
        self.results = {}
        self.fact_extractor = None

        # count known, right and wrong facts for each relation_type
        self.known_facts_counter = Counter()
        self.right_facts_counter = Counter()
        self.wrong_facts_counter = Counter()
Example #7
class FactCleaner(object):
    def __init__(self,
                 dbpedia_facts_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 facts_input_path=dir_path + '../results/extracted_facts.nt',
                 facts_output_path=dir_path + '../results/new_facts.nt'):
        self.dbpedia_nt_reader = NTReader(dbpedia_facts_path)
        self.extracted_facts_nt_reader = NTReader(facts_input_path)
        self.nt_writer = NTWriter(facts_output_path)

    def clean_facts(self):
        dbpedia_facts = set()
        for subject, predicate, object in self.dbpedia_nt_reader.yield_entries(
        ):
            dbpedia_facts.add((subject, predicate, object))

        extracted_facts = set()
        for subject, predicate, object in self.extracted_facts_nt_reader.yield_entries(
        ):
            extracted_facts.add((subject, predicate, object))

        cleaned_facts = extracted_facts - dbpedia_facts
        self.nt_writer.write_nt(cleaned_facts)
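
clean_facts boils down to set subtraction on (subject, predicate, object) triples. A minimal self-contained sketch of that step, with illustrative in-memory triples standing in for the NT readers and writer:

dbpedia_facts = {
    ('dbr:Alice', 'dbo:spouse', 'dbr:Bob'),
}
extracted_facts = {
    ('dbr:Alice', 'dbo:spouse', 'dbr:Bob'),          # already known, dropped
    ('dbr:Alice', 'dbo:birthPlace', 'dbr:Berlin'),   # new, kept
}
cleaned_facts = extracted_facts - dbpedia_facts
print(cleaned_facts)  # {('dbr:Alice', 'dbo:birthPlace', 'dbr:Berlin')}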
Example #8
    def __init__(self,
                 relation_types_limit,
                 facts_limit,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 relation_types=None,
                 use_dump=False,
                 randomize=False,
                 perform_tests=False,
                 type_learning=True,
                 replace_redirects=False,
                 patterns_output_path=dir_path + '../data/patterns_raw.pkl',
                 threads=4):
        super(WikipediaPatternExtractor,
              self).__init__(None, patterns_output_path)
        self.use_dump = use_dump
        self.facts_limit = facts_limit
        self.perform_tests = perform_tests
        self.type_learning = type_learning
        self.wikipedia_connector = WikipediaConnector(
            use_dump=self.use_dump, redirect=replace_redirects)
        self.pattern_extractor = PatternExtractor()
        self.num_of_threads = threads
        self.nt_reader = NTReader(resources_path, randomize)
        self.logger = Logger.from_config_file()

        if relation_types is not None and len(relation_types) > 0:
            self.relation_types = [
                'http://dbpedia.org/ontology/' + r for r in relation_types if r
            ]
            self.relation_types_limit = len(self.relation_types)
        else:
            self.relation_types = None  # means any relation may be learned
            self.relation_types_limit = relation_types_limit

        self.dbpedia = {}
        self.matches = []
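
The relation-type handling above simply prefixes plain relation names with the DBpedia ontology namespace and skips empty entries; in isolation:

relation_types = ['spouse', 'birthPlace', '']   # '' entries are dropped by the `if r` guard
full_uris = ['http://dbpedia.org/ontology/' + r for r in relation_types if r]
print(full_uris)
# ['http://dbpedia.org/ontology/spouse', 'http://dbpedia.org/ontology/birthPlace']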
Example #9
class FactExtractor(PatternTool):
    def __init__(self,
                 articles_limit,
                 use_dump=False,
                 randomize=False,
                 match_threshold=0.005,
                 type_matching=True,
                 allow_unknown_entity_types=True,
                 print_interim_results=True,
                 threads=4,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
                 facts_output_path=dir_path + '../results/extracted_facts.nt',
                 extended_facts_output_path=dir_path +
                 '../results/extracted_facts_extended.txt'):
        super(FactExtractor, self).__init__(patterns_input_path)
        self.articles_limit = articles_limit
        self.use_dump = use_dump
        self.allow_unknown_entity_types = allow_unknown_entity_types
        self.match_threshold = match_threshold
        self.type_matching = type_matching
        self.nt_reader = NTReader(resources_path, randomize)
        self.wikipedia_connector = WikipediaConnector(self.use_dump)
        self.pattern_extractor = PatternExtractor()
        self.pattern_matcher = PatternMatcher()
        self.print_interim_results = print_interim_results
        self.discovery_resources = set()
        self.extracted_facts = []
        self.threads = threads
        self.nt_writer = NTWriter(facts_output_path)
        self.extended_facts_output_path = extended_facts_output_path

        # self._make_pattern_types_transitive()
        self._load_discovery_resources()

    @classmethod
    def from_config_file(cls):
        config_parser = cls.get_config_parser()
        use_dump = config_parser.getboolean('general', 'use_dump')
        randomize = config_parser.getboolean('fact_extractor', 'randomize')
        articles_limit = config_parser.getint('fact_extractor',
                                              'articles_limit')
        match_threshold = config_parser.getfloat('fact_extractor',
                                                 'match_threshold')
        type_matching = config_parser.getboolean('fact_extractor',
                                                 'type_matching')
        allow_unknown_entity_types = config_parser.getboolean(
            'fact_extractor', 'allow_unknown_entity_types')
        num_of_threads = config_parser.getint('fact_extractor', 'threads')
        return cls(articles_limit,
                   use_dump,
                   randomize,
                   match_threshold,
                   type_matching,
                   allow_unknown_entity_types,
                   threads=num_of_threads)

    def _make_pattern_types_transitive(self):
        for relation, pattern in self.relation_type_patterns.iteritems():
            pattern.subject_type_frequencies = self.pattern_extractor \
                .get_transitive_types(pattern.subject_type_frequencies)
            pattern.object_type_frequencies = self.pattern_extractor \
                .get_transitive_types(pattern.object_type_frequencies)

    @staticmethod
    def flat_map(list_of_lists):
        return [item for list in list_of_lists for item in list]

    def _load_discovery_resources(self):
        article_counter = 0
        valid_types = set(
            FactExtractor.flat_map(
                self._get_specific_type_frequencies('subject').values()))

        self.logger.print_info('Collecting entities for fact extraction...')
        for subject, predicate, object in self.nt_reader.yield_entries():
            if article_counter == self.articles_limit:
                break
            if subject in self.training_resources or subject in self.discovery_resources:
                continue
            subject_types = set(
                self.pattern_extractor.get_entity_types(subject).keys())
            if (self.allow_unknown_entity_types and len(subject_types) == 0) \
                    or len(subject_types & valid_types) > 0:
                self.discovery_resources.add(subject)
                article_counter += 1

        self.logger.print_done(
            'Collecting entities for fact extraction completed: ' +
            str(len(self.discovery_resources)) + ' articles')

    def _match_pattern_against_relation_type_patterns(self, pattern,
                                                      reasonable_relations):
        matching_relations = []
        for relation in reasonable_relations:
            relation_pattern = self.relation_type_patterns[relation]
            match_score = self.pattern_matcher.match_patterns(
                relation, relation_pattern, pattern, self.type_matching,
                self.allow_unknown_entity_types)
            if match_score >= self.match_threshold:
                matching_relations.append((relation, match_score))
        return matching_relations

    def _filter_reasonable_relations(self, entity, types_of_relations):
        reasonable_relations = set()
        entity_types = self.pattern_extractor.get_entity_types(entity)
        if self.allow_unknown_entity_types and len(entity_types) == 0:
            reasonable_relations = set(types_of_relations.keys())
        else:
            for relation, types in types_of_relations.iteritems():
                assert types is not None
                # Otherwise types were not learned in the training step.
                # In this case you probably have to adjust the config file and rerun the training step.
                if len(entity_types & types) > 0:
                    reasonable_relations.add(relation)
        return reasonable_relations

    def _get_specific_type_frequencies(self, subject_or_object):
        if subject_or_object == 'subject':
            return {
                relation: pattern.subject_type_frequencies
                for relation, pattern in
                self.relation_type_patterns.iteritems()
            }
        elif subject_or_object == 'object':
            return {
                relation: pattern.object_type_frequencies
                for relation, pattern in
                self.relation_type_patterns.iteritems()
            }
        else:
            assert False

    def _extract_facts_from_sentences(self, sentences, subject_entity=None):
        facts = []
        if self.type_matching:
            reasonable_relations_for_subject = self._filter_reasonable_relations(
                subject_entity, self._get_specific_type_frequencies('subject'))
        for sentence in sentences:
            if sentence.number_of_tokens() > 50:
                continue  # probably too long for stanford tokenizer
            relative_position = sentence.relative_pos
            nl_sentence = sentence.as_string()
            object_addresses_of_links = sentence.addresses_of_dbpedia_links()
            for object_link, object_addresses in object_addresses_of_links.iteritems(
            ):
                object_entity = uri_rewriting.strip_name(object_link)
                if self.type_matching:
                    reasonable_relations_for_object = self._filter_reasonable_relations(
                        object_entity,
                        self._get_specific_type_frequencies('object'))
                    reasonable_relations = reasonable_relations_for_subject & reasonable_relations_for_object
                else:
                    reasonable_relations = self.relation_type_patterns

                if not len(reasonable_relations):
                    continue

                pattern = self.pattern_extractor.extract_pattern(
                    nl_sentence, object_addresses, relative_position,
                    self.type_matching, subject_entity, object_entity)
                if pattern is None:
                    continue

                matching_relations = self._match_pattern_against_relation_type_patterns(
                    pattern, reasonable_relations)
                new_facts = [(predicate, object_link, score, nl_sentence)
                             for (predicate, score) in matching_relations]
                facts.extend(new_facts)
        return facts

    def extract_facts_from_html(self, html, resource):
        tagged_sentences = TaggedSentence.from_html(html)
        referenced_sentences = filter(lambda sent: sent.contains_any_link(),
                                      tagged_sentences)
        if self.type_matching:
            subject_entity = uri_rewriting.strip_name(resource)
        else:
            subject_entity = None
        facts = self._extract_facts_from_sentences(referenced_sentences,
                                                   subject_entity)
        facts = [(resource, predicate, object, score, nl_sentence)
                 for (predicate, object, score, nl_sentence) in facts]
        if self.print_interim_results:
            for fact in facts:
                print(fact)
        return facts

    def _extract_facts_from_resource(self, chunk=None):
        self.logger.print_info('--- start fact extraction thread ----')
        if chunk is None:
            chunk = set()
        facts = []
        for resource in chunk:
            wikipedia_resource = uri_rewriting.convert_to_wikipedia_resource_uri(
                resource)
            self.logger.print_info('--- ' + wikipedia_resource + ' ----')
            html = self.wikipedia_connector.get_wikipedia_article_html(
                resource)
            facts.extend(self.extract_facts_from_html(html, resource))

        self.extracted_facts.extend(facts)

    @staticmethod
    def _chunks(data, size=10000):
        """ Yield successive n-sized chunks from input. """
        for i in range(0, len(data), size):
            yield data[i:i + size]

    def _remove_dead_objects(self):
        self.extracted_facts = filter(
            lambda (subject, predicate, object, score, nl_sentence):
            'redlink=1' not in object, self.extracted_facts)

    def extract_facts(self):
        self.logger.print_info('Fact extraction...')
        chunk_size = int(
            ceil(len(self.discovery_resources) / float(self.threads)))
        threads = []
        # gather resources for each thread
        for chunk in self._chunks(list(self.discovery_resources), chunk_size):
            t = Thread(target=self._extract_facts_from_resource,
                       kwargs={'chunk': chunk})
            threads.append(t)
            # start all threads
        for t in threads:
            t.start()
        # wait for all threads to finish
        for t in threads:
            t.join()
        self._remove_dead_objects()
        self.extracted_facts.sort(key=lambda fact: fact[3],
                                  reverse=True)  # sort by match score
        self.logger.print_done('Fact extraction completed')

    def save_extracted_facts(self):
        short_facts = [(subject, predicate, object)
                       for (subject, predicate, object, score,
                            nl_sentence) in self.extracted_facts]
        self.nt_writer.write_nt(short_facts)

        with codecs.open(self.extended_facts_output_path, 'wb',
                         'utf-8') as fout:
            self.logger.print_info('\n\nSaving extended facts to "' +
                                   self.extended_facts_output_path + '"...')
            for fact in tqdm(self.extracted_facts):
                fout.write(str(fact) + '\n')

    @property
    def training_relation_types(self):
        return self.relation_type_patterns.keys()

    def set_print_interim_results(self, boolean):
        self.print_interim_results = boolean
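
extract_facts distributes the discovery resources over worker threads via _chunks. A self-contained sketch of that chunk-and-thread pattern, with a toy work function in place of the Wikipedia fact extraction:

from math import ceil
from threading import Thread

def chunks(data, size):
    # yield successive size-sized chunks from the input list
    for i in range(0, len(data), size):
        yield data[i:i + size]

resources = ['resource_%d' % i for i in range(10)]
results = []

def process(chunk):
    processed = [r.upper() for r in chunk]   # toy per-resource work
    results.extend(processed)                # mirrors self.extracted_facts.extend(facts)

num_threads = 4
chunk_size = int(ceil(len(resources) / float(num_threads)))
threads = [Thread(target=process, kwargs={'chunk': c})
           for c in chunks(resources, chunk_size)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(sorted(results))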
Example #10
class PatternTester(ConfigInitializer):
    def __init__(self,
                 facts_limit,
                 randomize=False,
                 ground_truth_path=dir_path +
                 '../pattern_testing/ground_truth.ttl'):
        self.facts_limit = facts_limit
        self.randomize = randomize
        self.nt_reader = NTReader(ground_truth_path, randomize)
        self.logger = Logger.from_config_file()
        self.results = {}
        self.fact_extractor = None

        # count known, right and wrong facts for each relation_type
        self.known_facts_counter = Counter()
        self.right_facts_counter = Counter()
        self.wrong_facts_counter = Counter()

    @classmethod
    def from_config_file(cls):
        config_parser = cls.get_config_parser()
        facts_limit = config_parser.getint('pattern_testing', 'facts_limit')
        randomize = config_parser.getboolean('pattern_testing', 'randomize')
        return cls(facts_limit, randomize)

    def _collect_testing_facts(self):
        if self.fact_extractor is None:
            self.fact_extractor = FactExtractor.from_config_file()
            self.fact_extractor.set_print_interim_results(False)

        training_resources = self.fact_extractor.training_resources
        training_relations = self.fact_extractor.training_relation_types
        entities = dict()
        fact_counter = 0

        self.logger.print_info('Collecting facts for testing...')
        for subject, predicate, object in self.nt_reader.yield_entries():
            if fact_counter == self.facts_limit * len(training_relations):
                break
            if subject in training_resources:
                self.logger.print_error(
                    'Resource: "' + subject +
                    '" was already used for training and thus won\'t be used for testing'
                )
                continue
            if predicate not in training_relations:
                continue
            if self.known_facts_counter[predicate] == self.facts_limit:
                continue

            # maintain a dict for each entity with given relations as key
            # and their target values as list
            entities.setdefault(subject, []).append((predicate, object))
            self.known_facts_counter[predicate] += 1
            fact_counter += 1

        return entities

    def get_testing_resources(self):
        return set([
            subject
            for subject, predicate, object in self.nt_reader.yield_entries()
        ])

    def test_patterns(self):
        test_entities = self._collect_testing_facts()
        self.fact_extractor.discovery_resources = test_entities
        self.fact_extractor.extract_facts()

        for fact in self.fact_extractor.extracted_facts:
            print(fact)
            subject, predicate, object, score, nl_sentence = fact
            if (predicate, object) in test_entities[subject]:
                self.right_facts_counter[predicate] += 1
                print('Match')
            else:
                self.wrong_facts_counter[predicate] += 1
                print('No match')
            print('')

    @staticmethod
    def _calculate_f_measure(precision, recall):
        if precision is None or recall is None or precision + recall == 0:
            return None
        numerator = 2 * (precision * recall)
        return numerator / (precision + recall)

    @staticmethod
    def _soft_division(dividend, divisor):
        try:
            return dividend / float(divisor)
        except ZeroDivisionError:
            return None

    @staticmethod
    def _calculate_precision_recall_and_f_measure(total, right, wrong):
        precision = PatternTester._soft_division(right, right + wrong)
        recall = PatternTester._soft_division(right, total)
        f_measure = PatternTester._calculate_f_measure(precision, recall)
        return precision, recall, f_measure

    def print_results(self):
        for relation_type in self.fact_extractor.training_relation_types:
            total = self.known_facts_counter[relation_type]
            right = self.right_facts_counter[relation_type]
            wrong = self.wrong_facts_counter[relation_type]
            precision, recall, f_measure = PatternTester._calculate_precision_recall_and_f_measure(
                total, right, wrong)
            print(relation_type + ' Known facts:' + str(total) + ' Right:' +
                  str(right) + ' Wrong:' + str(wrong) + ' Precision:' +
                  str(precision) + ' Recall:' + str(recall) + ' F-Measure:' +
                  str(f_measure))
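
A worked example of the precision/recall/F-measure computation above: suppose one relation type has 40 known facts, and the extractor produced 30 right and 10 wrong facts for it.

total, right, wrong = 40, 30, 10
precision = right / float(right + wrong)                     # 30 / 40 = 0.75
recall = right / float(total)                                # 30 / 40 = 0.75
f_measure = 2 * (precision * recall) / (precision + recall)  # 0.75
print('precision=%s recall=%s f_measure=%s' % (precision, recall, f_measure))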
Example #11
class StatisticGenerator(object):
    def __init__(self,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 facts_limit=100000):
        # self.instance_types = EntityTypes(types_paths=["../data/types_en.csv"], types_index=False,
        #          types_indexed_file=False)
        self.instance_types = EntityTypes()
        self.resources_path = resources_path
        self.nt_reader = NTReader(resources_path, False)
        self.logger = Logger.from_config_file()
        self.delimiter = '#'
        self.predicates = dict()
        self.facts_limit = facts_limit

    def collect_predicates(self, facts_limit=100000):
        self.facts_limit = facts_limit
        self.predicates = dict()
        total_count = 0
        total_lines = min(
            line_counting.cached_counter.count_lines(self.resources_path),
            self.facts_limit)

        self.logger.print_info('Collecting facts for each predicate...')
        for subject, predicate, object in tqdm(self.nt_reader.yield_entries(),
                                               total=total_lines):
            total_count += 1
            if total_count > self.facts_limit:
                break

            subject = uri_rewriting.strip_cleaned_name(subject)
            object = uri_rewriting.strip_cleaned_name(object)

            self.predicates.setdefault(predicate,
                                       {}).setdefault(subject,
                                                      []).append(object)

    def count_types(self):
        subject_counts = []
        object_counts = []
        has_both = 0
        has_exact_one = 0
        has_nothing = 0
        facts = 0
        outlier_threshold = 100

        for predicate in tqdm(self.predicates, total=len(self.predicates)):
            for subject in self.predicates[predicate]:
                subject_types = self.instance_types.get_types(subject)

                for object in self.predicates[predicate][subject]:
                    object_types = self.instance_types.get_types(object)

                    facts += 1
                    if subject_types:
                        if len(subject_types) < outlier_threshold:
                            subject_counts.append(len(subject_types))
                    if object_types:
                        if len(object_types) < outlier_threshold:
                            object_counts.append(len(object_types))
                    if subject_types and object_types:
                        has_both += 1
                    if not subject_types and not object_types:
                        has_nothing += 1
                    if (len(subject_types) > 0) ^ (len(object_types) > 0):
                        has_exact_one += 1

        subject_counts = pd.Series(subject_counts)
        # subject_counts.plot.hist(bins=100)
        # plt.show()
        object_counts = pd.Series(object_counts)
        # object_counts.plot.hist(bins=100)
        # plt.show()
        self.logger.print_info('Facts: ' + str(facts))
        self.logger.print_info('With subject type: ' +
                               str(subject_counts.count()))
        self.logger.print_info('Mean subject type count: ' +
                               str(subject_counts.mean()))
        self.logger.print_info('Standard deviation subject type count: ' +
                               str(subject_counts.std()))
        self.logger.print_info('With object type: ' +
                               str(object_counts.count()))
        self.logger.print_info('Mean object type count: ' +
                               str(object_counts.mean()))
        self.logger.print_info('Standard deviation object type count: ' +
                               str(object_counts.std()))
        self.logger.print_info('Both with type(s): ' + str(has_both))
        self.logger.print_info('Exact one with type(s): ' + str(has_exact_one))
        self.logger.print_info('None with type(s): ' + str(has_nothing))

    def test_types_independence(self, expectation_threshold=10):
        variances = {}
        total_included_count = 0
        sum_avg_variance = 0
        empty_token = '#empty'

        self.logger.print_info(
            'Collecting subject and object types for each predicate and calculating independence score...'
        )
        for predicate in tqdm(self.predicates, total=len(self.predicates)):
            predicate_count = 0
            predicate_subject_types = Counter()
            predicate_object_types = Counter()
            combinations = Counter()
            for subject in self.predicates[predicate]:
                subject_types = self.instance_types.get_types(subject)
                # list.append returns None, so add the placeholder type in a separate statement
                subject_types.append(empty_token)
                for object in self.predicates[predicate][subject]:
                    # TODO: check for occurrence in Wikipedia article
                    # TODO: exclude double underscores
                    predicate_count += 1
                    object_types = self.instance_types.get_types(object)
                    predicate_subject_types.update(subject_types)
                    predicate_object_types.update(object_types)
                    cross_product = [(s, o) for s in subject_types
                                     for o in object_types]
                    combinations.update(cross_product)

            # print(predicate)
            variance = StatisticGenerator.calculate_independence_score(
                predicate_count, predicate_subject_types,
                predicate_object_types, combinations, expectation_threshold)
            if variance is None:
                continue

            variances[predicate] = variance
            sum_avg_variance += float(predicate_count) * variance
            total_included_count += predicate_count

        total_avg_variance = sum_avg_variance / total_included_count

        with open("types_independence_" + str(int(time.time())) + ".csv",
                  'wb') as csv_file:
            writer = unicodecsv.writer(csv_file, delimiter=self.delimiter)
            writer.writerow([
                "Threshold", expectation_threshold, "Facts count",
                self.facts_limit, "Avg variance", total_avg_variance
            ])
            for predicate, variance in sorted(variances.items(),
                                              key=operator.itemgetter(1)):
                writer.writerow([predicate, variance])
                print(predicate, " ", variance)

    def measure_type_diversity(self, threshold=2):
        subject_types_count = Counter()
        object_types_count = Counter()
        relation_subject_types = {}
        relation_object_types = {}

        facts = Counter()

        for predicate in tqdm(self.predicates, total=len(self.predicates)):
            relation_subject_types[predicate] = Counter()
            relation_object_types[predicate] = Counter()
            for subject in self.predicates[predicate]:
                subject_types = self.instance_types.get_types(subject)

                for object in self.predicates[predicate][subject]:
                    object_types = self.instance_types.get_types(object)

                    facts[predicate] += 1
                    for subject_type in subject_types:
                        relation_subject_types[predicate][subject_type] += 1
                        subject_types_count[subject_type] += 1
                    for object_type in object_types:
                        relation_object_types[predicate][object_type] += 1
                        object_types_count[object_type] += 1
                        # print(predicate, subject, object)

        subject_specs = StatisticGenerator.calculate_specifity(
            facts, subject_types_count, relation_subject_types)
        object_specs = StatisticGenerator.calculate_specifity(
            facts, object_types_count, relation_object_types)
        both_specs = {}
        for predicate in subject_specs:
            both_specs[predicate] = {}
            both_specs[predicate]["subject"] = subject_specs[predicate]
        for predicate in object_specs:
            both_specs.setdefault(predicate, {})
            both_specs[predicate]["object"] = object_specs[predicate]
        for predicate in both_specs:
            print(';'.join([
                predicate,
                str(both_specs[predicate].setdefault("subject", -1)),
                str(both_specs[predicate].setdefault("object", -1))
            ]))

    @staticmethod
    def calculate_specifity(facts, types, relation_types):
        total_facts = sum(facts.values())
        specifities = {}

        for predicate in relation_types:
            if len(set(relation_types[predicate])) == 0:
                continue

            deviations = 0
            for name, predicate_type_frequency in relation_types[
                    predicate].most_common():
                predicate_relative_frequency = float(
                    predicate_type_frequency) / facts[predicate]
                total_frequency = float(types[name] -
                                        predicate_type_frequency) / total_facts
                # print(name)
                # print(facts[predicate])
                # print(predicate_frequency)
                # print(predicate_relative_frequency)
                # print(total_frequency)
                assert abs(predicate_relative_frequency - total_frequency) <= 1
                deviations += predicate_type_frequency * abs(
                    predicate_relative_frequency - total_frequency)
            specifities[predicate] = float(deviations) / sum(
                relation_types[predicate].values())
        return specifities

    @staticmethod
    def calculate_independence_score(facts_count, subject_types, object_types,
                                     combinations, expectation_threshold):
        sum_rel_variance = 0
        included_combination_count = 0

        for combination, observed_count in combinations.most_common():
            subject, object = combination
            expected_count = float(
                subject_types[subject] * object_types[object]) / facts_count
            if expected_count < expectation_threshold:
                continue
            included_combination_count += observed_count
            rel_variance = (float(abs(observed_count - expected_count)) /
                            expected_count)
            sum_rel_variance += observed_count * rel_variance

        if included_combination_count == 0:
            return None
        return sum_rel_variance / included_combination_count
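
A toy illustration of calculate_independence_score: observed co-occurrence counts of (subject type, object type) pairs are compared with the counts expected if the two type distributions were independent, and the relative deviations are averaged, weighted by the observed counts (all numbers below are illustrative):

facts_count = 100
subject_type_counts = {'Person': 80, 'Organisation': 20}
object_type_counts = {'City': 50, 'Country': 50}
observed_combinations = {('Person', 'City'): 60, ('Person', 'Country'): 20,
                         ('Organisation', 'City'): 5, ('Organisation', 'Country'): 15}

expectation_threshold = 10
sum_rel_variance = 0.0
included_count = 0
for (s, o), observed in observed_combinations.items():
    expected = float(subject_type_counts[s] * object_type_counts[o]) / facts_count
    if expected < expectation_threshold:
        continue
    rel_variance = abs(observed - expected) / expected
    sum_rel_variance += observed * rel_variance
    included_count += observed
print(sum_rel_variance / included_count)  # 0.5 here; 0.0 would mean the types look independent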
Example #12
class TypeLearner(TypeTool):
    def __init__(self,
                 facts_path=dir_path + '../data/mappingbased_objects_en.ttl',
                 output_path=dir_path + '../data/type_patterns_raw.pkl',
                 facts_limit=False):
        super(TypeLearner, self).__init__(None, output_path)
        self.facts_path = facts_path
        self.output_path = output_path
        self.facts_limit = facts_limit if facts_limit > 0 else sys.maxint
        self.nt_reader = NTReader(facts_path)
        self.instance_types = EntityTypes()
        self.subjects = dict()
        self.objects = dict()
        self.type_patterns = dict()

    @classmethod
    def from_config_file(cls):
        config_parser = cls.get_config_parser()
        section = 'type_learner'
        facts_limit = config_parser.getint(section, 'facts_limit')
        return cls(facts_limit=facts_limit)

    @staticmethod
    def _update_entity_counter(entities, entity, predicate):
        entity = uri_rewriting.strip_cleaned_name(entity)
        entities.setdefault(entity, Counter())
        entities[entity][predicate] += 1

    def _count_predicates(self):
        total_lines = min(
            line_counting.cached_counter.count_lines(self.facts_path),
            self.facts_limit)
        facts_count = 0

        self.logger.print_info(
            'Counting relations for subjects and objects...')
        for subject, predicate, object in tqdm(self.nt_reader.yield_entries(),
                                               total=total_lines):
            facts_count += 1
            if facts_count > self.facts_limit:
                break

            self._update_entity_counter(self.subjects, subject, predicate)
            self._update_entity_counter(self.objects, object, predicate)

            self.type_patterns.setdefault(predicate, TypePattern())
            self.type_patterns[predicate].facts += 1

    def _get_types(self, entities):
        relations = dict()
        for entity in tqdm(entities, total=len(entities)):
            types = self.instance_types.get_types(entity)
            for predicate, quantity in entities[entity].iteritems():
                relations.setdefault(predicate, Counter())
                for type in types:
                    relations[predicate].update({type: quantity})
        return relations

    def _count_types(self):
        self.logger.print_info('Retrieving types for subjects...')
        subject_types = self._get_types(self.subjects)
        self.logger.print_info('Cumulating subject types for relations...')
        for predicate in tqdm(subject_types, total=len(subject_types)):
            self.type_patterns[predicate].subject_types += subject_types[
                predicate]

        self.logger.print_info('Retrieving types for objects...')
        object_types = self._get_types(self.objects)
        self.logger.print_info('Cumulating object types for relations...')
        for predicate in tqdm(object_types, total=len(object_types)):
            self.type_patterns[predicate].object_types += object_types[
                predicate]

    def learn_types(self):
        self.logger.print_info('Type learning...')
        self._count_predicates()
        self._count_types()
        self.logger.print_done('Type learning completed.')
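
The type learning step boils down to two Counter passes: count, per entity, how often each predicate occurs, then fan those counts out over the entity's types. A self-contained sketch with illustrative entities and types:

from collections import Counter

entity_predicate_counts = {'Alice': Counter({'dbo:spouse': 2}),
                           'Berlin': Counter({'dbo:birthPlace': 3})}
entity_types = {'Alice': ['Person', 'Agent'], 'Berlin': ['City', 'Place']}  # illustrative types

relations = dict()
for entity, predicate_counts in entity_predicate_counts.items():
    for predicate, quantity in predicate_counts.items():
        relations.setdefault(predicate, Counter())
        for type_name in entity_types.get(entity, []):
            relations[predicate].update({type_name: quantity})
print(relations['dbo:birthPlace'])  # Counter({'City': 3, 'Place': 3})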
Example #13
class WikipediaPatternExtractor(PatternTool):
    def __init__(self,
                 relation_types_limit,
                 facts_limit,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 relation_types=None,
                 use_dump=False,
                 randomize=False,
                 perform_tests=False,
                 type_learning=True,
                 replace_redirects=False,
                 patterns_output_path=dir_path + '../data/patterns_raw.pkl',
                 threads=4):
        super(WikipediaPatternExtractor,
              self).__init__(None, patterns_output_path)
        self.use_dump = use_dump
        self.facts_limit = facts_limit
        self.perform_tests = perform_tests
        self.type_learning = type_learning
        self.wikipedia_connector = WikipediaConnector(
            use_dump=self.use_dump, redirect=replace_redirects)
        self.pattern_extractor = PatternExtractor()
        self.num_of_threads = threads
        self.nt_reader = NTReader(resources_path, randomize)
        self.logger = Logger.from_config_file()

        if relation_types is not None and len(relation_types) > 0:
            self.relation_types = [
                'http://dbpedia.org/ontology/' + r for r in relation_types if r
            ]
            self.relation_types_limit = len(self.relation_types)
        else:
            self.relation_types = None  # means any relation may be learned
            self.relation_types_limit = relation_types_limit

        self.dbpedia = {}
        self.matches = []

    @classmethod
    def from_config_file(cls):
        config_parser = cls.get_config_parser()
        use_dump = config_parser.getboolean('general', 'use_dump')

        section = 'wikipedia_pattern_extractor'
        randomize = config_parser.getboolean(section, 'randomize')
        perform_tests = config_parser.getboolean(section, 'perform_tests')
        relation_types_limit = config_parser.getint(section,
                                                    'relation_types_limit')
        facts_limit = config_parser.getint(section, 'facts_limit')
        replace_redirects = config_parser.getboolean(section,
                                                     'replace_redirects')
        type_learning = config_parser.getboolean(section, 'type_learning')
        threads = config_parser.getint(section, 'threads')

        relation_types = config_parser.get(section, 'relation_types')
        relation_types = WikipediaPatternExtractor.split_string_list(
            relation_types)
        relation_types = filter(lambda rt: rt != '' and ';' not in rt,
                                relation_types)  # filter comments

        return cls(relation_types_limit,
                   facts_limit,
                   relation_types=relation_types,
                   use_dump=use_dump,
                   randomize=randomize,
                   threads=threads,
                   perform_tests=perform_tests,
                   replace_redirects=replace_redirects,
                   type_learning=type_learning)

    @staticmethod
    def split_string_list(string):
        return string.split(',')

    # -------------------------------------------------------------------------------------------------
    #                               Data Preprocessing
    # -------------------------------------------------------------------------------------------------

    def parse_dbpedia_data(self):
        """
        Takes all DBpedia ontology relations (subj verb target) stored in file_name
        and returns a dictionary with subjects as keys and all of their related information
        as dict values.
        more precisely {subj: { verb1: [val1, val2, val3...],
                                verb2: [val1, ...]
                            }
                        }
        """
        entities = dict()
        relation_types_counter = Counter()
        fact_counter = 0
        testing_resources = PatternTester.from_config_file(
        ).get_testing_resources()

        self.logger.print_info('Collecting facts for training...')
        for subject, predicate, object in self.nt_reader.yield_entries():
            if fact_counter == self.facts_limit * self.relation_types_limit:
                break
            if len(
                    relation_types_counter
            ) == self.relation_types_limit and predicate not in relation_types_counter:
                continue
            if relation_types_counter[predicate] == self.facts_limit:
                continue
            if self.relation_types is not None and predicate not in self.relation_types:
                continue
            if subject in testing_resources:
                continue

            # maintain a dict for each entity with given relations as key
            # and their target values as list
            entities.setdefault(subject, {}).setdefault(predicate,
                                                        []).append(object)
            relation_types_counter[predicate] += 1
            fact_counter += 1
        self.logger.print_done('Collecting facts for training completed')

        self.logger.print_info('Relation types:')
        most_common_relation_types = relation_types_counter.most_common()
        for i in range(len(most_common_relation_types)):
            relation_type, frequency = most_common_relation_types[i]
            print('\t' + str(i + 1) + ':\t' + str(frequency) + ' x\t' +
                  relation_type).expandtabs(10)

        return entities

    @staticmethod
    def _chunks(data, size=10000):
        """
        Helper function to divide data evenly for all threads
        """
        it = iter(data)
        for i in xrange(0, len(data), size):
            yield {k: data[k] for k in islice(it, size)}

    def tag_sentences(self, chunk=None):
        if chunk is None:
            chunk = {}
        for entity, values in chunk.iteritems():
            # for each relationship filter sentences that contain
            # target resources of entity's relationship
            for rel, resources in values.iteritems():
                wikipedia_target_resources = map(
                    uri_rewriting.convert_to_internal_wikipedia_link,
                    resources)
                # retrieve tokenized wikipedia sentences that include DBpedia resources that we are looking for
                tagged_sentences = self.wikipedia_connector.get_filtered_wikipedia_article(
                    entity, wikipedia_target_resources)
                values[rel] = {
                    'resources': wikipedia_target_resources,
                    'sentences': tagged_sentences,
                    'patterns': []
                }

    def discover_patterns(self):
        """
        Preprocesses data (initializing main data structure)
        1. Filter relevant DBpedia facts by relationships
        2. Turn DBpedia data into in-memory dictionary where all processing takes place
        3. Fetch relevant Wikipedia articles and filter relevant sentences out of html text (for link search)
        4. Data is stored in self.dbpedia
        """
        # parse dbpedia information
        self.dbpedia = self.parse_dbpedia_data()
        self.logger.print_info('Sentence Extraction...')
        threads = []
        chunk_size = int(ceil(len(self.dbpedia) / float(self.num_of_threads)))
        # gather all arguments for each thread
        for chunk in WikipediaPatternExtractor._chunks(self.dbpedia,
                                                       chunk_size):
            t = Thread(target=self.tag_sentences, kwargs={'chunk': chunk})
            threads.append(t)
        # start all threads
        for x in threads:
            x.start()
        # Wait for all threads to finish
        for x in threads:
            x.join()

    def extract_entity_patterns(self, chunk=None):
        if chunk is None:
            chunk = {}
        color_mapping = {
            'magenta': ['NN', 'NNS'],
            'green': ['NNP', 'NNPS'],
            'cyan': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            'yellow': ['JJ', 'JJR', 'JJS']
        }
        # reverse color mapping
        color_mapping = {
            v: k
            for k, values in color_mapping.iteritems() for v in values
        }
        for entity, relations in chunk.iteritems():
            cleaned_subject_entity_name = uri_rewriting.strip_cleaned_name(
                entity)
            subject_entity = uri_rewriting.strip_name(entity)
            for rel_ontology, values in relations.iteritems():
                target_resources = values['resources']
                sentences = values['sentences']
                rel_ontology = rel_ontology.split('/')[-1]
                data = [{
                    'entity': cleaned_subject_entity_name,
                    'relation': rel_ontology,
                    'resource': res,
                    'sentence': sent
                } for res in target_resources for sent in sentences
                        if sent.contains_any_link([res]) and res != entity]

                # remove needless sentence information based on relation facts
                # data = map(self.shorten_sentence, data)
                # POS tag sentences
                for entry in data:
                    sentence = entry['sentence']
                    if sentence.number_of_tokens() > 50:
                        continue  # probably too long for stanford tokenizer
                    resource = entry['resource']
                    nl_sentence = sentence.as_string()
                    relative_position = sentence.relative_pos
                    entry['nl sentence'] = nl_sentence
                    tokenized_sentences = map(word_tokenize, [nl_sentence])
                    pos_tagged_sentences = pos_tag_sents(
                        tokenized_sentences).pop()

                    object_addresses = sentence.addresses_of_link(resource)
                    object_entity = uri_rewriting.strip_name(resource)
                    pattern = self.pattern_extractor.extract_pattern(
                        nl_sentence, object_addresses, relative_position,
                        self.type_learning, subject_entity, object_entity)
                    if pattern is not None:
                        values['patterns'].append(pattern)
                        entry['pattern'] = pattern

                    # color sentence parts according to POS tag
                    colored_sentence = [
                        colored(word, color_mapping.setdefault(pos, 'white'))
                        for word, pos in pos_tagged_sentences
                    ]
                    colored_sentence = ' '.join(colored_sentence)
                    colored_sentence = re.sub(
                        r' (.\[\d+m),', ',',
                        colored_sentence)  # remove space before commas
                    entry['colored_sentence'] = colored_sentence

                self.matches.extend(data)

    # ---------------------------------------------------------------------------------------------
    #                               Statistics and Visualizations
    # ---------------------------------------------------------------------------------------------

    def extract_patterns(self):
        self.logger.print_info('Pattern extraction...')
        threads = []
        chunk_size = int(ceil(len(self.dbpedia) / float(self.num_of_threads)))
        # gather all arguments for each thread
        for chunk in WikipediaPatternExtractor._chunks(self.dbpedia,
                                                       chunk_size):
            t = Thread(target=self.extract_entity_patterns,
                       kwargs={'chunk': chunk})
            threads.append(t)
        # start all threads
        for x in threads:
            x.start()

        # Wait for all threads to finish
        for x in threads:
            x.join()
        # drop duplicates
        self.matches.sort()
        self.matches = list(x for x, _ in itertools.groupby(self.matches))
        self.logger.print_done('Pattern extraction completed')

    def print_occurrences(self):
        """
        Prints each occurrence of a given DBpedia fact with their corresponding and matched sentence.
        The matched sentence is POS tagges using maxent treebank pos tagging model.
        Nouns, verbs and adjectives are printed in colour.
        """

        for entry in self.matches:
            if not entry.get('colored_sentence', None):
                continue
            print(
                colored(
                    '[DBP Entity] \t', 'red', attrs={'concealed', 'bold'}) +
                colored(entry['entity'], 'white')).expandtabs(20)
            print(
                colored(
                    '[DBP Ontology] \t', 'red', attrs={'concealed', 'bold'}) +
                colored(entry['relation'], 'white')).expandtabs(20)
            print(
                colored(
                    '[DBP Resource] \t', 'red', attrs={'concealed', 'bold'}) +
                colored(uri_rewriting.strip_cleaned_name(entry['resource']),
                        'white')).expandtabs(20)
            print(
                colored(
                    '[Wiki Occurrence] \t', 'red', attrs={'concealed', 'bold'})
                + entry['colored_sentence']).expandtabs(20)
            print('')

        print('[POS KEY]\t' + colored('NORMAL NOUN\t', 'magenta') +
              colored('PROPER NOUN\t', 'green') + colored('VERB\t', 'cyan') +
              colored('ADJ\t', 'yellow')).expandtabs(20)

    def count_matches(self):
        matches_count = {}
        for relation, pattern in self.relation_type_patterns.iteritems():
            matches_count[relation] = pattern.covered_sentences
        return matches_count

    def calculate_text_coverage(self):
        """
        Prints CLI stats about percentage of matched dbpedia facts in wiki raw text.
        """
        matched_count = self.count_matches()
        total_count = {}
        for entity, relation_types in self.dbpedia.iteritems():
            for relation, values in relation_types.iteritems():
                target_resources = values.get('resources', [])
                total_count.setdefault(relation, 0)
                total_count[relation] += len(target_resources)

        occurrence_count = {}
        for relation in total_count:
            occurrence_count[relation] = {
                'total':
                total_count[relation],
                'matched':
                min(total_count[relation],
                    matched_count.setdefault(relation, 0))
            }  # a fact may occur several times in an article, which would otherwise yield a coverage above 100%

        # print bar chart
        data = [
            ('%  ' + str(vals['matched']) + '/' + str(vals['total']) + ' ' +
             rel.split('/')[-1], float(vals['matched']) / vals['total'] * 100)
            for rel, vals in occurrence_count.iteritems()
        ]
        graph = Pyasciigraph()
        for line in graph.graph('occurred facts in percentage', data):
            print(line)

    def merge_patterns(self):
        self.logger.print_info('Pattern merging...')
        for entity, relations in tqdm(self.dbpedia.iteritems()):
            for rel, values in relations.iteritems():
                for pattern in values['patterns']:
                    if rel in self.relation_type_patterns:
                        self.relation_type_patterns[rel] = Pattern._merge(
                            self.relation_type_patterns[rel], pattern,
                            self.perform_tests)
                    else:
                        self.relation_type_patterns[rel] = pattern
        self.logger.print_done('Pattern merging completed.')

    def save_patterns(self):
        self.training_resources = set(self.dbpedia.keys())
        super(WikipediaPatternExtractor, self).save_patterns()
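
Putting this last example together, a hypothetical end-to-end driver could look like the sketch below; it assumes the surrounding project modules, config file and data dumps are available, so it is not runnable on its own.

# Hypothetical driver for the pattern extraction pipeline shown above.
extractor = WikipediaPatternExtractor.from_config_file()
extractor.discover_patterns()        # fetch articles, filter relevant sentences
extractor.extract_patterns()         # POS-tag sentences and build raw patterns
extractor.merge_patterns()           # merge per-sentence patterns per relation type
extractor.print_occurrences()        # optional: colourised matched sentences
extractor.calculate_text_coverage()  # optional: coverage bar chart
extractor.save_patterns()            # persist the learned patterns (patterns_output_path)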