Example #1
def test_parallel(start):
    pairs_file = open(os.path.join(os.pardir, "data", "pairs.pkl"), 'rb')
    pairs = pickle.load(pairs_file)
    pairs_file.close()

    synonyms_file = open(os.path.join(os.pardir, "data", "synonyms.pkl"), 'rb')
    synonyms = pickle.load(synonyms_file)
    synonyms_file.close()
    pattern_matcher = PatternMatcher()
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******'.format(sys.argv[1]),
                                 db='stackoverflow')
    with connection.cursor() as cursor:
        sql = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
            start, start + 100)
        cursor.execute(sql)
        for i in range(cursor.rowcount):
            # post_count += 1
            current_id, row = cursor.fetchone()
            word_list = get_words(row)
            # total_sent_count += len(word_list)

            for words in word_list:
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    words = rtn[0].split(" ")
                    pattern_matcher.match_pattern(words, current_id, rtn[1],
                                                  "keytechs")
Example #2
    def __init__(self,
                 articles_limit,
                 use_dump=False,
                 randomize=False,
                 match_threshold=0.005,
                 type_matching=True,
                 allow_unknown_entity_types=True,
                 print_interim_results=True,
                 threads=4,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
                 facts_output_path=dir_path + '../results/extracted_facts.nt',
                 extended_facts_output_path=dir_path +
                 '../results/extracted_facts_extended.txt'):
        super(FactExtractor, self).__init__(patterns_input_path)
        self.articles_limit = articles_limit
        self.use_dump = use_dump
        self.allow_unknown_entity_types = allow_unknown_entity_types
        self.match_threshold = match_threshold
        self.type_matching = type_matching
        self.nt_reader = NTReader(resources_path, randomize)
        self.wikipedia_connector = WikipediaConnector(self.use_dump)
        self.pattern_extractor = PatternExtractor()
        self.pattern_matcher = PatternMatcher()
        self.print_interim_results = print_interim_results
        self.discovery_resources = set()
        self.extracted_facts = []
        self.threads = threads
        self.nt_writer = NTWriter(facts_output_path)
        self.extended_facts_output_path = extended_facts_output_path

        # self._make_pattern_types_transitive()
        self._load_discovery_resources()
Example #3
    def __init__(self, nlp):
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'including'},
                   {'POS': 'NOUN'}]
        self.__matcherId = "including"
        PatternMatcher.__init__(self, pattern, nlp, self.__matcherId)
Example #4
def extract_results(labels, original_data, review_bodys):
    # print number of elements in each cluster
    cluster_counts = Counter(labels)
    print(cluster_counts)

    clusters = {}
    pattern_matcher = PatternMatcher()
    # find and print dbscan result on actual text data - review_bodys
    for i in set(labels):
        if i != -1:  # do not print if noise (-1)
            clusters[i] = []
            print(i, "----")
            for x in range(len(review_bodys)):
                if labels[x] == i:

                    print(">>>", (review_bodys[x]))

                    sentence = get_review(review_bodys[x])
                    matches = pattern_matcher.find_matches(sentence)
                    clusters[i].append((review_bodys[x][0], sentence, matches))

                    print(clusters[i], "\n")
    print(clusters)
    input()
    return pattern_matcher.extract_objects(clusters)
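
For context, the labels consumed by extract_results typically come from scikit-learn's DBSCAN, where -1 marks noise points; the sketch below shows one way such labels might be produced. The TF-IDF vectorization, parameters, and sample texts are illustrative assumptions, not taken from the original code.

# Hypothetical way to produce `labels`; vectorizer, parameters, and texts are assumptions.
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["battery drains fast", "battery dies quickly", "great camera quality"]
vectors = TfidfVectorizer().fit_transform(texts)
labels = DBSCAN(eps=0.9, min_samples=2, metric='cosine').fit_predict(vectors)
print(labels)  # e.g. [ 0  0 -1]; cluster ids plus -1 for noise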
Example #5
    def __init__(self, nlp):
        pattern = [
            {'POS': 'NOUN'},
            {'LOWER': 'and'},
            {'LOWER': 'other'},
            {'POS': 'NOUN'},
        ]
        self.__matcherId = "andOther"
        PatternMatcher.__init__(self, pattern, nlp, self.__matcherId)
Example #6
    def __init__(self, nlp):
        pattern = [
            {'POS': 'NOUN'},
            {'IS_PUNCT': True, 'OP': '?'},
            {'LOWER': 'especially'},
            {'POS': 'NOUN'},
        ]
        self.__matcherId = 'especially'
        PatternMatcher.__init__(self, pattern, nlp, self.__matcherId)
Example #7
    def __init__(self, nlp):
        pattern = [{
            'POS': 'NOUN'
        }, {
            'LOWER': 'and'
        }, {
            'LOWER': 'other'
        }, {
            'POS': 'NOUN'
        }]
        PatternMatcher.__init__(self, pattern, nlp, "andOther")
Example #8
def with_patterns_to_pages(query, word):
    # ""で厳密なマッチャで検索する。
    # word => "で", "に" など
    pm = PatternMatcher('"' + word + query + '"')
    pages = pm.bing_search()
    for page in pages:
        page.build_keyword(word + query)
        page.pattern_word = word
        page.query = query
    # Exclude pages whose page.keyword is '' from the list returned at the end.
    return [page for page in pages if page.keyword]
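
A hypothetical call of the function above; the query string is illustrative, and bing_search() requires the project's Bing API configuration, which is not shown here.

# Hypothetical usage; '花粉症' (hay fever) is an illustrative query.
pages = with_patterns_to_pages('花粉症', 'で')   # exact-phrase search for "で花粉症"
for page in pages:
    print(page.keyword, page.pattern_word, page.query)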
Example #9
    def __init__(self, nlp):
        pattern = [{
            'POS': 'NOUN'
        }, {
            'IS_PUNCT': True,
            'OP': '?'
        }, {
            'LOWER': 'such'
        }, {
            'LOWER': 'as'
        }, {
            'POS': 'NOUN'
        }]
        PatternMatcher.__init__(self, pattern, nlp, "suchAs")
Example #10
    def __init__(self, nlp):
        '''
        Look for structures containing 4 tokens:
        1. the first token is a NOUN (POS stands for Part-Of-Speech),
        2. the second token is "and",
        3. the third is "other",
        4. and the last token is also a NOUN.
        '''

        pattern = [{
            'POS': 'NOUN'
        }, {
            'LOWER': 'and'
        }, {
            'LOWER': 'other'
        }, {
            'POS': 'NOUN'
        }]
        PatternMatcher.__init__(self, pattern, nlp, "andOther")
Example #11
class FactExtractor(PatternTool):
    def __init__(self,
                 articles_limit,
                 use_dump=False,
                 randomize=False,
                 match_threshold=0.005,
                 type_matching=True,
                 allow_unknown_entity_types=True,
                 print_interim_results=True,
                 threads=4,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
                 facts_output_path=dir_path + '../results/extracted_facts.nt',
                 extended_facts_output_path=dir_path +
                 '../results/extracted_facts_extended.txt'):
        super(FactExtractor, self).__init__(patterns_input_path)
        self.articles_limit = articles_limit
        self.use_dump = use_dump
        self.allow_unknown_entity_types = allow_unknown_entity_types
        self.match_threshold = match_threshold
        self.type_matching = type_matching
        self.nt_reader = NTReader(resources_path, randomize)
        self.wikipedia_connector = WikipediaConnector(self.use_dump)
        self.pattern_extractor = PatternExtractor()
        self.pattern_matcher = PatternMatcher()
        self.print_interim_results = print_interim_results
        self.discovery_resources = set()
        self.extracted_facts = []
        self.threads = threads
        self.nt_writer = NTWriter(facts_output_path)
        self.extended_facts_output_path = extended_facts_output_path

        # self._make_pattern_types_transitive()
        self._load_discovery_resources()

    @classmethod
    def from_config_file(cls):
        config_parser = cls.get_config_parser()
        use_dump = config_parser.getboolean('general', 'use_dump')
        randomize = config_parser.getboolean('fact_extractor', 'randomize')
        articles_limit = config_parser.getint('fact_extractor',
                                              'articles_limit')
        match_threshold = config_parser.getfloat('fact_extractor',
                                                 'match_threshold')
        type_matching = config_parser.getboolean('fact_extractor',
                                                 'type_matching')
        allow_unknown_entity_types = config_parser.getboolean(
            'fact_extractor', 'allow_unknown_entity_types')
        num_of_threads = config_parser.getint('fact_extractor', 'threads')
        return cls(articles_limit,
                   use_dump,
                   randomize,
                   match_threshold,
                   type_matching,
                   allow_unknown_entity_types,
                   threads=num_of_threads)

    def _make_pattern_types_transitive(self):
        for relation, pattern in self.relation_type_patterns.iteritems():
            pattern.subject_type_frequencies = self.pattern_extractor \
                .get_transitive_types(pattern.subject_type_frequencies)
            pattern.object_type_frequencies = self.pattern_extractor \
                .get_transitive_types(pattern.object_type_frequencies)

    @staticmethod
    def flat_map(list_of_lists):
        return [item for list in list_of_lists for item in list]

    def _load_discovery_resources(self):
        article_counter = 0
        valid_types = set(
            FactExtractor.flat_map(
                self._get_specific_type_frequencies('subject').values()))

        self.logger.print_info('Collecting entities for fact extraction...')
        for subject, predicate, object in self.nt_reader.yield_entries():
            if article_counter == self.articles_limit:
                break
            if subject in self.training_resources or subject in self.discovery_resources:
                continue
            subject_types = set(
                self.pattern_extractor.get_entity_types(subject).keys())
            if (self.allow_unknown_entity_types and len(subject_types) == 0) \
                    or len(subject_types & valid_types) > 0:
                self.discovery_resources.add(subject)
                article_counter += 1

        self.logger.print_done(
            'Collecting entities for fact extraction completed: ' +
            str(len(self.discovery_resources)) + ' articles')

    def _match_pattern_against_relation_type_patterns(self, pattern,
                                                      reasonable_relations):
        matching_relations = []
        for relation in reasonable_relations:
            relation_pattern = self.relation_type_patterns[relation]
            match_score = self.pattern_matcher.match_patterns(
                relation, relation_pattern, pattern, self.type_matching,
                self.allow_unknown_entity_types)
            if match_score >= self.match_threshold:
                matching_relations.append((relation, match_score))
        return matching_relations

    def _filter_reasonable_relations(self, entity, types_of_relations):
        reasonable_relations = set()
        entity_types = self.pattern_extractor.get_entity_types(entity)
        if self.allow_unknown_entity_types and len(entity_types) == 0:
            reasonable_relations = set(types_of_relations.keys())
        else:
            for relation, types in types_of_relations.iteritems():
                assert types is not None
                # Otherwise types were not learned in the training step.
                # In this case you probably have to adjust the config file and rerun the training step.
                if len(entity_types & types) > 0:
                    reasonable_relations.add(relation)
        return reasonable_relations

    def _get_specific_type_frequencies(self, subject_or_object):
        if subject_or_object == 'subject':
            return {
                relation: pattern.subject_type_frequencies
                for relation, pattern in
                self.relation_type_patterns.iteritems()
            }
        elif subject_or_object == 'object':
            return {
                relation: pattern.object_type_frequencies
                for relation, pattern in
                self.relation_type_patterns.iteritems()
            }
        else:
            assert False

    def _extract_facts_from_sentences(self, sentences, subject_entity=None):
        facts = []
        if self.type_matching:
            reasonable_relations_for_subject = self._filter_reasonable_relations(
                subject_entity, self._get_specific_type_frequencies('subject'))
        for sentence in sentences:
            if sentence.number_of_tokens() > 50:
                continue  # probably too long for stanford tokenizer
            relative_position = sentence.relative_pos
            nl_sentence = sentence.as_string()
            object_addresses_of_links = sentence.addresses_of_dbpedia_links()
            for object_link, object_addresses in object_addresses_of_links.iteritems(
            ):
                object_entity = uri_rewriting.strip_name(object_link)
                if self.type_matching:
                    reasonable_relations_for_object = self._filter_reasonable_relations(
                        object_entity,
                        self._get_specific_type_frequencies('object'))
                    reasonable_relations = reasonable_relations_for_subject & reasonable_relations_for_object
                else:
                    reasonable_relations = self.relation_type_patterns

                if not len(reasonable_relations):
                    continue

                pattern = self.pattern_extractor.extract_pattern(
                    nl_sentence, object_addresses, relative_position,
                    self.type_matching, subject_entity, object_entity)
                if pattern is None:
                    continue

                matching_relations = self._match_pattern_against_relation_type_patterns(
                    pattern, reasonable_relations)
                new_facts = [(predicate, object_link, score, nl_sentence)
                             for (predicate, score) in matching_relations]
                facts.extend(new_facts)
        return facts

    def extract_facts_from_html(self, html, resource):
        tagged_sentences = TaggedSentence.from_html(html)
        referenced_sentences = filter(lambda sent: sent.contains_any_link(),
                                      tagged_sentences)
        if self.type_matching:
            subject_entity = uri_rewriting.strip_name(resource)
        else:
            subject_entity = None
        facts = self._extract_facts_from_sentences(referenced_sentences,
                                                   subject_entity)
        facts = [(resource, predicate, object, score, nl_sentence)
                 for (predicate, object, score, nl_sentence) in facts]
        if self.print_interim_results:
            for fact in facts:
                print(fact)
        return facts

    def _extract_facts_from_resource(self, chunk=None):
        self.logger.print_info('--- start fact extraction thread ----')
        if chunk is None:
            chunk = set()
        facts = []
        for resource in chunk:
            wikipedia_resource = uri_rewriting.convert_to_wikipedia_resource_uri(
                resource)
            self.logger.print_info('--- ' + wikipedia_resource + ' ----')
            html = self.wikipedia_connector.get_wikipedia_article_html(
                resource)
            facts.extend(self.extract_facts_from_html(html, resource))

        self.extracted_facts.extend(facts)

    @staticmethod
    def _chunks(data, size=10000):
        """ Yield successive n-sized chunks from input. """
        for i in range(0, len(data), size):
            yield data[i:i + size]

    def _remove_dead_objects(self):
        self.extracted_facts = filter(
            lambda (subject, predicate, object, score, nl_sentence):
            'redlink=1' not in object, self.extracted_facts)

    def extract_facts(self):
        self.logger.print_info('Fact extraction...')
        chunk_size = int(ceil(len(self.discovery_resources) / self.threads))
        threads = []
        # gather resources for each thread
        for chunk in self._chunks(list(self.discovery_resources), chunk_size):
            t = Thread(target=self._extract_facts_from_resource,
                       kwargs={'chunk': chunk})
            threads.append(t)
            # start all threads
        for t in threads:
            t.start()
        # wait for all threads to finish
        for t in threads:
            t.join()
        self._remove_dead_objects()
        self.extracted_facts.sort(key=lambda fact: fact[3], reverse=True)  # sort by match score
        self.logger.print_done('Fact extraction completed')

    def save_extracted_facts(self):
        short_facts = [(subject, predicate, object)
                       for (subject, predicate, object, score,
                            nl_sentence) in self.extracted_facts]
        self.nt_writer.write_nt(short_facts)

        with codecs.open(self.extended_facts_output_path, 'wb',
                         'utf-8') as fout:
            self.logger.print_info('\n\nSaving extended facts to "' +
                                   self.extended_facts_output_path + '"...')
            for fact in tqdm(self.extracted_facts):
                fout.write(str(fact) + '\n')

    @property
    def training_relation_types(self):
        return self.relation_type_patterns.keys()

    def set_print_interim_results(self, boolean):
        self.print_interim_results = boolean
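
A minimal driver sketch using only the methods defined above; it assumes the configuration file read by from_config_file() is already in place.

# Hypothetical driver; assumes the config file read by get_config_parser() exists.
if __name__ == '__main__':
    extractor = FactExtractor.from_config_file()
    extractor.extract_facts()
    extractor.save_extracted_facts()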
Example #12
        try:
            keyword = page.noun_before_query(page.snippet,
                                             constants.ACTION_WORD_IN_QUERY)
        except (ValueError, IndexError):
            continue
        if keyword:
            keywords.add(keyword)
    print(keywords)

    # Also include the 〜〜 of "〜〜を使う" ("use 〜〜") in the final results.
    results_dic = {}
    for keyword in keywords:
        results_dic[keyword] = set()

    for keyword in keywords:
        pm = PatternMatcher(constants.QUERY + ' "' + constants.SO_CALLED +
                            keyword + '"')
        keyword_pages = pm.bing_search()
        for page in keyword_pages:
            try:
                result = page.noun_before_query(page.snippet,
                                                constants.SO_CALLED + keyword)
            except (ValueError, IndexError):
                continue
            if result:
                # results_dic[keyword] goes from set() to e.g. set(['アレロック', 'アルガード'])
                results_dic[keyword].add(result)

    with open(constants.PICKLE_RESULT_DICT_NAME, 'wb') as f:
        pickle.dump(results_dic, f)

    pdb.set_trace()
Example #13
def search_web_pages(query):
    pm = PatternMatcher(query)
    pages = pm.bing_search()
    return pages
Example #14
def search_web_pages(query):
    pm = PatternMatcher(query)
    pages = pm.bing_search()
    return pages
Example #15
    for page in pages:
        try:
            keyword = page.noun_before_query(page.snippet, constants.ACTION_WORD_IN_QUERY)
        except (ValueError, IndexError):
            continue
        if keyword:
            keywords.add(keyword)
    print(keywords)

    # Also include the 〜〜 of "〜〜を使う" ("use 〜〜") in the final results.
    results_dic = {}
    for keyword in keywords:
        results_dic[keyword] = set()

    for keyword in keywords:
        pm = PatternMatcher(constants.QUERY + ' "' + constants.SO_CALLED + keyword + '"')
        keyword_pages = pm.bing_search()
        for page in keyword_pages:
            try:
                result = page.noun_before_query(page.snippet, constants.SO_CALLED + keyword)
            except (ValueError, IndexError):
                continue
            if result:
                # results_dic[keyword] goes from set() to e.g. set(['アレロック', 'アルガード'])
                results_dic[keyword].add(result)

    with open(constants.PICKLE_RESULT_DICT_NAME, 'wb') as f:
        pickle.dump(results_dic, f)

    pdb.set_trace()
Example #16
import sys
from pattern_matcher import PatternMatcher, PatternNode


if __name__ == '__main__':
    _input = map(lambda line: line.rstrip('\n'), sys.stdin)

    total_patterns = int(_input[0])
    patterns = _input[1 : total_patterns + 1]
    pattern_matcher = PatternMatcher()
    for pattern in patterns:
        pattern_matcher.insert(pattern)

    total_paths = int(_input[total_patterns+1])
    path_starts = total_patterns + 2
    path_ends = path_starts + total_paths
    paths = map(lambda path: path.strip('/'), _input[path_starts : path_ends])
    for path in paths:
        print pattern_matcher.find_match(path)