import gzip
import sqlite3

import numpy as np
import wordfreq
from tqdm import tqdm

# VectorSpaceWrapper ships with conceptnet5. The remaining helpers used below
# (get_external_data_filename, StandaloneSMEModel,
# wikipedia_connected_conceptnet_nodes, LEMMATIZER, SCHEMA) come from
# elsewhere in this project and are not part of this excerpt.
from conceptnet5.vectors.query import VectorSpaceWrapper


class AttributeClassifier:
    # The class statement was missing from this listing; the name is assumed.

    def __init__(self, ablate=()):
        # Load the Numberbatch term vectors once; the other resources are
        # opened lazily the first time their feature method runs.
        self.wrap = VectorSpaceWrapper(
            get_external_data_filename('numberbatch-20180108-biased.h5'),
            use_db=False)
        self.cache = {}
        self.wp_db = None
        self.sme = None
        self.queries = None
        self.phrases = None
        self.svm = None
        self.ablate = ablate

        self.feature_methods = [
            self.direct_relatedness_features, self.sme_features,
            self.wikipedia_relatedness_features,
            self.wordnet_relatedness_features, self.phrase_hit_features
        ]

        # One name per feature value: 1 vector-relatedness feature, 11 SME
        # relation features, and one feature each for Wikipedia, WordNet,
        # and Google Ngrams (15 in total).
        self.feature_names = [
            'ConceptNet vector relatedness',
            'SME: RelatedTo',
            'SME: (x IsA a)',
            'SME: (x HasA a)',
            'SME: (x PartOf a)',
            'SME: (x CapableOf a)',
            'SME: (x UsedFor a)',
            'SME: (x HasContext a)',
            'SME: (x HasProperty a)',
            'SME: (x AtLocation a)',
            'SME: (a PartOf x)',
            'SME: (a AtLocation x)',
            'Wikipedia lead sections',
            'WordNet relatedness',
            'Google Ngrams',
        ]
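
    def extract_features(self, example):
        # Hedged sketch: this driver is not part of the excerpt. Each feature
        # method returns a NumPy vector (or scalar); concatenating them in
        # order yields one value per entry in feature_names. Zeroing the
        # ablated methods, rather than skipping them, is one plausible way to
        # honor the ablate parameter while keeping the vector length fixed.
        parts = []
        for method in self.feature_methods:
            values = np.atleast_1d(method(example))
            if method.__name__ in self.ablate:
                values = np.zeros_like(values)
            parts.append(values)
        return np.concatenate(parts)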

    def phrase_hit_features(self, example):
        # Compare Google Books 2-gram evidence: how strongly does each word
        # form a phrase with the attribute? Positive values favor word1.
        if self.phrases is None:
            self.phrases = sqlite3.connect(
                get_external_data_filename('phrases.db'))
        weight_pair1 = phrase_weight(self.phrases, example.lemma1(),
                                     example.lemma_att())
        weight_pair2 = phrase_weight(self.phrases, example.lemma2(),
                                     example.lemma_att())
        return weight_pair1 - weight_pair2

    def wikipedia_relatedness_features(self, example):
        # Expand each word to the ConceptNet nodes it is connected to through
        # Wikipedia lead sections, then score the best match to the attribute.
        if self.wp_db is None:
            self.wp_db = sqlite3.connect(
                get_external_data_filename('wikipedia-summary.db'))
        connected1 = [example.node1()] + wikipedia_connected_conceptnet_nodes(
            self.wp_db, example.word1)
        connected2 = [example.node2()] + wikipedia_connected_conceptnet_nodes(
            self.wp_db, example.word2)
        return self.max_relatedness_features(connected1, connected2,
                                             example.att_node())
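
    def max_relatedness_features(self, nodes1, nodes2, att_node):
        # Hedged sketch: this method is called above but missing from the
        # excerpt. Plausibly it scores each side by the best Numberbatch
        # similarity between any of its candidate nodes and the attribute
        # node, returning the difference as a single feature.
        best1 = max(self.wrap.get_similarity(node, att_node)
                    for node in nodes1)
        best2 = max(self.wrap.get_similarity(node, att_node)
                    for node in nodes2)
        return np.array([best1 - best2])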

    def sme_features(self, example):
        # The SME relation model scores each of the 11 'SME:' relations in
        # feature_names for a (node, attribute) pair; the features are the
        # differences between word1's and word2's relation scores.
        if self.sme is None:
            self.sme = StandaloneSMEModel(
                get_external_data_filename('sme-20180129'))
        node1 = example.node1()
        node2 = example.node2()
        att = example.att_node()
        if node1 in self.sme and node2 in self.sme and att in self.sme:
            return self.sme.predict_discriminative_relations(
                node1, att) - self.sme.predict_discriminative_relations(
                    node2, att)
        else:
            return np.zeros(self.sme.num_rels())
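
    def direct_relatedness_features(self, example):
        # Hedged sketch: not part of the excerpt. It corresponds to the
        # 'ConceptNet vector relatedness' feature name, so presumably it
        # compares each word's Numberbatch similarity to the attribute.
        sim1 = self.wrap.get_similarity(example.node1(), example.att_node())
        sim2 = self.wrap.get_similarity(example.node2(), example.att_node())
        return np.array([sim1 - sim2])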


def build_wp_database(db, filename):
    # Rebuild the (page, word, lemma) table from a TSV file of Wikipedia
    # article titles and their one-paragraph summaries.
    db.execute("DROP TABLE IF EXISTS words")
    with db as _transaction:
        for statement in SCHEMA:
            db.execute(statement)

    with db as _transaction:
        # Count the lines first so tqdm can show a progress bar, closing
        # each file handle when done.
        with open(filename) as infile:
            num_lines = sum(1 for line in infile)
        with open(filename) as infile:
            for line in tqdm(infile, total=num_lines):
                title, text = line.split('\t', 1)
                words = wordfreq.tokenize(text.rstrip(), 'en')
                for word in words:
                    add_entry(db, title, word)


def add_entry(db, title, word):
    lemma = LEMMATIZER.lookup('en', word)[0]
    # Normalize the page title: lowercase it and drop any parenthesized
    # disambiguation suffix, e.g. "Mercury (element)" -> "mercury".
    title = title.lower().split(" (")[0]
    # Skip very common words (Zipf frequency 6 is about once per thousand
    # words), which would connect nearly every page to every other page.
    if wordfreq.zipf_frequency(lemma, 'en') < 6 and wordfreq.zipf_frequency(
            word, 'en') < 6:
        db.execute(
            "INSERT OR IGNORE INTO words (page, word, lemma) VALUES (?, ?, ?)",
            (title, word, lemma))


if __name__ == '__main__':
    filename = get_external_data_filename('en-wp-1word-summaries.txt')
    db = sqlite3.connect(get_external_data_filename('wikipedia-summary.db'))
    build_wp_database(db, filename)
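

# Hedged sketch of the SCHEMA statements consumed by build_wp_database; the
# real definition lives elsewhere in the project. The column names follow the
# INSERT in add_entry, and a unique index is what lets INSERT OR IGNORE
# deduplicate (page, word) pairs.
#
# SCHEMA = [
#     "CREATE TABLE words (page TEXT, word TEXT, lemma TEXT)",
#     "CREATE UNIQUE INDEX words_page_word ON words (page, word)",
#     "CREATE INDEX words_lemma ON words (lemma)",
# ]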


def build_phrases_database(db, filename_1grams, filename_2grams):
    # Hedged reconstruction: the start of this function was cut off in the
    # original listing. From the surviving fragment and the __main__ block
    # below, it creates the tables from SCHEMA, loads the gzipped 1-gram
    # file through add_word, and loads the 2-gram file as follows.
    with db as _transaction:
        for statement in SCHEMA:
            db.execute(statement)
        for line in gzip.open(filename_1grams, 'rt'):
            word, count = line.rstrip().split('\t')
            add_word(db, word, int(count))
        for line in gzip.open(filename_2grams, 'rt'):
            phrase, count = line.rstrip().split('\t')
            if ' ' in phrase:
                first_word, second_word = phrase.split(' ')
                count = int(count)
                add_phrase(db, phrase, first_word, second_word, count)


def add_word(db, word, count):
    # LEMMATIZER.lookup returns a (root, form) pair; keep only the root.
    lemma = LEMMATIZER.lookup('en', word)[0]

    db.execute(
        "INSERT OR IGNORE INTO words (word, count, lemma) "
        "VALUES (?, ?, ?)", (word, count, lemma))


def add_phrase(db, phrase, first_word, second_word, count):
    # Store the 2-gram under both its surface words and their lemmas, so
    # lookups can match either form.
    first_lemma = LEMMATIZER.lookup('en', first_word)[0]
    second_lemma = LEMMATIZER.lookup('en', second_word)[0]

    db.execute(
        "INSERT OR IGNORE INTO phrases (phrase, first_word, second_word, count, "
        "first_lemma, second_lemma) VALUES (?, ?, ?, ?, ?, ?)",
        (phrase, first_word, second_word, count, first_lemma, second_lemma))
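

# Hedged sketch of the SCHEMA used by build_phrases_database; the real
# definition is not in this excerpt. Column names follow the INSERTs above,
# with unique indexes so that INSERT OR IGNORE deduplicates entries.
#
# SCHEMA = [
#     "CREATE TABLE words (word TEXT, count INTEGER, lemma TEXT)",
#     "CREATE UNIQUE INDEX words_word ON words (word)",
#     "CREATE TABLE phrases (phrase TEXT, first_word TEXT, second_word TEXT, "
#     "count INTEGER, first_lemma TEXT, second_lemma TEXT)",
#     "CREATE UNIQUE INDEX phrases_phrase ON phrases (phrase)",
# ]


def phrase_weight(db, lemma, att_lemma):
    # Hedged sketch of the helper used by phrase_hit_features above; the
    # real implementation is defined elsewhere in the project. This version
    # scores a (word, attribute) lemma pair by the log of its total Google
    # Books 2-gram count, counting the two lemmas in either order.
    total = 0
    for first, second in [(lemma, att_lemma), (att_lemma, lemma)]:
        row = db.execute(
            "SELECT SUM(count) FROM phrases "
            "WHERE first_lemma=? AND second_lemma=?",
            (first, second)).fetchone()
        total += row[0] or 0
    return np.log(total + 1)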


if __name__ == '__main__':
    filename_1grams = get_external_data_filename('google-books-1grams.txt.gz')
    filename_2grams = get_external_data_filename(
        'google-books-2grams-more.txt.gz')
    db = sqlite3.connect(get_external_data_filename('phrases.db'))
    build_phrases_database(db, filename_1grams, filename_2grams)
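

# Putting it together (hedged: constructing an example object is outside this
# excerpt, and extract_features is the sketch defined above):
#
#     classifier = AttributeClassifier()
#     features = classifier.extract_features(example)
#     print(dict(zip(classifier.feature_names, features)))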