def __init__(self, ablate=()):
    # VectorSpaceWrapper, StandaloneSMEModel, numpy (as np), and the helper
    # functions used by the methods below are assumed to be imported at
    # module level.
    self.wrap = VectorSpaceWrapper(
        get_external_data_filename('numberbatch-20180108-biased.h5'),
        use_db=False)
    self.cache = {}
    # Lazily-initialized resources: each is opened on first use by the
    # corresponding feature method.
    self.wp_db = None
    self.sme = None
    self.queries = None
    self.phrases = None
    self.svm = None
    # Feature sets to leave out, for ablation experiments.
    self.ablate = ablate
    self.feature_methods = [
        self.direct_relatedness_features,
        self.sme_features,
        self.wikipedia_relatedness_features,
        self.wordnet_relatedness_features,
        self.phrase_hit_features
    ]
    # Human-readable names for the individual features, in the order the
    # feature methods above produce them.
    self.feature_names = [
        'ConceptNet vector relatedness',
        'SME: RelatedTo',
        'SME: (x IsA a)',
        'SME: (x HasA a)',
        'SME: (x PartOf a)',
        'SME: (x CapableOf a)',
        'SME: (x UsedFor a)',
        'SME: (x HasContext a)',
        'SME: (x HasProperty a)',
        'SME: (x AtLocation a)',
        'SME: (a PartOf x)',
        'SME: (a AtLocation x)',
        'Wikipedia lead sections',
        'WordNet relatedness',
        'Google Ngrams',
    ]

def phrase_hit_features(self, example):
    # Open the Google Books phrase database on first use.
    if self.phrases is None:
        self.phrases = sqlite3.connect(
            get_external_data_filename('phrases.db'))
    weight_pair1 = phrase_weight(self.phrases, example.lemma1(), example.lemma_att())
    weight_pair2 = phrase_weight(self.phrases, example.lemma2(), example.lemma_att())
    # Positive when the attribute co-occurs more with word1 than with word2.
    return weight_pair1 - weight_pair2

def wikipedia_relatedness_features(self, example):
    # Open the Wikipedia lead-section database on first use.
    if self.wp_db is None:
        self.wp_db = sqlite3.connect(
            get_external_data_filename('wikipedia-summary.db'))
    # Each word's own node, plus the ConceptNet nodes found in the lead
    # section of its Wikipedia article.
    connected1 = [example.node1()] + wikipedia_connected_conceptnet_nodes(
        self.wp_db, example.word1)
    connected2 = [example.node2()] + wikipedia_connected_conceptnet_nodes(
        self.wp_db, example.word2)
    return self.max_relatedness_features(connected1, connected2, example.att_node())

def sme_features(self, example):
    # Load the SME (semantic matching energy) model on first use.
    if self.sme is None:
        self.sme = StandaloneSMEModel(
            get_external_data_filename('sme-20180129'))
    node1 = example.node1()
    node2 = example.node2()
    att = example.att_node()
    if node1 in self.sme and node2 in self.sme and att in self.sme:
        # Difference between the relation predictions for (word1, attribute)
        # and (word2, attribute), one value per relation.
        return self.sme.predict_discriminative_relations(
            node1, att) - self.sme.predict_discriminative_relations(node2, att)
    else:
        return np.zeros(self.sme.num_rels())
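
# Illustrative sketch, not in the original source: one plausible way the
# feature methods registered in __init__ could be combined into a single
# row for the classifier. The method name `extract_row` and the use of
# np.concatenate are assumptions for illustration.
def extract_row(self, example):
    # Each feature method returns a scalar or a 1-D array; coercing each
    # result to 1-D and concatenating yields one vector whose entries line
    # up with self.feature_names.
    return np.concatenate([
        np.atleast_1d(method(example)) for method in self.feature_methods
    ])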

import sqlite3

import wordfreq
from tqdm import tqdm

# Project-internal names assumed to be importable from the package's own
# modules: get_external_data_filename, SCHEMA (the CREATE TABLE statements
# for the words table), and LEMMATIZER (the ConceptNet lemmatizer).


def build_wp_database(db, filename):
    db.execute("DROP TABLE IF EXISTS words")
    with db as _transaction:
        for statement in SCHEMA:
            db.execute(statement)
    with db as _transaction:
        # Count the lines first so tqdm can show progress over the whole file.
        num_lines = sum(1 for line in open(filename))
        for line in tqdm(open(filename), total=num_lines):
            title, text = line.split('\t', 1)
            words = wordfreq.tokenize(text.rstrip(), 'en')
            for word in words:
                add_entry(db, title, word)


def add_entry(db, title, word):
    lemma = LEMMATIZER.lookup('en', word)[0]
    # Normalize the page title: lowercase it and strip any parenthetical
    # disambiguator, e.g. "Bank (geography)" -> "bank".
    title = title.lower().split(" (")[0]
    # Skip very common words (Zipf frequency >= 6), which say little about
    # the particular page they appear on.
    if wordfreq.zipf_frequency(lemma, 'en') < 6 and wordfreq.zipf_frequency(word, 'en') < 6:
        db.execute(
            "INSERT OR IGNORE INTO words (page, word, lemma) VALUES (?, ?, ?)",
            (title, word, lemma))


if __name__ == '__main__':
    filename = get_external_data_filename('en-wp-1word-summaries.txt')
    db = sqlite3.connect(get_external_data_filename('wikipedia-summary.db'))
    build_wp_database(db, filename)
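
# Illustrative sketch, not in the original source: given the schema implied
# by add_entry above (a words table keyed by page), a lookup in the style of
# wikipedia_connected_conceptnet_nodes could be approximated like this. The
# function name and return shape are assumptions.
def words_on_page(db, title):
    # Normalize the lookup key the same way add_entry normalizes titles.
    key = title.lower().split(" (")[0]
    cursor = db.execute("SELECT word, lemma FROM words WHERE page = ?", (key,))
    return cursor.fetchall()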

def __init__(self):
    self.wrap = VectorSpaceWrapper(
        get_external_data_filename('numberbatch-20180108-biased.h5'),
        use_db=False)
    self.cache = {}
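
# Illustrative sketch, not in the original source: a cached relatedness
# lookup using the wrapper and cache initialized above. It assumes
# VectorSpaceWrapper.get_vector returns a normalized vector for a term, so
# the dot product is cosine similarity; the real class may compute and
# cache relatedness differently.
def relatedness(self, term1, term2):
    key = (term1, term2)
    if key not in self.cache:
        vec1 = self.wrap.get_vector(term1)
        vec2 = self.wrap.get_vector(term2)
        self.cache[key] = vec1.dot(vec2)
    return self.cache[key]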

        # (Fragment: the enclosing build_phrases_database function and its
        # loop over the n-gram files are not shown in this excerpt.)
        if ' ' in phrase:
            first_word, second_word = phrase.split(' ')
            count = int(count)
            add_phrase(db, phrase, first_word, second_word, count)


def add_word(db, word, count):
    lemma = LEMMATIZER.lookup('en', word)[0]
    db.execute(
        "INSERT OR IGNORE INTO words (word, count, lemma) "
        "VALUES (?, ?, ?)",
        (word, count, lemma))


def add_phrase(db, phrase, first_word, second_word, count):
    first_lemma = LEMMATIZER.lookup('en', first_word)[0]
    second_lemma = LEMMATIZER.lookup('en', second_word)[0]
    db.execute(
        "INSERT OR IGNORE INTO phrases (phrase, first_word, second_word, count, "
        "first_lemma, second_lemma) VALUES (?, ?, ?, ?, ?, ?)",
        (phrase, first_word, second_word, count, first_lemma, second_lemma))


if __name__ == '__main__':
    filename_1grams = get_external_data_filename('google-books-1grams.txt.gz')
    filename_2grams = get_external_data_filename(
        'google-books-2grams-more.txt.gz')
    db = sqlite3.connect(get_external_data_filename('phrases.db'))
    build_phrases_database(db, filename_1grams, filename_2grams)
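
# Illustrative sketch, not in the original source: phrase_weight, used by
# phrase_hit_features above, presumably scores a word pair by its Google
# Books counts. Given the phrases schema written by add_phrase, a raw count
# lookup could take this shape; the project's actual weighting is not shown
# in this excerpt.
def phrase_count(db, first_lemma, second_lemma):
    cursor = db.execute(
        "SELECT SUM(count) FROM phrases "
        "WHERE first_lemma = ? AND second_lemma = ?",
        (first_lemma, second_lemma))
    row = cursor.fetchone()
    # SUM over zero rows yields NULL, so fall back to 0.
    return row[0] or 0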