def build_graph(data, mode, graph=None, rel=None, pos_list=None, modelpath=None):
    '''
    Build a word-relation graph from the words in ``data``.

    Every word is lower-cased and looked up with ``wn.morphy``; words for
    which that lookup returns nothing are skipped.  The remaining words are
    lemmatized, added to the returned word set and linked to each related
    word produced by ``get_all_synonyms_antonyms`` through
    ``graph_for_calculate_path`` or ``graph_for_calculate_score``,
    depending on ``mode``.

    4 usages:
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'])
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'])

    Args:
        data: iterable of words; byte strings are decoded as UTF-8.
        mode: 'calculate_path' or 'calculate_score'.  Any other value adds
            nothing to the graph (the word set is still filled).
        graph: graph to extend.  A falsy value (None, or an empty mapping)
            is replaced by a new ``collections.defaultdict(dict)``.
        rel: forwarded unchanged to ``get_all_synonyms_antonyms``.
        pos_list: NOTE(review): kept for interface compatibility but never
            read -- the POS list is always recomputed per word with
            ``pos_filter``.  Confirm whether callers expect it to be used.
        modelpath: optional path given to ``txt_to_wordvecmodel``.  When the
            loaded model is truthy, a pair is only added if both words are
            keys of the model, and the model is passed on to the graph
            helper as an extra trailing argument.

    Returns:
        Tuple ``(graph, word_set)`` where ``word_set`` holds the lemmatized
        words that passed the ``wn.morphy`` check.
    '''
    word_set = set()
    if not graph:
        graph = collections.defaultdict(dict)
    wnl = nltk.WordNetLemmatizer()

    if modelpath:
        model = txt_to_wordvecmodel(modelpath)
        model_wordset = set(model.keys())
    else:
        model = None

    for i, word in enumerate(data):  # original note: 225534 words in data
        if i % 10000 == 0:
            # Progress marker; the parenthesised form prints the same text
            # whether print is a statement or a function.
            print(i)

        # ``bytes`` is ``str`` on Python 2, so byte strings (and their
        # subclasses) are decoded while unicode input passes through.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = word.lower()

        # original note: this check "bring lots of noises"
        if not wn.morphy(word):
            continue

        word = wnl.lemmatize(word)
        word_set.add(word)  # original note: 35304 remain

        # Separate local name so the ``pos_list`` argument is not clobbered.
        word_pos_list = pos_filter(word, LJ40K, ['s', 'a'])
        if word_pos_list is None:
            word_pos_list = [None]

        for pos in word_pos_list:
            for related in get_all_synonyms_antonyms(word, rel, pos):
                related_word = related[0].lower()
                related_word_type = related[2]

                if model:
                    # Both ends of the edge must be keys of the model.
                    if related_word not in model_wordset or word not in model_wordset:
                        continue
                    extra_args = (model,)
                else:
                    extra_args = ()

                if mode == 'calculate_path':
                    graph_for_calculate_path(
                        word, related_word, graph, *extra_args)
                elif mode == 'calculate_score':
                    graph_for_calculate_score(
                        word, related_word, related_word_type, graph, *extra_args)

    return graph, word_set
def build_graph(data, mode, graph=None, rel=None, pos_list=None, modelpath=None):
    '''
    Build a word-relation graph from the words in ``data``.

    Every word is lower-cased and looked up with ``wn.morphy``; words for
    which that lookup returns nothing are skipped.  The remaining words are
    lemmatized, added to the returned word set and linked to each related
    word produced by ``get_all_synonyms_antonyms`` through
    ``graph_for_calculate_path`` or ``graph_for_calculate_score``,
    depending on ``mode``.

    4 usages:
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'])
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'])

    Args:
        data: iterable of words; byte strings are decoded as UTF-8.
        mode: 'calculate_path' or 'calculate_score'.  Any other value adds
            nothing to the graph (the word set is still filled).
        graph: graph to extend.  A falsy value (None, or an empty mapping)
            is replaced by a new ``collections.defaultdict(dict)``.
        rel: forwarded unchanged to ``get_all_synonyms_antonyms``.
        pos_list: NOTE(review): kept for interface compatibility but never
            read -- the POS list is always recomputed per word with
            ``pos_filter``.  Confirm whether callers expect it to be used.
        modelpath: optional path given to ``txt_to_wordvecmodel``.  When the
            loaded model is truthy, a pair is only added if both words are
            keys of the model, and the model is passed on to the graph
            helper as an extra trailing argument.

    Returns:
        Tuple ``(graph, word_set)`` where ``word_set`` holds the lemmatized
        words that passed the ``wn.morphy`` check.
    '''
    word_set = set()
    if not graph:
        graph = collections.defaultdict(dict)
    wnl = nltk.WordNetLemmatizer()

    if modelpath:
        model = txt_to_wordvecmodel(modelpath)
        model_wordset = set(model.keys())
    else:
        model = None

    for i, word in enumerate(data):  # original note: 225534 words in data
        if i % 10000 == 0:
            # Progress marker; the parenthesised form prints the same text
            # whether print is a statement or a function.
            print(i)

        # ``bytes`` is ``str`` on Python 2, so byte strings (and their
        # subclasses) are decoded while unicode input passes through.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = word.lower()

        # original note: this check "bring lots of noises"
        if not wn.morphy(word):
            continue

        word = wnl.lemmatize(word)
        word_set.add(word)  # original note: 35304 remain

        # Separate local name so the ``pos_list`` argument is not clobbered.
        word_pos_list = pos_filter(word, LJ40K, ['s', 'a'])
        if word_pos_list is None:
            word_pos_list = [None]

        for pos in word_pos_list:
            for related in get_all_synonyms_antonyms(word, rel, pos):
                related_word = related[0].lower()
                related_word_type = related[2]

                if model:
                    # Both ends of the edge must be keys of the model.
                    if related_word not in model_wordset or word not in model_wordset:
                        continue
                    extra_args = (model,)
                else:
                    extra_args = ()

                if mode == 'calculate_path':
                    graph_for_calculate_path(
                        word, related_word, graph, *extra_args)
                elif mode == 'calculate_score':
                    graph_for_calculate_score(
                        word, related_word, related_word_type, graph, *extra_args)

    return graph, word_set
# -*- coding: utf8 -*-
import os
import sys
import cPickle as pickle

# The project package is imported relative to the parent directory, so the
# path tweak has to run before the ``models`` import below.
sys.path.append("../")

from models.modeltools import txt_to_wordvecmodel, wordvecmodel_to_txt, wordvecmodel_filter

# Filters two word-vector models with a word set, then with the
# intersection of the keys they still have in common.
#
# input: model1, model2, filter_set, filepath1, filepath2


def _load_pickle(path):
    # Read one pickled object and close the file handle right away.
    # NOTE: unpickling executes code stored in the file -- only use this on
    # trusted, locally produced files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


if __name__ == '__main__':
    model1 = txt_to_wordvecmodel(filepath='../data/model/glove.42B.300d.txt')
    model2 = _load_pickle('../data/model/model_wordvec_semantic_similarity_lemma_35304_42b_Feeling_Wheel.pkl')
    filter_set = _load_pickle('../data/wordset/wordsetlemma_basickeyword_LJ40K_FeelingWheel_1122.pkl')

    # First pass: filter both models with the loaded word set.
    model1 = wordvecmodel_filter(model1, filter_set)
    model2 = wordvecmodel_filter(model2, filter_set)

    # Second pass: filter again with the keys both models still share.
    filter_set = set(model1.keys()) & set(model2.keys())
    model1 = wordvecmodel_filter(model1, filter_set)
    model2 = wordvecmodel_filter(model2, filter_set)

    # '%s %s' reproduces the separator the two-item print statement emitted;
    # a single parenthesised argument prints the same text either way.
    print('%s %s' % ('length of the filtered model1 is ', len(model1)))
    print('%s %s' % ('length of the filtered model2 is ', len(model2)))

    # NOTE(review): neither path is used in the part of the script visible
    # here -- presumably they are output targets for wordvecmodel_to_txt;
    # confirm against the rest of the file.
    filepath1 = '../textSNE/testdata/w2v_worddict_42B_1122.txt'
    filepath2 = '../textSNE/testdata/model_wordvec_semantic_similarity_lemma_35304_42b_Feeling_Wheel_1122.txt'