# Module-level requirements: codecs, word2vec (danielfrg), plus the
# project-local load_tair_relations() helper.
def __init__(self, corpus, ptype, test=False, modelname="crfre_classifier"):
    super(CrfSuiteRE, self).__init__()
    self.data = []
    self.labels = []
    self.scores = []
    self.predicted = []
    self.entities = []
    self.pairtype = ptype
    self.modelname = ptype + "_" + modelname
    self.gold_relations = set()
    self.tair_pairs = load_tair_relations()
    self.vecmodel = word2vec.load("corpora/Thaliana/documents-processed" + '.bin')
    with codecs.open("seedev_relation.txt", 'r', 'utf-8') as relfile:
        for r in relfile:
            self.gold_relations.add(r.strip())
    self.clusters = word2vec.load_clusters(
        "corpora/Thaliana/documents-processed-clusters.txt")
    self.generate_data(corpus, self.modelname, ptype, test)
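# Usage sketch (illustrative; 'corpus' stands for the project's corpus object
# and the pair type string is a placeholder, neither is defined in this snippet):
# classifier = CrfSuiteRE(corpus, "Regulates_Process", test=False)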
import word2vec

def create_voabulary_label():
    model = word2vec.load('zhihu-word2vec-multilabel.bin-100', kind='bin')  # zhihu-word2vec.bin-100
    count = 0
    vocabulary_word2index_label = {}
    vocabulary_index2word_label = {}
    label_unique = {}
    for i, vocab in enumerate(model.vocab):
        if '__label__' in vocab:  # e.g. '__label__-2051131023989903826'
            label = vocab[vocab.index('__label__') + len('__label__'):]
            if label_unique.get(label, None) is None:  # label not seen before: keep it in the dicts
                vocabulary_word2index_label[label] = count
                vocabulary_index2word_label[count] = label
                count = count + 1
                label_unique[label] = label
    return vocabulary_word2index_label, vocabulary_index2word_label
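# Usage sketch (assumes the 'zhihu-word2vec-multilabel.bin-100' model above is
# on disk): the two dicts map fastText-style '__label__' strings to dense
# indices and back.
# vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
# print("number of distinct labels:", len(vocabulary_word2index_label))
# print("label at index 0:", vocabulary_index2word_label[0])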
import numpy as np
import tensorflow as tf  # TF 1.x API (tf.assign)
import word2vec

def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textCNN, word2vec_model_path):
    # FLAGS is assumed to be defined elsewhere (tf.app.flags) with an embed_size attribute.
    print("using pre-trained word embedding. started. word2vec_model_path:", word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [[]] * vocab_size  # one row per vocabulary entry
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # index 0 is the 'PAD' token: all zeros
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for random initialization
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop over every word in the vocabulary
        word = vocabulary_index2word[i]
        try:
            embedding = word2vec_dict[word]  # look up the pre-trained vector (an array)
        except KeyError:
            embedding = None
        if embedding is not None:  # the word has a pre-trained embedding: use it
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:  # out-of-vocabulary word: initialize it randomly
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist += 1
    word_embedding_final = np.array(word_embedding_2dlist)  # convert to a 2-D array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)  # convert to a tensor
    t_assign_embedding = tf.assign(textCNN.Embedding, word_embedding)  # overwrite the model's embedding variable
    sess.run(t_assign_embedding)
    print("words with a pre-trained embedding:", count_exist, "; words without:", count_not_exist)
    print("using pre-trained word embedding. ended.")
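# Minimal wiring sketch (illustrative assumptions: 'TextCNNStub' and the call
# below are not from the original code). The function above only relies on the
# model exposing an 'Embedding' tf.Variable of shape [vocab_size, embed_size].
class TextCNNStub(object):
    def __init__(self, vocab_size, embed_size):
        self.Embedding = tf.Variable(
            tf.random_uniform([vocab_size, embed_size], -1.0, 1.0), name="Embedding")

# with tf.Session() as sess:
#     textCNN = TextCNNStub(vocab_size, FLAGS.embed_size)
#     sess.run(tf.global_variables_initializer())
#     assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size,
#                                      textCNN, "zhihu-word2vec.bin-100")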
import word2vec

def match_relations(reltype, docfile_root="corpora/Thaliana/documents-processed"):
    model = word2vec.load(docfile_root + '.bin')
    with open("seedev_relation.txt") as f:
        gold_relations = f.readlines()
    unmatched1, unmatched2 = 0, 0
    for r in gold_relations:
        values = r.split("\t")
        if values[1] == reltype:
            entity1 = values[0].split("#")[1]
            entity2 = values[2].split("#")[1]
            if entity1 in model:
                indexes, metrics = model.cosine(entity1, n=1)
            else:
                entity1 = entity1.split(" ")[0]  # fall back to the entity's first token
                if entity1 in model:
                    indexes, metrics = model.cosine(entity1, n=1)
                else:
                    unmatched1 += 1
            if entity2 in model:
                indexes, metrics = model.cosine(entity2, n=5)
            else:
                entity2 = entity2.split(" ")[0]
                if entity2 in model:
                    indexes, metrics = model.cosine(entity2, n=5)
                else:
                    unmatched2 += 1
    print(unmatched1, unmatched2)
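# Usage sketch (the relation type is a placeholder; real values come from the
# second tab-separated column of seedev_relation.txt):
# match_relations("Regulates_Process")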
import nltk
import pandas as pd
from gensim.models import Word2Vec  # the .wv.vocab accesses below imply gensim's Word2Vec (< 4.0 API)

miniBatch_size = 64
f = pd.read_csv("testEchantillonnage1.csv").values[:, 1:]
print("\n*** The dataset contains", len(f[:, 0]), "rows ***")
token = []
for i in range(0, len(f[:, 3])):
    st = str(f[i, 3]).lower()  # column 3 holds the raw text
    token.append(nltk.word_tokenize(st))  # one token list per row
print("*** finished tokenizing ***")
print("*** starting Word2Vec with", len(token), "elements ***")
if LOAD_VOCAB:  # LOAD_VOCAB is assumed to be a flag defined earlier
    print("** Word2Vec loading started.")
    w2v = Word2Vec.load('weights/FR.vocab')
    print("** Word2Vec loading ended.")
else:
    print("** Word2Vec training and saving started.")
    w2v = Word2Vec(token)  # train on the tokenized sentences
    w2v.save('weights/FR.vocab')
    print("** Word2Vec saving ended.")
print("\n*** Word2Vec processed ***\n")
keys = list(w2v.wv.vocab.keys())
vocabIndex = {}
for i in range(len(keys)):
    w = keys[i]
    vocabIndex[w] = w2v.wv.vocab[w].index
    if i % 500 == 0:
        print(i, "/", len(keys))  # progress report (loop body truncated in the source; a progress print is assumed)
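# Follow-on sketch (hypothetical helper, not in the original): with vocabIndex
# built, each tokenized sentence can be mapped to integer ids for batching,
# e.g. in groups of miniBatch_size. Unknown words are skipped here.
def sentence_to_ids(sentence_tokens, vocab_index):
    return [vocab_index[w] for w in sentence_tokens if w in vocab_index]

# example: ids = sentence_to_ids(token[0], vocabIndex)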