def align_word_vectors(w2v_path1, w2v_path2, dict_map_path, output_path,
                       source_language='zh', target_language='en'):
    '''Use dict_map to align w2v1 into the space of w2v2.'''
    languages = [source_language, target_language]
    print languages
    synonym = load_synonym(dict_map_path, languages=languages)
    # quit()
    vec1 = load_w2v(w2v_path1, np.float32)
    vec2 = load_w2v(w2v_path2, np.float32)
    # Build paired source/target matrices from the bilingual dictionary.
    source_matrix, target_matrix, biwords = \
        build_source_matrix(vec1, vec2, synonym, 1, source_language, target_language)
    print source_matrix.shape, target_matrix.shape
    # Similarity of dictionary pairs before alignment.
    test_similarity(vec1, vec2, biwords)
    transform = learn_transformation(source_matrix, target_matrix)
    vec1 = apply_transform(vec1, transform)
    # Similarity of dictionary pairs after alignment.
    test_similarity(vec1, vec2, biwords)
    save_w2v(vec1, output_path)
    # Merge aligned source vectors into the target space for words
    # the target embedding does not cover.
    for w in vec1:
        if w not in vec2:
            vec2[w] = vec1[w]
    output_path2 = output_path[:-4] + 'merged.txt'
    save_w2v(vec2, output_path2)
    test(output_path2)
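# Example usage (a minimal sketch, not part of the original pipeline; every path
# below is a hypothetical placeholder):
#
#   align_word_vectors('zh_vectors.txt', 'en_vectors.txt',
#                      'zh_en_dict.txt', 'zh_aligned.txt',
#                      source_language='zh', target_language='en')
#
# This learns a linear transform from the zh vector space into the en space using
# the bilingual dictionary, writes the transformed zh vectors to output_path, and
# also writes a merged file (output_path with 'merged.txt' appended to its stem)
# that adds the aligned zh vectors for words missing from the en embedding.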
def __init__(self):
    print 'load original w2v...'
    w2v = load_w2v(config.originalw2v_path)
    id2word = {id_: key for id_, key in enumerate(w2v.keys())}
    word2id = dict_reverse(id2word)
    vectors = w2v.values()
    old_len = len(word2id)
    print 'old number of words = ', old_len
    print 'load original w2v finished'

    print 'load synonym words...'
    synonym_words = get_words(config.synonym_path, w2v.keys())
    print 'load synonym words finished'
    print 'synonym words in Kb=', len(set(synonym_words) & set(word2id.keys()))

    print 'update w2v ...'
    # synonym_words = list(synonym_words)[:50000]
    # Assign new ids to synonym words that are not yet in the vocabulary.
    for word in set(synonym_words) - set(word2id.keys()):
        id_ = len(id2word)
        id2word[id_] = word
        word2id[word] = id_
    # vectors.append(np.zeros((config.vec_len), dtype=np.float64))
    # New words start from small random vectors; existing words keep their
    # pre-trained vectors.
    append_vectors = np.random.uniform(-0.1, 0.1, (len(word2id) - old_len, config.vec_len))
    vectors = np.concatenate([np.array(vectors, dtype=np.float16),
                              np.array(append_vectors, dtype=np.float16)], axis=0)
    # alpha marks which rows come from the original embedding (1) and which
    # were newly added (0).
    alpha = old_len * [[1], ] + (len(word2id) - old_len) * [[0], ]
    self.word2id = word2id
    self.id2word = id2word
    print 'new number of words = ', len(word2id)

    print 'build graph...'
    with tf.device('/cpu:0'):
        self.build_graph(vectors, alpha)
    print 'build graph finished'
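# Note on alpha (an illustrative sketch only; how build_graph actually uses the
# mask may differ): alpha has one row per vocabulary entry, 1 for words coming
# from the pre-trained embedding and 0 for newly added synonym words. One common
# use of such a mask is to anchor the old rows to their original vectors while
# the new rows are learned freely, e.g.
#
#   alpha_t = tf.constant(alpha, dtype=tf.float32)     # shape (N, 1)
#   init_t  = tf.constant(vectors, dtype=tf.float32)   # shape (N, D)
#   anchor_loss = tf.reduce_sum(
#       alpha_t * tf.square(emb_var - init_t))         # emb_var: hypothetical trainable (N, D) matrix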
def filter(vec_path, all_words, output_vec_path):
    w2v = load_w2v(vec_path)
    new_w2v = {}
    for w in all_words:
        if w not in w2v:
            print w, 'not in w2v'
            continue
        v = w2v[w]
        new_w2v[w] = v
    save_w2v(new_w2v, output_vec_path)
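# Example usage (a sketch; the file names are hypothetical and all_words can be
# any iterable of tokens to keep):
#
#   all_words = ['apple', 'banana', 'orange']
#   filter('full_vectors.txt', all_words, 'filtered_vectors.txt')
#
# Words missing from the loaded embedding are reported and skipped, so the output
# file only contains vectors for words that were actually found.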
def semeval17_main_test(vec_path, test_path=None):
    test_path = test_path or df_test_path
    w2v = load_w2v(vec_path)
    result = {}
    print 'here'
    print test_path
    # Score every test file found under test_path with the given embedding.
    for root, dirs, files in os.walk(test_path):
        for file_ in files:
            s = os.path.join(root, file_)
            print s
            result[file_] = measure_score(w2v, s)
    for file_ in result:
        print file_, result[file_]
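# Example usage (a sketch; the paths are hypothetical; when test_path is omitted
# the module-level default df_test_path is used):
#
#   semeval17_main_test('merged_vectors.txt', test_path='semeval17/test/')
#
# Every file under test_path is scored with measure_score against the loaded
# embedding, and a per-file score is printed at the end.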
def main():
    def check_dir(dir_path, ask_for_del):
        if os.path.exists(dir_path):
            y = ''
            if ask_for_del:
                y = raw_input('new empty {}? y/n:'.format(dir_path))
            if y.strip() == 'y' or not ask_for_del:
                rmtree(dir_path)
            else:
                print('use a clean summary_dir')
                quit()
        makedirs(dir_path)
        # dump the current config next to the run artifacts
        oo = open(os.path.join(dir_path, 'config.txt'), 'w')
        d = {}
        for name in dir(config):
            if '__' in name: continue
            d[name] = getattr(config, name)
        oo.write(json.dumps(d, ensure_ascii=False))

    check_dir(config.summary_dir, config.ask_for_del)
    check_dir(config.model_dir, config.ask_for_del)

    # assume train and dev must use the same emb;
    # train and test may use different embs.
    # For more info, refer to the README.
    if config.limited_words:
        limited_words = get_words([config.train_data_path, config.dev_data_path] + config.test_data_paths)
        limited_words = limited_words.keys()
    else:
        limited_words = []
    tags, _ = get_vocab(config.tags_path)

    source_init_idf = target_init_idf = None
    if config.share_emb:
        assert config.target_w2v_path == config.source_w2v_path
        assert config.target_idf_path == config.source_idf_path
        w2v = load_w2v(config.source_w2v_path, max_vocab_size=config.max_vocab_size,
                       limited_words=limited_words, norm=True)
        source_init_embedding = target_init_embedding = np.array(w2v.values(), dtype=np.float32)
        if config.text_repr == 'add+idf':
            w2idf = load_w2v(config.source_idf_path)

            def drop(x):
                if x >= 3: return x
                else: return 0

            source_init_idf = target_init_idf = \
                np.array([drop(float(w2idf.get(w, 7.0))) for w in w2v], dtype=np.float32)
        words = {k: w for k, w in enumerate(w2v.keys())}
        source_line_processing = target_line_processing = \
            sequence_label_line_processing(words, tags, max_len=config.sen_len,
                                           size=3, return_length=True)
    else:
        source_w2v = load_w2v(config.source_w2v_path, max_vocab_size=config.max_vocab_size,
                              limited_words=limited_words, norm=True)
        target_w2v = load_w2v(config.target_w2v_path, max_vocab_size=config.max_vocab_size,
                              limited_words=limited_words, norm=True)
        source_init_embedding = np.array(source_w2v.values(), dtype=np.float32)
        target_init_embedding = np.array(target_w2v.values(), dtype=np.float32)
        if config.text_repr == 'add+idf':
            source_w2idf = load_w2v(config.source_idf_path)
            target_w2idf = load_w2v(config.target_idf_path)
            source_init_idf = np.array([float(source_w2idf.get(w, 10.0)) for w in source_w2v], dtype=np.float32)
            target_init_idf = np.array([float(target_w2idf.get(w, 10.0)) for w in target_w2v], dtype=np.float32)
        source_words = {k: w for k, w in enumerate(source_w2v.keys())}
        target_words = {k: w for k, w in enumerate(target_w2v.keys())}
        source_line_processing = \
            sequence_label_line_processing(source_words, tags, max_len=config.sen_len,
                                           size=3, return_length=True)
        target_line_processing = \
            sequence_label_line_processing(target_words, tags, max_len=config.sen_len,
                                           size=3, return_length=True)

    # datasets
    train_data = LineBasedDataset(config.train_data_path, source_line_processing,
                                  batch_size=config.batch_size)
    dev_data = LineBasedDataset(config.dev_data_path, source_line_processing,
                                batch_size=config.batch_size)
    test_datas = [LineBasedDataset(path, target_line_processing, batch_size=config.batch_size)
                  for path in config.test_data_paths]

    # show the shapes of the first few batches
    for k, inputs in enumerate(train_data):
        print '-' * 20, 'batch ', k, '-' * 20
        for inp in inputs:
            print inp.shape
        if k >= 3: break

    # compute class weights for class imbalance
    class_nums = get_label_nums(train_data, tags)
    class_weights = class_nums / np.sum(class_nums) * len(class_nums)
    print 'TRAIN CLASSES=\t', tags.values()
    print 'TRAIN CLASS_NUMS=\t', class_nums
    print 'TRAIN CLASS_WEIGHTS=\t', class_weights
    with tf.Session(config=config.session_conf) as sess:
        # use tf.name_scope to manage variable names
        source_model = TextClassifier(
            num_classes=len(tags),
            init_embedding=source_init_embedding,
            init_idf=source_init_idf,
            class_weights=class_weights,
            emb_name=config.source + '_emb',
            reuse=False,
            mode='train',
            name_scope=config.source)
        if config.share_emb:
            target_model = source_model
        else:
            target_model = TextClassifier(
                num_classes=len(tags),
                init_embedding=target_init_embedding,
                init_idf=target_init_idf,
                class_weights=class_weights,
                emb_name=config.target + '_emb',
                reuse=True,
                mode='eval',
                name_scope=config.target)

        # summary writers for the different branches/classes
        summary_writers = {
            sub_path: tf.summary.FileWriter(os.path.join(config.summary_dir, sub_path), flush_secs=5)
            for sub_path in ['train', 'dev', 'test-1', 'test-2']}
        class_summary_writers = {
            sub_path: {class_name: tf.summary.FileWriter(os.path.join(config.summary_dir, sub_path, class_name),
                                                         flush_secs=5)
                       for class_name in tags.values()}
            for sub_path in ['train', 'dev', 'test-1', 'test-2']}

        # train the source model
        train(sess, source_model, target_model,
              train_data, dev_data, test_datas,
              tags=tags.values(),
              summary_writers=summary_writers,
              class_summary_writers=class_summary_writers)
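# Illustrative config stub (a sketch only; the attribute names are the ones
# main() actually reads, but every value below is a hypothetical placeholder --
# see the repo's config module for the real settings):
#
#   class config:
#       summary_dir = 'summary/'
#       model_dir = 'model/'
#       ask_for_del = True
#       limited_words = True
#       max_vocab_size = 100000
#       train_data_path = 'data/train.txt'
#       dev_data_path = 'data/dev.txt'
#       test_data_paths = ['data/test-1.txt', 'data/test-2.txt']
#       tags_path = 'data/tags.txt'
#       share_emb = True
#       source, target = 'en', 'zh'
#       source_w2v_path = target_w2v_path = 'emb/en_vectors.txt'
#       source_idf_path = target_idf_path = 'emb/en_idf.txt'
#       text_repr = 'add+idf'
#       sen_len = 50
#       batch_size = 64
#       session_conf = tf.ConfigProto(allow_soft_placement=True)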