Example #1
def align_word_vectors(w2v_path1, w2v_path2, dict_map_path, output_path,
        source_language='zh', target_language='en'):
    '''use dict_map to align w2v1 to space of w2v2
    '''
    languages = [source_language, target_language]
    print languages
    synonym = load_synonym(dict_map_path, languages=languages)
    vec1 = load_w2v(w2v_path1, np.float32)
    vec2 = load_w2v(w2v_path2, np.float32)
    source_matrix, target_matrix, biwords =\
        build_source_matrix(vec1, vec2, synonym, 1, source_language, target_language)
    print source_matrix.shape, target_matrix.shape
    test_similarity(vec1, vec2, biwords)
    transform = learn_transformation(source_matrix, target_matrix)
    vec1 = apply_transform(vec1, transform)
    test_similarity(vec1, vec2, biwords)

    save_w2v(vec1, output_path)
    for w in vec1:
        if w not in vec2:
            vec2[w]=vec1[w]
    output_path2 = output_path[:-4] + 'merged.txt'
    save_w2v(vec2, output_path2)
    test(output_path2)
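A minimal invocation sketch for Example #1; the file paths and language codes below are hypothetical placeholders, and the dictionary/embedding file formats are whatever the project's load_synonym and load_w2v readers actually accept:

# Hypothetical usage sketch: all paths are placeholders, not files shipped with the project.
align_word_vectors(
    w2v_path1='data/zh.vec.txt',        # source-language embeddings to be rotated
    w2v_path2='data/en.vec.txt',        # target-language embeddings defining the target space
    dict_map_path='data/zh-en.dict',    # bilingual dictionary used to build the seed word pairs
    output_path='data/zh_aligned.txt',  # aligned source vectors; a '...merged.txt' file is also written
    source_language='zh',
    target_language='en')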
Example #2
    def __init__(self):
        print 'load original w2v...'
        w2v = load_w2v(config.originalw2v_path)
        id2word={id_:key for id_,key in enumerate(w2v.keys())}
        word2id = dict_reverse(id2word)
        vectors=w2v.values()
        old_len=len(word2id)
        print 'old number of words  = ', old_len
        
        print 'load original w2v finished'

        print 'load synonym words...'
        synonym_words = get_words(config.synonym_path, w2v.keys())
        print 'load synonym words finished'
        print 'synonym words already in w2v =', len(set(word2id.keys()) & set(synonym_words))

        print 'update w2v ...'
        #synonym_words=list(synonym_words)[:50000]
        for word in set(synonym_words)-set(word2id.keys()):
            id_=len(id2word)
            id2word[id_] = word
            word2id[word] = id_
            #vectors.append(np.zeros((config.vec_len),dtype=np.float64))
        append_vectors = np.random.uniform(-0.1,0.1,(len(word2id)-old_len, config.vec_len))
        vectors=np.concatenate([np.array(vectors,dtype=np.float16),\
                np.array(append_vectors,dtype=np.float16)], axis=0)
        alpha=old_len*[[1],]+(len(word2id)-old_len)*[[0],]
        self.word2id=word2id
        self.id2word=id2word
        print 'new number of words  = ', len(word2id)

        print 'build graph...'
        with tf.device('/cpu:0'):
            self.build_graph(vectors, alpha)
        print 'build graph finished'
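Example #2 builds word2id with a dict_reverse helper that is not shown here; a minimal sketch of what it presumably does (the project's own implementation may differ) is:

def dict_reverse(d):
    # Invert a one-to-one mapping, e.g. {id: word} -> {word: id}.
    return {value: key for key, value in d.items()}

word2id = dict_reverse({0: 'cat', 1: 'dog'})   # {'cat': 0, 'dog': 1}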
Example #3
def filter(vec_path, all_words, output_vec_path):
    w2v=load_w2v(vec_path)
    new_w2v={}
    for w in all_words:
        if w not in w2v:
            print w, 'not in w2v'
            continue
        v = w2v[w]
        new_w2v[w]=v
    save_w2v(new_w2v, output_vec_path)
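A hedged usage sketch for Example #3: the vocabulary file and vector paths are placeholders. Note that the function name shadows Python's built-in filter, so importing it under an alias may be preferable in real code.

# Hypothetical usage: keep only the vectors whose words occur in a task vocabulary.
with open('vocab.txt') as f:                       # placeholder word list, one word per line
    task_words = [line.strip() for line in f if line.strip()]
filter('embeddings/full.vec.txt', task_words, 'embeddings/task_only.vec.txt')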
Example #4
def semeval17_main_test(vec_path, test_path=None):
    test_path = test_path or df_test_path
    
    w2v=load_w2v(vec_path)
    result={}
    print 'test path:', test_path
    for root, dirs, files in os.walk(test_path):
        for file_ in files:
            s=os.path.join(root, file_)
            print s
            result[file_]=measure_score(w2v, s)
    for file_ in result:
        print file_, result[file_]
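A possible invocation of Example #4; both paths are placeholders, and the directory layout (one dataset file per entry under the test directory) is inferred from the os.walk loop above:

# Hypothetical call: score one embedding file against every dataset found under test_path.
semeval17_main_test('embeddings/en_aligned.txt',
                    test_path='data/semeval17/subtask2/test')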
Example #5
def main():
    def check_dir(dir_path, ask_for_del):
        if os.path.exists(dir_path):
            y=''
            if ask_for_del:
                y=raw_input('new empty {}? y/n:'.format(dir_path))
            if y.strip()=='y' or not ask_for_del:
                rmtree(dir_path)
            else:
                print('use a clean summary_dir')
                quit()
        makedirs(dir_path)
        oo=open(os.path.join(dir_path,'config.txt'),'w')
        d={}
        for name in dir(config):
            if '__' in name:continue
            d[name]=getattr(config,name)
        oo.write(json.dumps(d, ensure_ascii=False))
        oo.close()
    check_dir(config.summary_dir, config.ask_for_del)
    check_dir(config.model_dir, config.ask_for_del)
    # assume train and dev must use the same emb
    # train and test may use different embs
    # for more info, refer to README
    if config.limited_words:
        limited_words=get_words([config.train_data_path, config.dev_data_path]+config.test_data_paths)
        limited_words=limited_words.keys()
    else:
        limited_words=[]
    tags,_ = get_vocab(config.tags_path)
    source_init_idf = target_init_idf = None
    if config.share_emb:
        assert config.target_w2v_path==config.source_w2v_path
        assert config.target_idf_path==config.source_idf_path
        w2v=load_w2v(config.source_w2v_path, max_vocab_size=config.max_vocab_size, limited_words=limited_words, norm=True)
        source_init_embedding=target_init_embedding=np.array(w2v.values(), dtype=np.float32)
        if config.text_repr=='add+idf':
            w2idf = load_w2v(config.source_idf_path)
            def drop(x):
                if x>=3:return x
                else:return 0
            source_init_idf = target_init_idf =\
                np.array([drop(float(w2idf.get(w,7.0))) for w in w2v], dtype=np.float32)
        words={k:w for k,w in enumerate(w2v.keys())}
        source_line_processing = target_line_processing =\
            sequence_label_line_processing(words, tags, max_len = config.sen_len, size=3, return_length=True)
    else:
        source_w2v=load_w2v(config.source_w2v_path, max_vocab_size=config.max_vocab_size, limited_words=limited_words, norm=True)
        target_w2v=load_w2v(config.target_w2v_path, max_vocab_size=config.max_vocab_size, limited_words=limited_words, norm=True)
        source_init_embedding = np.array(source_w2v.values(), dtype=np.float32)
        target_init_embedding = np.array(target_w2v.values(), dtype=np.float32)
        if config.text_repr=='add+idf':
            source_w2idf=load_w2v(config.source_idf_path)
            target_w2idf=load_w2v(config.target_idf_path)
            source_init_idf=np.array([float(source_w2idf.get(w,10.0)) for w in source_w2v], dtype=np.float32)
            target_init_idf=np.array([float(target_w2idf.get(w,10.0)) for w in target_w2v], dtype=np.float32)
        source_words = {k:w for k,w in enumerate(source_w2v.keys())}
        target_words = {k:w for k,w in enumerate(target_w2v.keys())}
        source_line_processing=\
            sequence_label_line_processing(source_words, tags, max_len = config.sen_len, size=3, return_length=True)
        target_line_processing=\
            sequence_label_line_processing(target_words, tags, max_len = config.sen_len, size=3, return_length=True)
   
    # datas
    train_data = LineBasedDataset(config.train_data_path, source_line_processing, batch_size= config.batch_size)
    
    dev_data = LineBasedDataset(config.dev_data_path, source_line_processing, batch_size = config.batch_size)
    test_datas = [LineBasedDataset(path, target_line_processing, batch_size = config.batch_size)
        for path in config.test_data_paths]
    
    # show shape
    for k,inputs in enumerate(train_data):
        print '-'*20,'batch ',k,'-'*20
        for inp in inputs:
            print inp.shape
        if k>=3:break
    
    # compute class weights for class imbalance
    class_nums=get_label_nums(train_data, tags)
    class_weights=class_nums/np.sum(class_nums)*len(class_nums)
    print 'TRAIN CLASSES=\t',tags.values()
    print 'TRAIN CLASS_NUMS=\t',class_nums
    print 'TRAIN CLASS_WEIGHTS=\t',class_weights
     
    with tf.Session(config=config.session_conf) as sess:
        # use tf.name_scope to manage variable names
        source_model=TextClassifier(
            num_classes=len(tags), 
            init_embedding=source_init_embedding, 
            init_idf=source_init_idf, 
            class_weights=class_weights,
            emb_name=config.source+'_emb',
            reuse=False,
            mode='train',
            name_scope=config.source)
        if config.share_emb:
            target_model=source_model
        else:
            target_model=TextClassifier(
                num_classes=len(tags), 
                init_embedding=target_init_embedding, 
                init_idf=target_init_idf, 
                class_weights=class_weights,
                emb_name=config.target+'_emb',
                reuse=True,
                mode='eval',
                name_scope=config.target)

        # summary writers for different branches/classes
        summary_writers = {
            sub_path:tf.summary.FileWriter(os.path.join(config.summary_dir,sub_path), flush_secs=5)
                for sub_path in ['train','dev','test-1','test-2']}
        class_summary_writers = {
            sub_path:
                {class_name:tf.summary.FileWriter(os.path.join(config.summary_dir,sub_path,class_name), flush_secs=5)
                    for class_name in tags.values()}
                for sub_path in ['train','dev','test-1','test-2']}
        
        # train source
        train(sess, source_model, target_model,
                train_data, dev_data, test_datas, 
                tags=tags.values(),
                summary_writers=summary_writers,
                class_summary_writers=class_summary_writers)
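Every example above goes through load_w2v, whose implementation is not shown; a minimal reader consistent with how it is called (word2vec-style text file, optional dtype, vocabulary cap, word whitelist, and L2 normalization) might look like the sketch below. The file-format details are assumptions, not the project's actual code.

import numpy as np
from collections import OrderedDict

def load_w2v_sketch(path, dtype=np.float32, max_vocab_size=None,
                    limited_words=None, norm=False):
    # Read "word v1 v2 ... vn" lines into an OrderedDict so that
    # .keys() and .values() stay aligned, as the examples above assume.
    limited = set(limited_words) if limited_words else None
    w2v = OrderedDict()
    with open(path) as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) < 3:                 # skip an optional "<vocab> <dim>" header line
                continue
            word, values = parts[0], parts[1:]
            if limited is not None and word not in limited:
                continue
            vec = np.asarray(values, dtype=dtype)
            if norm:
                vec /= (np.linalg.norm(vec) + 1e-8)
            w2v[word] = vec
            if max_vocab_size and len(w2v) >= max_vocab_size:
                break
    return w2v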