예제 #1
0
        if words_list[w] in vector_map:
            v = vector_map[words_list[w]]
            v /= math.sqrt(v.dot(v)) # hurts performance slightly on monolingual
            word_vect_matrix[w] = v
        else:
            num_no_vect += 1
    print '%s words have no vector' % num_no_vect
    return word_vect_matrix


# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# The language identifiers are spliced into the data file names used below.
source_language = 'spanish'
target_language = 'english07'
pair_filename = 'word_pairs/es-en.pair'
reverse_pair = False

# ---------------------------------------------------------------------------
# Word vectors (target is loaded first, then source)
# ---------------------------------------------------------------------------
target_vectors, target_vect_size = word_vect_loader.load(
    'pos_data/%s.train.sent.vec' % target_language
)
source_vectors, source_vect_size = word_vect_loader.load(
    'pos_data/%s.train.sent.vec' % source_language
)

# Both vector files must report the same size; a single shared value is used
# from here on.
assert source_vect_size == target_vect_size
vect_size = source_vect_size

print('loading')

# ---------------------------------------------------------------------------
# Source-language sentences: main file plus the '-test' file
# ---------------------------------------------------------------------------
source_text_sentences = load_and_save.read_sentences_from_file(
    'pos_data/conll-%s.pos' % source_language
)
# presumably maps tokens/tags to integer ids, keeping at most `max_words`
# distinct words -- TODO confirm against load_and_save.integer_sentences
source_sentences, source_words, source_sentences_pos, _ = load_and_save.integer_sentences(
    source_text_sentences,
    pos=universal_pos_tags,
    max_words=10000,
)

# The raw test sentences are held in `source_test_sentences` only until the
# next call, which rebinds the same name to the converted result.
source_test_sentences = load_and_save.read_sentences_from_file(
    'pos_data/conll-%s-test.pos' % source_language
)
# The word list produced for the main source file is passed in via `words=`.
source_test_sentences, _, source_test_sentences_pos, _ = load_and_save.integer_sentences(
    source_test_sentences,
    pos=universal_pos_tags,
    words=source_words,
)

# ---------------------------------------------------------------------------
# Target-language sentences
# ---------------------------------------------------------------------------
# NOTE(review): this reads the target '-test' file, not a training file --
# confirm that is intended.
target_text_sentences = load_and_save.read_sentences_from_file(
    'pos_data/conll-%s-test.pos' % target_language
)
target_sentences, target_words, target_sentences_pos, _ = load_and_save.integer_sentences(
    target_text_sentences,
    pos=universal_pos_tags,
    max_words=10000,
)

# ---------------------------------------------------------------------------
# One vector matrix per language, built from its word list and loaded vectors
# ---------------------------------------------------------------------------
source_vector_matrix = make_vector_matrix(source_words, source_vectors, vect_size)
target_vector_matrix = make_vector_matrix(target_words, target_vectors, vect_size)