Example #1
import sys
import time

from fasttext import load_model, tokenize


def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    # Look up a vector for every token, reporting progress
    # to stderr every 10000 tokens.
    for i, t in enumerate(tokens, start=1):
        vec = f.get_word_vector(t)
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(i / len(tokens)))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
Example #2
        help="Model to use",
    )
    parser.add_argument(
        "question_words",
        help="word questions similar to tmikolov's file (see help for link)",
    )
    parser.add_argument(
        "threshold",
        type=int,
        help="threshold used to limit number of words used",
    )
    args = parser.parse_args()

    # Retrieve list of normalized word vectors for the first words up
    # until the threshold count.
    f = load_model(args.model)
    # Get the words with their frequencies, sorted in descending
    # order of frequency by default.
    words, freq = f.get_words(include_freq=True)
    words = words[:args.threshold]
    vectors = np.zeros((len(words), f.get_dimension()), dtype=float)
    for i in range(len(words)):
        wv = f.get_word_vector(words[i])
        # L2-normalize each vector so cosine similarity is a dot product.
        vectors[i] = wv / np.linalg.norm(wv)

    total_correct = 0
    total_qs = 0
    total_num_lines = 0

    total_se_correct = 0
    total_se_qs = 0
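Because every row stored in vectors above is L2-normalized, the cosine similarity between two vocabulary words reduces to a dot product. A small sketch under that assumption (the helper name cosine_sim is hypothetical, not part of the original script):

import numpy as np

def cosine_sim(w1, w2, words, vectors):
    # Rows of `vectors` are unit length, so the dot product equals
    # the cosine of the angle between the two word vectors.
    i, j = words.index(w1), words.index(w2)
    return float(np.dot(vectors[i], vectors[j]))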
Example #3
import torch
from torch.nn import EmbeddingBag
from fasttext import load_model

class FastTextEmbeddingBag(EmbeddingBag):
    def __init__(self, model_path):
        self.model = load_model(model_path)
        input_matrix = self.model.get_input_matrix()
        input_matrix_shape = input_matrix.shape
        super().__init__(input_matrix_shape[0], input_matrix_shape[1])
        # Seed the EmbeddingBag weights with fastText's input matrix.
        self.weight.data.copy_(torch.FloatTensor(input_matrix))
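Copying the input matrix works because fastText composes a word vector by averaging the rows for the word and its subword n-grams, and get_subwords exposes those row indices. A short sketch of inspecting them (the .bin path is a placeholder):

from fasttext import load_model

model = load_model("fil9.bin")  # placeholder model path
subwords, subinds = model.get_subwords("example")
# `subinds` are row indices into model.get_input_matrix(); averaging
# those rows reproduces model.get_word_vector("example").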
Example #4
        # (The start of forward() is not shown: it concatenates the
        # subword indices of every input word into word_subinds and
        # records each word's starting position in word_offsets.)
        word_offsets = word_offsets[:-1]
        ind = torch.LongTensor(word_subinds)
        offsets = torch.LongTensor(word_offsets)
        return super().forward(ind, offsets)


def random_word(N):
    return ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase +
                       string.digits,
                       k=N))


if __name__ == "__main__":
    ft_emb = FastTextEmbeddingBag("fil9.bin")
    model = load_model("fil9.bin")
    num_lines = 200
    total_seconds = 0.0
    total_words = 0
    for _ in range(num_lines):
        words = [
            random_word(random.randint(1, 10))
            for _ in range(random.randint(15, 25))
        ]
        total_words += len(words)
        words_average_length = sum([len(word) for word in words]) / len(words)
        # Time the batched embedding lookup (time.clock() no longer
        # exists in Python 3.8+).
        start = time.perf_counter()
        words_emb = ft_emb(words)
        total_seconds += time.perf_counter() - start
        # The original snippet is cut off here; the evident intent is to
        # compare the EmbeddingBag output with fastText's native vector
        # (assumes numpy imported as np).
        for i in range(len(words)):
            word = words[i]
            ft_word_emb = model.get_word_vector(word)
            py_emb = np.array(words_emb[i].data)
            assert np.isclose(ft_word_emb, py_emb).all()
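EmbeddingBag's default 'mean' mode mirrors how fastText averages a word's subword vectors, so the PyTorch lookup and get_word_vector should agree to floating-point tolerance.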