def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))

    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    i = 0
    for t in tokens:
        vec = f.get_word_vector(t)
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
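
# A minimal usage sketch for the benchmark above, assuming the module imports `time`,
# `sys`, and fastText's `load_model`/`tokenize`; the corpus and model paths below are
# placeholders, not files from the original repository.
if __name__ == "__main__":
    get_word_vector("data/corpus.txt", "models/wiki.en.bin")
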
    help="Model to use",
)
parser.add_argument(
    "question_words",
    help="word questions similar to tmikolov's file (see help for link)",
)
parser.add_argument(
    "threshold",
    help="threshold used to limit number of words used",
)
args = parser.parse_args()
args.threshold = int(args.threshold)

# Retrieve the list of normalized word vectors for the first words, up
# to the threshold count.
f = load_model(args.model)
# Gets words with their associated frequency, sorted in descending order by default.
words, freq = f.get_words(include_freq=True)
words = words[:args.threshold]
vectors = np.zeros((len(words), f.get_dimension()), dtype=float)
for i in range(len(words)):
    wv = f.get_word_vector(words[i])
    wv = wv / np.linalg.norm(wv)
    vectors[i] = wv

total_correct = 0
total_qs = 0
total_num_lines = 0

total_se_correct = 0
total_se_qs = 0
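
# A hedged sketch (not the script's own scoring code) of how one analogy question
# "A B C D" from tmikolov's question-words file could be scored with the normalized
# rows of `vectors`: because every row is unit length, a plain dot product is the
# cosine similarity, so the best candidate for D is the most similar retained word.
def answer_analogy_sketch(vec_a, vec_b, vec_c):
    query = vec_b - vec_a + vec_c
    query = query / np.linalg.norm(query)
    scores = np.dot(vectors, query)       # cosine similarities against retained words
    return words[int(np.argmax(scores))]  # a real evaluation would also skip A, B, C
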
    def __init__(self, model_path):
        self.model = load_model(model_path)
        input_matrix = self.model.get_input_matrix()
        input_matrix_shape = input_matrix.shape
        super().__init__(input_matrix_shape[0], input_matrix_shape[1])
        self.weight.data.copy_(torch.FloatTensor(input_matrix))
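
    # The (elided) beginning of forward() is assumed to map each input word to its
    # character n-gram ids via self.model.get_subwords(word), concatenating the ids
    # into word_subinds and recording each word's starting position in word_offsets
    # before the lines below hand them to EmbeddingBag.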
        word_offsets = word_offsets[:-1]
        ind = Variable(torch.LongTensor(word_subinds))
        offsets = Variable(torch.LongTensor(word_offsets))
        return super().forward(ind, offsets)


def random_word(N):
    return ''.join(
        random.choices(
            string.ascii_uppercase + string.ascii_lowercase + string.digits, k=N
        )
    )


if __name__ == "__main__":
    ft_emb = FastTextEmbeddingBag("fil9.bin")
    model = load_model("fil9.bin")
    num_lines = 200
    total_seconds = 0.0
    total_words = 0
    for _ in range(num_lines):
        words = [
            random_word(random.randint(1, 10))
            for _ in range(random.randint(15, 25))
        ]
        total_words += len(words)
        words_average_length = sum([len(word) for word in words]) / len(words)
        # time.clock() was removed in Python 3.8; use a monotonic timer instead.
        start = time.perf_counter()
        words_emb = ft_emb(words)
        total_seconds += (time.perf_counter() - start)
        for i in range(len(words)):
            word = words[i]