def train_eval(mode, model_file, descriptions_file, neg_words_mult=2.,
               lbda=50, min_words=50, eval_lines=5000, eval_words=10):
    """Train an entity model, holding out words for a retrieval evaluation."""
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts in the model's index order, passed to
        # the LR entity model along with the negative-word multiplier and lbda.
        bins = np.cumsum(
            [model.vocab[word].count for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        raise Exception('unsupported mode %s' % mode)
    rng = random.Random(1729)  # fixed seed so the held-out words are reproducible
    eval_items = []

    def sampled_word_seqs():
        # For the first eval_lines descriptions, set aside eval_words shuffled
        # word indices as evaluation queries; the remaining words are yielded
        # for training.
        for i, (entity, t, word_idxs) in \
                enumerate(read_entity_word_seqs(descriptions_file, model,
                                                min_words)):
            rng.shuffle(word_idxs)
            if i < eval_lines:
                eval_items.append(
                    (entity, word_idxs[:eval_words], len(word_idxs)))
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())
    evaluate_retrieval(model, entity_model, eval_items)
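
# Illustrative usage sketch, not part of the original module: the helper name
# and the lbda values below are made up for demonstration. train_eval does not
# return its metrics here, so evaluate_retrieval is assumed to report them
# itself; a simple hyper-parameter sweep is then just repeated calls.
def sweep_lbda_example(model_file, descriptions_file, lbdas=(1, 10, 50, 100)):
    for lbda in lbdas:
        print('--- lbda = %s ---' % lbda)
        train_eval('lr', model_file, descriptions_file, lbda=lbda)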

def train(mode, model_file, descriptions_file, output_file=None,
          neg_words_mult=2., lbda=50, min_words=1):
    """Train an entity model on all descriptions and optionally save it."""
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts in the model's index order, passed to
        # the LR entity model along with the negative-word multiplier and lbda.
        bins = np.cumsum(
            [model.vocab[word].count for word in model.index2word])
        entity_model = EntityModelLR(bins, neg_words_mult, lbda)
    else:
        raise Exception('unsupported mode %s' % mode)
    entity_model.train(
        model, read_entity_word_seqs(descriptions_file, model, min_words))
    if output_file is not None:
        entity_model.save(output_file)
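
# Hypothetical command-line entry point, added as a usage sketch only; the
# original project may drive train()/train_eval() differently, and the flag
# names below are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train (and optionally evaluate) an entity model.')
    parser.add_argument('mode', choices=['centroid', 'lr'])
    parser.add_argument('model_file', help='word2vec model to load')
    parser.add_argument('descriptions_file', help='entity descriptions input')
    parser.add_argument('--output-file', default=None,
                        help='where to save the trained entity model')
    parser.add_argument('--evaluate', action='store_true',
                        help='hold out words and evaluate retrieval instead '
                             'of saving a model')
    args = parser.parse_args()

    if args.evaluate:
        train_eval(args.mode, args.model_file, args.descriptions_file)
    else:
        train(args.mode, args.model_file, args.descriptions_file,
              output_file=args.output_file)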