def main():
    """Train a NearestCentroidSVM on precomputed embeddings, evaluate it,
    and plot the resulting ROC curve.

    Reads train/val/test embedding files from command-line args; test
    classes that also appear in training are excluded so the evaluation
    measures open-set behavior.
    """
    args = setupArgs()
    train_data, train_labels = utils.loadEmbeddings(args.train_data)
    val_data, val_labels = utils.loadEmbeddings(args.val_data)
    # Drop test samples whose class was seen in training.
    test_data, test_labels = excludeTrainClasses(
        *utils.loadEmbeddings(args.test_data), train_labels)
    # BUG FIX: the original re-assigned test_data/test_labels from a second
    # utils.loadEmbeddings(args.test_data) call right after the line above,
    # silently discarding the class exclusion. That overwrite is removed.
    clf = model.NearestCentroidSVM()
    clf.verbose = True
    predict_kwargs = {'mode': 'average'}
    print("Training {} classifier.".format(clf.__class__))
    false_pos, true_pos = evaluateClassifier(train_data, val_data, test_data,
                                             clf, predict_kwargs)
    plotROC(false_pos, true_pos, description=args.title)
def test(test_data, labels, model, device, batch=1, training=0, embeddings=None):
    """Evaluate *model* on *test_data* and return its accuracy.

    Args:
        test_data: iterable of input samples, consumed via batch_iter.
        labels: gold label sequences aligned with test_data.
        model: project model exposing .vocab, .embed_size, .search(), .eval().
        device: torch device the embedded inputs are moved to.
        batch: batch size for batch_iter (the test_y[0] unpacking below
            assumes batch == 1 — TODO confirm for larger batches).
        training: truthy to restore model.train() mode before returning.
        embeddings: optional pre-loaded embedding module; loaded from the
            word2vec file when omitted.

    Returns:
        float accuracy in [0, 1].
    """
    model.eval()
    # BUG FIX: was `embeddings == None`; identity comparison is the correct
    # (and safe) idiom for None checks.
    if embeddings is None:
        embeddings = loadEmbeddings(model.vocab, model.embed_size,
                                    './data/word2vec.6B.100d.txt')
    count, correct_count = 0, 0
    with torch.no_grad():
        for test_x, test_y in batch_iter(test_data, labels, batch):
            test_x = model.vocab.to_input_tensor(test_x)
            test_x = embeddings(test_x).to(device)
            output = model.search(test_x)
            # batch_iter yields a list of label sequences; take the first
            # (single-sample batch).
            test_y = test_y[0]
            for gold, predicted in zip(test_y, output):
                count += 1
                if gold == predicted:
                    correct_count += 1
    correct_rate = correct_count / count
    # Typo fixed in the log message ("corrent" -> "correct").
    print('the correct rate is : ', correct_rate)
    if training:
        model.train()
    return correct_rate
def __init__(self, queryEmbedFile, docEmbedFile, exclusive, debugPrint=False, useIDF=True):
    """Load answer-side embeddings (and optionally a separate query-side
    set) and build the corresponding word -> index maps.

    Args:
        queryEmbedFile: path to query embeddings, or None to reuse the
            answer embeddings for queries.
        docEmbedFile: path to the answer/document embeddings (required).
        exclusive: forwarded to the base-class constructor.
        debugPrint: enable verbose debugging output.
        useIDF: enable IDF weighting.
    """
    super().__init__(exclusive)
    self.debugPrint = debugPrint
    self.useIDF = useIDF

    print('Loading answer embeddings from: ' + docEmbedFile)
    answTokens, self.answEmbed = loadEmbeddings(docEmbedFile)
    self.answEmbedMap = createEmbedMap(answTokens)

    if queryEmbedFile is None:
        # No dedicated query embeddings: share the answer-side ones.
        self.queryEmbed = self.answEmbed
        self.queryEmbedMap = self.answEmbedMap
    else:
        print('Loading query embeddings from: ' + queryEmbedFile)
        queryTokens, self.queryEmbed = loadEmbeddings(queryEmbedFile)
        self.queryEmbedMap = createEmbedMap(queryTokens)

    print('Loading is done!')
def main():
    """Load precomputed embeddings and compute a projection of them."""
    options = setupArgs()
    features, _labels = utils.loadEmbeddings(options.embeddings)
    makeProjection(features)
def main():
    """Fit a projection on the training embeddings and apply it to the
    test embeddings."""
    options = setupArgs()
    train_features, _train_labels = utils.loadEmbeddings(options.train)
    test_features, _test_labels = utils.loadEmbeddings(options.test)
    makeProjection(train_features, test_features)
def main():
    """Run the KNN, K-Means, and Affinity Propagation evaluations on the
    loaded embeddings."""
    options = setupArgs()
    features, _labels = utils.loadEmbeddings(options.embeddings)
    evaluations = (
        evaluateEmbeddingsKNN,
        evaluateEmbeddingsKMeans,
        evaluateEmbeddingsAffinityProp,
    )
    for evaluate in evaluations:
        evaluate(features)
# Report corpus statistics before constructing the model.
print('# of classes: ' + str(corpus.classVoc.size()))
print()
print('# of training samples: ' + str(len(corpus.trainData)))
print('# of dev samples: ' + str(len(corpus.devData)))
print()
# Build the word+character embedding layer and the tagger that consumes
# the concatenated (embedDim + charDim) features.
embedding = Embedding(corpus.voc.size(), corpus.charVoc.size(), embedDim, charDim)
tagger = Tagger(embedDim + charDim, hiddenDim, corpus.classVoc.size(), inputDropoutRate, outputDropoutRate)
if not test and not args.random:
    # Training with pretrained vectors: reuse cached embedding weights if
    # they exist, otherwise read the raw embedding file once and cache the
    # resulting state_dict for subsequent runs.
    if os.path.exists(wordParamsFile):
        embedding.wordEmbedding.load_state_dict(torch.load(wordParamsFile))
    else:
        utils.loadEmbeddings(embedding.wordEmbedding, corpus.voc, wordEmbeddingFile)
        torch.save(embedding.wordEmbedding.state_dict(), wordParamsFile)
    if os.path.exists(charParamsFile):
        embedding.charEmbedding.load_state_dict(torch.load(charParamsFile))
    else:
        utils.loadEmbeddings(embedding.charEmbedding, corpus.charVoc, charEmbeddingFile)
        torch.save(embedding.charEmbedding.state_dict(), charParamsFile)
if test:
    # Evaluation mode: restore previously trained tagger and embedding weights.
    tagger.load_state_dict(torch.load(taggerParamsFile))
    embedding.load_state_dict(torch.load(embeddingParamsFile))
if useGpu:
    if torch.cuda.is_available():
        # NOTE(review): this chunk may continue past the visible source;
        # only the device selection is shown here.
        torch.cuda.set_device(args.gpuId)
def train(train_data, train_labels, test_data, test_labels, model, optimizer, device, epoch, batch, params_path, corate=-1):
    """Train *model* for *epoch* epochs, periodically evaluating on the test
    set and checkpointing to *params_path* whenever accuracy improves.

    Args:
        train_data, train_labels: training samples and labels (batched via batch_iter).
        test_data, test_labels: held-out data passed to test() for checkpoint decisions.
        model: project model exposing .vocab, .embed_size, forward(x, y) -> loss.
        optimizer: torch optimizer over model parameters.
        device: torch device for inputs/targets.
        epoch: number of passes over the training data.
        batch: minibatch size for batch_iter.
        params_path: file path for the checkpoint (state_dict + accuracy).
        corate: baseline accuracy a new checkpoint must beat (default -1
            means the first evaluation always saves).

    Side effects: writes the checkpoint file, plots the loss curve, and
    finally reloads the best saved weights into *model*.
    """
    model.train()
    log_every = 10   # print loss every N minibatches
    save_iter = 120  # evaluate + maybe checkpoint at this minibatch index of each epoch
    embeddings = loadEmbeddings(model.vocab, model.embed_size, './data/word2vec.6B.100d.txt')
    loss_data = []  # first-minibatch loss of each epoch, for the loss curve
    hint_correct_rate = [corate]  # accuracies seen so far; max() gates checkpointing
    s = time.time()
    for times in range(epoch):
        train_iter = 0
        for train_x, train_y in batch_iter(train_data, train_labels, batch):
            optimizer.zero_grad()
            sum_loss = torch.zeros(1,).to(device)
            m = len(train_y)
            # Per-sample forward passes accumulated into one mean loss
            # (samples are embedded one at a time, not as a padded batch).
            for idx in range(m):
                x = model.vocab.to_input_tensor([train_x[idx]])
                x = embeddings(x).to(device)
                y = torch.tensor([train_y[idx]], device=device)
                loss = model(x, y)
                sum_loss = sum_loss+loss
            sum_loss = 1./m*sum_loss
            sum_loss.backward()
            optimizer.step()
            if train_iter == 0:
                # Record only the epoch's first minibatch loss for plotting.
                loss_data.append(sum_loss.item())
            if train_iter % log_every == 0:
                e = time.time()
                print("the {} epoch, the {} iter, loss is : {} [time is : {}]".format(times, train_iter, sum_loss.item(), (e-s)/60.))
            if train_iter == save_iter:
                # auto save params: evaluate once per epoch (at this fixed
                # iteration) and checkpoint only on improvement.
                correct_rate = test(test_data, test_labels, model, device, training=1, embeddings=embeddings)
                if correct_rate > max(hint_correct_rate):
                    checkpoint = {
                        'model_dict': model.state_dict(),
                        'corate': correct_rate
                    }
                    torch.save(checkpoint, params_path)
                    print('save params')
                else:
                    print('not save params')
                hint_correct_rate.append(correct_rate)
            train_iter += 1
    loss_curve(loss_data)
    print(hint_correct_rate)
    # load best model
    # NOTE(review): assumes at least one checkpoint was written; epochs
    # shorter than save_iter minibatches would leave params_path absent.
    state = torch.load(params_path)
    model.load_state_dict(state['model_dict'])