Example #1
def main():
  args = setupArgs()
  train_data, train_labels = utils.loadEmbeddings(args.train_data)
  val_data, val_labels = utils.loadEmbeddings(args.val_data)
  # Filter out test examples whose classes also appear in the training data.
  test_data, test_labels = excludeTrainClasses(*utils.loadEmbeddings(args.test_data), train_labels)
  clf = model.NearestCentroidSVM()
  clf.verbose = True
  predict_kwargs = {'mode': 'average'}
  print("Training {} classifier.".format(clf.__class__))
  false_pos, true_pos = evaluateClassifier(train_data, val_data, test_data, clf, predict_kwargs)
  plotROC(false_pos, true_pos, description=args.title)
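The utils.loadEmbeddings helper itself is not shown on this page. A minimal sketch of the variant used in Examples #1 and #4-#6, which returns a feature matrix together with its labels; the .npz layout and the key names 'embeddings' and 'labels' are assumptions, not the real on-disk format:

import numpy as np

def loadEmbeddings(path):
    # Hypothetical loader: read a NumPy .npz archive holding a float
    # feature matrix of shape (n_samples, dim) and a parallel labels
    # array, and return them as a (data, labels) pair.
    archive = np.load(path, allow_pickle=True)
    return archive['embeddings'], archive['labels']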
Example #2
def test(test_data, labels, model, device, batch=1, training=False, embeddings=None):
    model.eval()

    if embeddings is None:
        embeddings = loadEmbeddings(model.vocab, model.embed_size,
                                './data/word2vec.6B.100d.txt')

    count, correct_count = 0, 0
    with torch.no_grad():
        for test_x, test_y in batch_iter(test_data, labels, batch):

            test_x = model.vocab.to_input_tensor(test_x)
            test_x = embeddings(test_x).to(device)

            output = model.search(test_x)

            test_y = test_y[0]
            for i in range(len(test_y)):
                count += 1
                if test_y[i] == output[i]:
                    correct_count += 1

        correct_rate = correct_count / count
        print('the correct rate is:', correct_rate)

    if training:
        # Restore training mode if this evaluation ran mid-training.
        model.train()
    return correct_rate
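Examples #2 and #8 call loadEmbeddings(vocab, embed_size, path) and then apply the result to index tensors, so it presumably returns an nn.Embedding built from a word2vec text file. A hedged sketch under that assumption; the word2id attribute and the "word v1 v2 ..." line format are guesses based on the surrounding code:

import torch
import torch.nn as nn

def loadEmbeddings(vocab, embed_size, path):
    # Hypothetical: build a frozen nn.Embedding whose rows come from a
    # word2vec-style text file (one "word v1 v2 ..." line per vector).
    # Words absent from the file keep zero vectors.
    weights = torch.zeros(len(vocab), embed_size)
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if word in vocab.word2id and len(values) == embed_size:
                weights[vocab.word2id[word]] = torch.tensor([float(v) for v in values])
    return nn.Embedding.from_pretrained(weights, freeze=True)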
Example #3
  def __init__(self, queryEmbedFile, docEmbedFile, exclusive, debugPrint=False, useIDF=True):
    super().__init__(exclusive)

    self.debugPrint = debugPrint
    self.useIDF = useIDF

    print('Loading answer embeddings from: ' + docEmbedFile)
    answWords, self.answEmbed = loadEmbeddings(docEmbedFile)
    self.answEmbedMap = createEmbedMap(answWords)

    if queryEmbedFile is not None:
      print('Loading query embeddings from: ' + queryEmbedFile)
      queryWords, self.queryEmbed = loadEmbeddings(queryEmbedFile)
      self.queryEmbedMap = createEmbedMap(queryWords)
    else:
      self.queryEmbed = self.answEmbed
      self.queryEmbedMap = self.answEmbedMap
    print('Loading is done!')
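createEmbedMap is also external to this snippet; given that loadEmbeddings returns a word list alongside an embedding matrix, it plausibly maps each word to its row index. A one-line sketch under that assumption:

def createEmbedMap(words):
    # Hypothetical: word -> row index, so that
    # answEmbed[answEmbedMap[w]] is the vector for word w.
    return {word: idx for idx, word in enumerate(words)}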
Example #4
def main():
    args = setupArgs()
    feature_data, labels_dict = utils.loadEmbeddings(args.embeddings)
    makeProjection(feature_data)
Example #5
def main():
    args = setupArgs()
    train_feature_data, train_labels_dict = utils.loadEmbeddings(args.train)
    test_feature_data, test_labels_dict = utils.loadEmbeddings(args.test)
    makeProjection(train_feature_data, test_feature_data)
Example #6
def main():
    args = setupArgs()
    feature_data, labels_dict = utils.loadEmbeddings(args.embeddings)
    evaluateEmbeddingsKNN(feature_data)
    evaluateEmbeddingsKMeans(feature_data)
    evaluateEmbeddingsAffinityProp(feature_data)
Example #7
print('# of classes:    ' + str(corpus.classVoc.size()))
print()
print('# of training samples: ' + str(len(corpus.trainData)))
print('# of dev samples:      ' + str(len(corpus.devData)))
print()

embedding = Embedding(corpus.voc.size(), corpus.charVoc.size(), embedDim,
                      charDim)
tagger = Tagger(embedDim + charDim, hiddenDim, corpus.classVoc.size(),
                inputDropoutRate, outputDropoutRate)

if not test and not args.random:
    if os.path.exists(wordParamsFile):
        embedding.wordEmbedding.load_state_dict(torch.load(wordParamsFile))
    else:
        utils.loadEmbeddings(embedding.wordEmbedding, corpus.voc,
                             wordEmbeddingFile)
        torch.save(embedding.wordEmbedding.state_dict(), wordParamsFile)

    if os.path.exists(charParamsFile):
        embedding.charEmbedding.load_state_dict(torch.load(charParamsFile))
    else:
        utils.loadEmbeddings(embedding.charEmbedding, corpus.charVoc,
                             charEmbeddingFile)
        torch.save(embedding.charEmbedding.state_dict(), charParamsFile)

if test:
    tagger.load_state_dict(torch.load(taggerParamsFile))
    embedding.load_state_dict(torch.load(embeddingParamsFile))

if useGpu and torch.cuda.is_available():
    torch.cuda.set_device(args.gpuId)
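In this example utils.loadEmbeddings has a third shape: it fills an existing embedding layer in place from a pretrained vector file. A minimal sketch, assuming the vocabulary offers a getIndex(word) lookup (hypothetical; only size() appears above) and the same "word v1 v2 ..." text format:

import torch

def loadEmbeddings(embedding, voc, path):
    # Hypothetical in-place loader: copy pretrained vectors into the
    # rows of an existing nn.Embedding for every word the vocabulary
    # knows; all other rows keep their random initialization.
    with torch.no_grad():
        with open(path, encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip().split(' ')
                word, values = parts[0], parts[1:]
                index = voc.getIndex(word)  # hypothetical lookup
                if index is not None:
                    embedding.weight[index] = torch.tensor([float(v) for v in values])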
Example #8
def train(train_data, train_labels, test_data, test_labels,
          model, optimizer, device, epoch, batch, params_path, corate=-1):
    model.train()

    log_every = 10
    save_iter = 120

    embeddings = loadEmbeddings(model.vocab, model.embed_size,
                                './data/word2vec.6B.100d.txt')

    loss_data = []
    hint_correct_rate = [corate]
    s = time.time()

    for times in range(epoch):
        train_iter = 0

        for train_x, train_y in batch_iter(train_data, train_labels, batch):

            optimizer.zero_grad()

            sum_loss = torch.zeros(1, device=device)
            m = len(train_y)
            for idx in range(m):

                x = model.vocab.to_input_tensor([train_x[idx]])
                x = embeddings(x).to(device)
                y = torch.tensor([train_y[idx]], device=device)

                loss = model(x, y)
                sum_loss = sum_loss+loss

            sum_loss = sum_loss / m
            sum_loss.backward()
            optimizer.step()

            if train_iter == 0:
                loss_data.append(sum_loss.item())
            if train_iter % log_every == 0:
                e = time.time()
                print("epoch {}, iter {}, loss: {} [elapsed: {:.1f} min]".format(
                    times, train_iter, sum_loss.item(), (e - s) / 60.))

            if train_iter == save_iter:
                # auto save params
                correct_rate = test(test_data, test_labels, model, device, training=True, embeddings=embeddings)
                if correct_rate > max(hint_correct_rate):
                    checkpoint = {
                        'model_dict': model.state_dict(),
                        'corate': correct_rate
                    }
                    torch.save(checkpoint, params_path)
                    print('saved params')
                else:
                    print('params not saved')
                hint_correct_rate.append(correct_rate)

            train_iter += 1

    loss_curve(loss_data)
    print(hint_correct_rate)
    # load best model
    state = torch.load(params_path)
    model.load_state_dict(state['model_dict'])
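Examples #2 and #8 both iterate with batch_iter(data, labels, batch). A minimal sketch, assuming it yields aligned (x, y) chunks in order; the real generator may shuffle or pad:

def batch_iter(data, labels, batch_size):
    # Hypothetical: yield successive (data, labels) chunks of
    # batch_size items; the final chunk may be shorter.
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size], labels[start:start + batch_size]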