Example #1
0
def predict_on_iob2(model, iob_url):
    """Run the model over an iob2 file and save the predictions.

    The output path is derived from the input path by replacing the
    ``.iob2`` suffix with ``.pred.txt``.

    Args:
        model: trained model; its parameters determine the device used
        iob_url: url to the iob2 input file
    """
    out_url = iob_url.replace('.iob2', '.pred.txt')
    print("predicting on {} \n the result will be saved in {}".format(
        iob_url, out_url))

    # build the dataset on the same device the model lives on
    device = next(model.parameters()).device
    dataset = ExhaustiveDataset(iob_url, device=device)

    model.eval()
    with open(out_url, 'w', encoding='utf-8', newline='\n') as fout:
        for sentence, records in dataset:
            fout.write(' '.join(sentence) + '\n')
            fout.write("length = {} \n".format(len(sentence)))
            fout.write("Gold: {}\n".format(str(records)))
            # predict() returns one result per input sentence; we pass one
            pred = predict(model, [sentence], dataset.categories, iob_url)[0]
            fout.write("Pred: {}\n\n".format(str(pred)))
Example #2
0
def train(n_epochs=30,
          embedding_url=None,
          char_feat_dim=50,
          freeze=False,
          train_url=TRAIN_URL,
          dev_url=DEV_URL,
          test_url=None,
          max_region=10,
          learning_rate=0.001,
          batch_size=100,
          early_stop=5,
          clip_norm=5,
          device='auto',
          save_only_best = True
          ):
    """ Train deep exhaustive model, Sohrab et al. 2018 EMNLP

    Args:
        n_epochs: number of epochs
        embedding_url: url to pretrained embedding file, set as None to use random embedding
        char_feat_dim: size of character level feature
        freeze: whether to freeze embedding
        train_url: url to train data
        dev_url: url to dev data
        test_url: url to test data for evaluating, set to None for not evaluating
        max_region: max entity region size
        learning_rate: learning rate
        batch_size: batch_size
        early_stop: early stop for training, stop when dev F1 has not
            improved for this many consecutive epochs (<= 0 disables)
        clip_norm: whether to perform norm clipping, set to 0 if not need
        device: device for torch
        save_only_best: only save model of best performance
    """

    # print arguments; vars() with no argument returns the current locals(),
    # which at this point contains exactly the keyword arguments above
    arguments = json.dumps(vars(), indent=2)
    print("exhaustive model is training with arguments", arguments)
    device = get_device(device)

    train_set = ExhaustiveDataset(train_url, device=device, max_region=max_region)
    train_loader = DataLoader(train_set, batch_size=batch_size, drop_last=False,
                              collate_fn=train_set.collate_func)

    # vocabularies are pre-built on disk; their sizes parameterize the
    # word- and char-level embedding layers of the model
    vocab = ju.load(VOCAB_URL)
    n_words = len(vocab)
    char_vocab = ju.load(VOCAB_URL.replace('vocab', 'char_vocab'))
    n_chars = len(char_vocab)

    model = ExhaustiveModel(
        hidden_size=200,
        # +1 output class for the padding label, which the loss below
        # ignores via a zero class weight
        n_tags=train_set.n_tags + 1,
        char_feat_dim=char_feat_dim,
        embedding_url=embedding_url,
        bidirectional=True,
        max_region=max_region,
        n_embeddings=n_words,
        n_chars = n_chars,
        embedding_dim=200,
        freeze=freeze
    )

    if device.type == 'cuda':
        print("using gpu,", torch.cuda.device_count(), "gpu(s) available!\n")
        # model = nn.DataParallel(model)
    else:
        print("using cpu\n")
    model = model.to(device)

    criterion = F.cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # max_f1 / max_f1_epoch track the best dev score; cnt counts epochs
    # since the last improvement (the early-stop counter)
    max_f1, max_f1_epoch, cnt = 0, 0, 0
    # ignore the padding part when calculating loss
    tag_weights = torch.Tensor([1] * train_set.n_tags + [0]).to(device)
    best_model_url = None

    # train and evaluate model
    for epoch in range(n_epochs):
        # switch to train mode
        model.train()
        batch_id = 0
        for data, labels, _ in train_loader:
            optimizer.zero_grad()
            outputs = model.forward(*data)
            # use weight parameter to skip padding part
            loss = criterion(outputs, labels, weight=tag_weights)
            loss.backward()
            # gradient clipping
            if clip_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            optimizer.step()

            # '\r' keeps intermediate batches on one console line; a full
            # newline is emitted every LOG_PER_BATCH batches
            endl = '\n' if batch_id % LOG_PER_BATCH == 0 else '\r'
            sys.stdout.write("epoch #%d, batch #%d, loss: %.6f, %s%s" %
                             (epoch, batch_id, loss.item(), datetime.now().strftime("%X"), endl))
            sys.stdout.flush()
            batch_id += 1

        cnt += 1
        # metrics on development set
        dev_metrics = evaluate(model, dev_url)
        if dev_metrics['f1'] > max_f1:
            max_f1 = dev_metrics['f1']
            max_f1_epoch = epoch
            # keep only the single best checkpoint on disk when requested
            if save_only_best and best_model_url:
                os.remove(best_model_url)
            best_model_url = from_project_root(
                "data/model/exhaustive_model_epoch%d_%f.pt" % (epoch, max_f1))
            # NOTE: saves the whole model object, not just a state_dict
            torch.save(model, best_model_url)
            cnt = 0

        print("maximum of f1 value: %.6f, in epoch #%d\n" % (max_f1, max_f1_epoch))
        # chained comparison: break only if early_stop > 0 and cnt reached it
        if cnt >= early_stop > 0:
            break
    print('\n')

    # reload the best checkpoint and report final metrics on the test set
    if test_url and best_model_url:
        model = torch.load(best_model_url)
        print("best model url:", best_model_url)
        print("evaluating on test dataset:", test_url)
        evaluate(model, test_url)

    print(arguments)
        'where to load train_test_split paths [default: ./train_test_split/1]')
    parser.add_argument('--out-dir',
                        type=str,
                        default='./',
                        help='where to dump files [default: ./]')
    parser.add_argument('--cuda', action='store_true', default=False)
    args = parser.parse_args()
    args.cuda = args.cuda and torch.cuda.is_available()

    model = load_checkpoint(args.model_path, use_cuda=args.cuda)
    model = model.eval()
    if model.cuda:
        model = model.cuda()

    dataset = ExhaustiveDataset(adaptor=model.adaptor,
                                split='test',
                                train_test_split_dir=args.train_test_split_dir)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)

    dist_jsons = defaultdict(lambda: {})
    pbar = tqdm(total=len(loader))
    test_sketchpaths = []

    for batch_idx, (sketch, sketch_object, sketch_context,
                    sketch_path) in enumerate(loader):
        sketch_name = sketch_uname(sketch_path[0])
        test_sketchpaths.append(os.path.basename(sketch_path[0]))
        sketch = Variable(sketch, volatile=True)
        if args.cuda:
            sketch = sketch.cuda()
        photo_generator = dataset.gen_photos()
Example #4
0
def evaluate(model, data_url):
    """ eval model on specific dataset

    Args:
        model: model to evaluate
        data_url: url to data for evaluating

    Returns:
        metrics on dataset: a dict with 'precision', 'recall' and 'f1' keys

    """
    print("\nEvaluating model use data from ", data_url, "\n")
    max_region = model.max_region
    dataset = ExhaustiveDataset(data_url,
                                next(model.parameters()).device,
                                max_region=max_region)
    # batch_size=1: each batch holds exactly one sentence
    data_loader = DataLoader(dataset,
                             batch_size=1,
                             collate_fn=dataset.collate_func)
    # switch to eval mode
    model.eval()
    region_true_list = list()
    region_pred_list = list()
    count = 0
    count_test = 0
    with torch.no_grad():
        for data, labels, records_list in data_loader:
            # argmax over the class dimension -> one predicted label per
            # candidate region of the sentence
            batch_region_labels = torch.argmax(model.forward(*data),
                                               dim=1).cpu()
            count += len(batch_region_labels[0])
            count_test += len(data[0][0])
            # presumably data[1] holds sentence lengths; with batch_size=1,
            # lengths[0] is the (only) sentence length — TODO confirm
            lengths = data[1]
            batch_maxlen = lengths[0]
            for region_labels, length, true_records in zip(
                    batch_region_labels, lengths, records_list):
                pred_records = {}
                ind = 0
                # enumerate candidate regions in the same order the dataset's
                # collate function produced them: all regions of size 1 first,
                # then size 2, ... up to max_region — must stay in sync with
                # ExhaustiveDataset.collate_func
                for region_size in range(1, max_region + 1):
                    for start in range(0, batch_maxlen - region_size + 1):
                        end = start + region_size
                        # label 0 is the non-entity class; labels 1..5 are
                        # entity categories (the constant 6 presumably equals
                        # the number of entity classes + 1 — verify against
                        # dataset.label_ids)
                        if 0 < region_labels[ind] < 6 and end <= length:
                            pred_records[(start, start +
                                          region_size)] = region_labels[ind]
                        ind += 1

                # collect (true, pred) label pairs: first every gold region,
                # then predicted regions with no gold counterpart (true = 0)
                for region in true_records:
                    true_label = dataset.label_ids[true_records[region]]
                    pred_label = pred_records[
                        region] if region in pred_records else 0
                    region_true_list.append(true_label)
                    region_pred_list.append(pred_label)
                for region in pred_records:
                    if region not in true_records:
                        region_pred_list.append(pred_records[region])
                        region_true_list.append(0)

    print(
        classification_report(region_true_list,
                              region_pred_list,
                              target_names=list(dataset.label_ids)[:6],
                              digits=6))
    print(count)
    print(count_test)
    ret = dict()
    # micro-averaged counts over the collected region pairs
    # NOTE(review): tp increments on any exact match, including a 0/0
    # (non-entity) pair if one can occur in the lists above — confirm
    # whether label_ids can map a gold record to 0
    tp = fp = fn = 0
    for pv, tv in zip(region_pred_list, region_true_list):
        if pv == tv:
            tp += 1
        else:  # predict value != true value
            if pv > 0:
                fp += 1
            if tv > 0:
                fn += 1

    ret['precision'], ret['recall'], ret['f1'] = calc_f1(tp, fp, fn)
    return ret