Exemplo n.º 1
0
def main():
    """Evaluate the BERT suffix predictor on the VYL test set.

    Command-line arguments:
        --cuda        Index of the CUDA device (CPU is used if CUDA is
                      unavailable).
        --batch_size  Batch size of the test loader.
        --finetuned   If given, load weights from
                      ``trained_vyl/model_bert_vyl_<lexicon>_<lr>.torch``.
        --lexicon     Either ``shared`` or ``split``; selects the test file.
        --lr          Learning rate, kept as a string because it is only
                      interpolated verbatim into the checkpoint file name.

    The top-1 accuracy is appended (formatted as ``'{:.3f} & '``) to
    ``results_final/results_vyl_bert_finetuned.txt`` or
    ``results_final/results_vyl_bert_basic.txt``.
    """

    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()

    parser.add_argument('--cuda',
                        default=None,
                        type=int,
                        required=True,
                        help='Selected CUDA.')
    parser.add_argument('--batch_size',
                        default=None,
                        type=int,
                        required=True,
                        help='Batch size.')
    parser.add_argument('--finetuned',
                        default=False,
                        action='store_true',
                        help='Using finetuned BERT model.')
    # Only these two settings have a test file; rejecting anything else here
    # avoids an unbound ``test_path`` further down.
    parser.add_argument('--lexicon',
                        default=None,
                        type=str,
                        required=True,
                        choices=('shared', 'split'),
                        help='Lexicon setting')
    parser.add_argument('--lr',
                        default=None,
                        type=str,
                        required=True,
                        help='Learning rate.')

    args = parser.parse_args()

    print('Lexicon setting: {}'.format(args.lexicon))
    print('Batch size: {}'.format(args.batch_size))
    print('Finetuned: {}'.format(args.finetuned))
    print('Learning rate: {}'.format(args.lr))

    # Define path to data
    inpath = str(Path('../../data/final').resolve())
    test_files = {
        'shared': 'sents_vyl_test.txt',
        'split': 'sents_vyl_test_split.txt',
    }
    test_path = '{}{}{}'.format(inpath, os.sep, test_files[args.lexicon])

    # Initialize test loader
    print('Load validation data...')
    test_data = AffixDataset(test_path)
    test_loader = DataLoader(test_data,
                             batch_size=args.batch_size,
                             collate_fn=collate_sents)

    tok = BertTokenizer.from_pretrained('bert-base-uncased')

    # Define device
    device = torch.device(
        'cuda:{}'.format(args.cuda) if torch.cuda.is_available() else 'cpu')

    # Initialize model
    affix_predictor = AffixPredictor('sfx', freeze=False)

    # Load finetuned model weights
    if args.finetuned:
        model_file = 'model_bert_vyl_{}_{}.torch'.format(
            args.lexicon, args.lr)
        print('Loading finetuned model weights from {}...'.format(model_file))
        # NOTE: torch.load unpickles the file; only load checkpoints that
        # come from a trusted source.
        affix_predictor.load_state_dict(
            torch.load('trained_vyl/{}'.format(model_file),
                       map_location=device))

    # Move model to the selected device
    affix_predictor = affix_predictor.to(device)

    # Initialize affix list
    excluded = {'NULL', 'orium', 'tude'}
    afxes = []
    with open(str(Path('../../data/external/affixes_vyl.txt').resolve()),
              'r') as f:
        for line in f:
            afx = line.strip()
            # Exclude blank lines and affixes not in BERT vocabulary
            if afx == '' or afx in excluded:
                continue
            afxes.append(afx.lower())

    # Add affixes not in BERT as unused tokens
    afxes = ['[unused96]', '[unused97]', '[unused98]'
             ] + ['##' + afx for afx in afxes]
    idxes_afx, _ = torch.sort(
        torch.tensor(tok.convert_tokens_to_ids(afxes)).to(device))

    print('Evaluating model...')

    y_true = []
    y_pred = []

    affix_predictor.eval()

    with torch.no_grad():

        for sents, masks, segs, idxes_mask, labels in test_loader:

            sents, masks, segs, idxes_mask = sents.to(device), masks.to(
                device), segs.to(device), idxes_mask.to(device)

            # Forward pass
            output = affix_predictor(sents, masks, segs, idxes_mask)

            # Keep only the scores of the candidate affixes
            output_afx = torch.index_select(output, -1, idxes_afx)

            # Rank predictions (best candidate first)
            _, preds_afx = torch.topk(output_afx,
                                      k=output_afx.size(-1),
                                      dim=-1)

            # Store labels and top-1 predictions. Column 0 of the ranking is
            # mapped back to vocabulary ids with one tensor index per batch
            # instead of one .item() call per candidate affix.
            y_true.extend(labels.reshape(-1).tolist())
            y_pred.extend(idxes_afx[preds_afx[:, 0]].tolist())

            # Delete tensors to free memory
            del sents, masks, segs, idxes_mask, labels, output
            del output_afx, preds_afx

    acc = sum(1 for t, p in zip(y_true, y_pred) if t == p) / len(y_true)

    variant = 'finetuned' if args.finetuned else 'basic'
    with open('results_final/results_vyl_bert_{}.txt'.format(variant),
              'a+') as f:
        f.write('{:.3f} & '.format(acc))
Exemplo n.º 2
0
def main():
    """Evaluate the non-finetuned BERT affix predictor on every count bin.

    Command-line arguments:
        --cuda        Index of the CUDA device (CPU is used if CUDA is
                      unavailable).
        --batch_size  Batch size of the test loader.

    For each ``(mode, count)`` pair in ``best_models`` the test file
    ``sents_<count>_test.txt`` is loaded (bins whose file is missing are
    skipped), the candidate affixes are ranked for every example, and the
    macro- and micro-averaged MRR@10 are appended to
    ``results_final/results_tok_macro.txt`` and
    ``results_final/results_tok_micro.txt``.
    """

    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()

    parser.add_argument('--cuda',
                        default=None,
                        type=int,
                        required=True,
                        help='Selected CUDA.')
    parser.add_argument('--batch_size',
                        default=None,
                        type=int,
                        required=True,
                        help='Batch size.')

    args = parser.parse_args()

    best_models = [('pfx', 1), ('pfx', 2), ('pfx', 4), ('pfx', 8), ('pfx', 16),
                   ('pfx', 32), ('pfx', 64)]

    # Everything below is identical for all bins, so it is set up once
    # rather than on every loop iteration.
    inpath = str(Path('../../data/final').resolve())
    tok = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device(
        'cuda:{}'.format(args.cuda) if torch.cuda.is_available() else 'cpu')
    mode2afx = {'pfx': 'prefixes', 'sfx': 'suffixes'}

    for mode, count in best_models:

        print('Mode: {}'.format(mode))
        print('Count: {}'.format(count))

        print('Batch size: {}'.format(args.batch_size))

        # Define path to data
        test_path = '{}{}sents_{:02d}_test.txt'.format(inpath, os.sep, count)

        # Initialize test loader
        print('Load validation data...')
        try:
            test_data = AffixDataset(test_path, mode)
        except FileNotFoundError:
            print('Bin not found.')
            continue

        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 collate_fn=collate_sents)

        # Initialize model and move it to the selected device
        affix_predictor = AffixPredictor(mode, freeze=False).to(device)

        # Initialize affix list
        afxes = []
        with open(
                str(
                    Path('../../data/external/bert_{}.txt'.format(
                        mode2afx[mode])).resolve()), 'r') as f:
            for line in f:
                afx = line.strip()
                if afx in ('', 'abil'):
                    continue
                afxes.append(afx.lower())

        # Suffixes are looked up as word-piece continuations ('##...');
        # prefixes are looked up as they are.
        afx_tokens = afxes if mode == 'pfx' else ['##' + a for a in afxes]
        idxes_afx, _ = torch.sort(
            torch.tensor(tok.convert_tokens_to_ids(afx_tokens)).to(device))

        print('Evaluating model...')

        y_true = []
        y_pred = []

        affix_predictor.eval()

        with torch.no_grad():

            for sents, masks, segs, idxes_mask, labels in test_loader:

                sents, masks, segs, idxes_mask = sents.to(device), masks.to(
                    device), segs.to(device), idxes_mask.to(device)

                # Forward pass
                output = affix_predictor(sents, masks, segs, idxes_mask)

                # Keep only the scores of the candidate affixes
                output_afx = torch.index_select(output, -1, idxes_afx)

                # Rank predictions (best candidate first)
                _, preds_afx = torch.topk(output_afx,
                                          k=output_afx.size(-1),
                                          dim=-1)

                # Store labels and the full ranking as vocabulary ids. One
                # tensor index per batch replaces a .item() call per element.
                y_true.extend(labels.reshape(-1).tolist())
                y_pred.extend(idxes_afx[preds_afx].tolist())

                # Delete tensors to free memory
                del sents, masks, segs, idxes_mask, labels, output
                del output_afx, preds_afx

        with open('results_final/results_tok_macro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(
                np.mean(list(mrr_macro(y_true, y_pred, 10).values()))))

        with open('results_final/results_tok_micro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(mrr_micro(y_true, y_pred, 10)))
Exemplo n.º 3
0
def main():
    """Evaluate the best BERT affix predictor of every configuration.

    Command-line arguments:
        --cuda        Index of the CUDA device (CPU is used if CUDA is
                      unavailable).
        --batch_size  Batch size of the test loader.
        --finetuned   If given, load the finetuned weights from
                      ``trained_main/model_bert_<lexicon>_<mode>_<count>_<lr>.torch``
                      for each configuration.

    ``best_models`` lists one ``(mode, count, lexicon, lr)`` tuple per
    configuration. For each one the matching test file is loaded
    (configurations whose file is missing are skipped), the model is scored
    with ``test_single`` / ``test_both``, and the macro and micro MRR are
    appended to
    ``results_final/results_bert_<lexicon>_<mode>_<finetuned|basic>_<macro|micro>.txt``.
    """

    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()

    parser.add_argument('--cuda',
                        default=None,
                        type=int,
                        required=True,
                        help='Selected CUDA.')
    parser.add_argument('--batch_size',
                        default=None,
                        type=int,
                        required=True,
                        help='Batch size.')
    parser.add_argument('--finetuned',
                        default=False,
                        action='store_true',
                        help='Using finetuned BERT model.')

    args = parser.parse_args()

    best_models = [
        ('pfx', 1, 'shared', '1e-05'), ('pfx', 2, 'shared', '3e-05'),
        ('pfx', 4, 'shared', '3e-05'), ('pfx', 8, 'shared', '3e-05'),
        ('pfx', 16, 'shared', '1e-05'), ('pfx', 32, 'shared', '3e-06'),
        ('pfx', 64, 'shared', '3e-06'), ('pfx', 1, 'split', '3e-06'),
        ('pfx', 2, 'split', '1e-05'), ('pfx', 4, 'split', '3e-06'),
        ('pfx', 8, 'split', '1e-06'), ('pfx', 16, 'split', '3e-06'),
        ('pfx', 32, 'split', '1e-06'), ('pfx', 64, 'split', '1e-06'),
        ('sfx', 1, 'shared', '3e-05'), ('sfx', 2, 'shared', '3e-05'),
        ('sfx', 4, 'shared', '3e-05'), ('sfx', 8, 'shared', '3e-05'),
        ('sfx', 16, 'shared', '1e-05'), ('sfx', 32, 'shared', '1e-05'),
        ('sfx', 64, 'shared', '3e-06'), ('sfx', 1, 'split', '3e-05'),
        ('sfx', 2, 'split', '1e-05'), ('sfx', 4, 'split', '3e-06'),
        ('sfx', 8, 'split', '1e-06'), ('sfx', 16, 'split', '1e-06'),
        ('sfx', 32, 'split', '1e-06'), ('sfx', 64, 'split', '1e-06'),
        ('both', 1, 'shared', '1e-05'), ('both', 2, 'shared', '3e-05'),
        ('both', 4, 'shared', '3e-05'), ('both', 8, 'shared', '3e-05'),
        ('both', 16, 'shared', '1e-05'), ('both', 32, 'shared', '3e-05'),
        ('both', 64, 'shared', '1e-05'), ('both', 1, 'split', '1e-05'),
        ('both', 2, 'split', '3e-06'), ('both', 4, 'split', '3e-06'),
        ('both', 8, 'split', '1e-06'), ('both', 16, 'split', '1e-06'),
        ('both', 32, 'split', '1e-06'), ('both', 64, 'split', '1e-06')
    ]

    # Loop-invariant setup: data directory, device, test-file name pattern
    # per lexicon setting, and the tag used in the result file names.
    inpath = str(Path('../../data/final').resolve())
    device = torch.device(
        'cuda:{}'.format(args.cuda) if torch.cuda.is_available() else 'cpu')
    test_file_patterns = {
        'shared': 'sents_{:02d}_test.txt',
        'split': 'sents_{:02d}_test_split.txt',
    }
    variant = 'finetuned' if args.finetuned else 'basic'

    for mode, count, lexicon, lr in best_models:

        print('Mode: {}'.format(mode))
        print('Count: {}'.format(count))
        print('Lexicon setting: {}'.format(lexicon))
        print('Learning rate: {}'.format(lr))

        print('Batch size: {}'.format(args.batch_size))
        print('Finetuned: {}'.format(args.finetuned))

        # Define path to data
        test_path = '{}{}{}'.format(inpath, os.sep,
                                    test_file_patterns[lexicon].format(count))

        # Initialize test loader
        print('Load validation data...')
        try:
            test_data = AffixDataset(test_path, mode)
        except FileNotFoundError:
            print('Bin not found.')
            continue

        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 collate_fn=collate_sents)

        # Initialize model
        affix_predictor = AffixPredictor(mode, freeze=False)

        # Load finetuned model weights
        if args.finetuned:
            model_file = 'model_bert_{}_{}_{:02d}_{}.torch'.format(
                lexicon, mode, count, lr)
            print('Loading finetuned model weights from {}...'.format(
                model_file))
            # NOTE: torch.load unpickles the file; only load checkpoints
            # that come from a trusted source.
            affix_predictor.load_state_dict(
                torch.load('trained_main/{}'.format(model_file),
                           map_location=device))

        # Move model to the selected device
        affix_predictor = affix_predictor.to(device)

        if mode == 'both':
            micro_score, macro_scores = test_both(test_loader,
                                                  affix_predictor, args.cuda)
        else:
            micro_score, macro_scores = test_single(test_loader,
                                                    affix_predictor, mode,
                                                    args.cuda)

        result_prefix = 'results_final/results_bert_{}_{}_{}'.format(
            lexicon, mode, variant)
        with open('{}_macro.txt'.format(result_prefix), 'a+') as f:
            f.write('{:.3f} & '.format(np.mean(list(macro_scores.values()))))
        with open('{}_micro.txt'.format(result_prefix), 'a+') as f:
            f.write('{:.3f} & '.format(micro_score))
Exemplo n.º 4
0
def main():
    """Train a BERT affix predictor for one count bin.

    Command-line arguments:
        --count       Count of derivatives; selects the data files
                      ``sents_<count>_{train,dev}[_split].txt``.
        --lexicon     Either ``shared`` or ``split``; ``split`` selects the
                      ``*_split.txt`` files.
        --mode        Affix type, passed to ``AffixDataset``,
                      ``AffixPredictor`` and ``train``.
        --batch_size  Batch size of the train and validation loaders.
        --lr          Learning rate.
        --n_epochs    Number of epochs.
        --cuda        Index of the CUDA device, passed on to ``train``.
        --freeze      If given, freeze BERT parameters.
    """

    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()

    parser.add_argument('--count',
                        default=None,
                        type=int,
                        required=True,
                        help='Count of derivatives.')
    # Only these two settings map to data files; rejecting anything else
    # here avoids unbound ``train_path`` / ``val_path`` further down.
    parser.add_argument('--lexicon',
                        default=None,
                        type=str,
                        required=True,
                        choices=('shared', 'split'),
                        help='Lexicon setting')
    parser.add_argument('--mode',
                        default=None,
                        type=str,
                        required=True,
                        help='Affix type.')
    parser.add_argument('--batch_size',
                        default=None,
                        type=int,
                        required=True,
                        help='Batch size.')
    parser.add_argument('--lr',
                        default=None,
                        type=float,
                        required=True,
                        help='Learning rate.')
    parser.add_argument('--n_epochs',
                        default=None,
                        type=int,
                        required=True,
                        help='Number of epochs.')
    parser.add_argument('--cuda',
                        default=None,
                        type=int,
                        required=True,
                        help='Selected CUDA.')
    parser.add_argument('--freeze',
                        default=False,
                        action='store_true',
                        help='Freeze BERT parameters.')

    args = parser.parse_args()

    print('Mode: {}'.format(args.mode))
    print('Lexicon setting: {}'.format(args.lexicon))
    print('Count: {}'.format(args.count))
    print('Batch size: {}'.format(args.batch_size))
    print('Learning rate: {}'.format(args.lr))
    print('Number of epochs: {}'.format(args.n_epochs))
    print('Freeze: {}'.format(args.freeze))

    # Define path to data
    inpath = str(Path('../../data/final').resolve())

    # The split-lexicon files carry a '_split' suffix before the extension.
    suffix = '_split' if args.lexicon == 'split' else ''
    train_path = '{}{}sents_{:02d}_train{}.txt'.format(
        inpath, os.sep, args.count, suffix)
    val_path = '{}{}sents_{:02d}_dev{}.txt'.format(inpath, os.sep, args.count,
                                                   suffix)

    # Initialize train loader
    print('Load training data...')
    train_data = AffixDataset(train_path, args.mode)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_sents)

    # Initialize val loader
    print('Load validation data...')
    val_data = AffixDataset(val_path, args.mode)
    val_loader = DataLoader(val_data,
                            batch_size=args.batch_size,
                            collate_fn=collate_sents)

    # Initialize model
    affix_predictor = AffixPredictor(args.mode, freeze=args.freeze)

    train(train_loader, val_loader, affix_predictor, args.mode, args.lr,
          args.n_epochs, args.cuda, args.count, args.lexicon, args.freeze)
Exemplo n.º 5
0
def main():
    """Evaluate the non-finetuned BERT affix predictor on every count bin.

    Command-line arguments:
        --cuda        Index of the CUDA device (CPU is used if CUDA is
                      unavailable).
        --batch_size  Batch size of the test loader.

    For each ``(mode, count)`` pair in ``models`` the test file
    ``sents_<count>_test.txt`` is loaded (bins whose file is missing are
    skipped), the model is scored with ``test_single``, and the macro and
    micro MRR are appended to ``results_final/results_hyp_macro.txt`` and
    ``results_final/results_hyp_micro.txt``.
    """

    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()

    parser.add_argument('--cuda', default=None, type=int, required=True, help='Selected CUDA.')
    parser.add_argument('--batch_size', default=None, type=int, required=True, help='Batch size.')

    args = parser.parse_args()

    models = [
        ('pfx', 1),
        ('pfx', 2),
        ('pfx', 4),
        ('pfx', 8),
        ('pfx', 16),
        ('pfx', 32),
        ('pfx', 64)
    ]

    # Loop-invariant setup: data directory and device do not depend on the bin.
    inpath = str(Path('../../data/final').resolve())
    device = torch.device('cuda:{}'.format(args.cuda) if torch.cuda.is_available() else 'cpu')

    for mode, count in models:

        print('Mode: {}'.format(mode))
        print('Count: {}'.format(count))

        print('Batch size: {}'.format(args.batch_size))

        # Define path to data
        test_path = '{}{}sents_{:02d}_test.txt'.format(inpath, os.sep, count)

        # Initialize test loader
        print('Load validation data...')
        try:
            test_data = AffixDataset(test_path, mode)
        except FileNotFoundError:
            print('Bin not found.')
            continue

        test_loader = DataLoader(test_data, batch_size=args.batch_size, collate_fn=collate_sents)

        # Initialize model and move it to the selected device
        affix_predictor = AffixPredictor(mode, freeze=False).to(device)

        micro_score, macro_scores = test_single(test_loader, affix_predictor, mode, args.cuda)

        with open('results_final/results_hyp_macro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(np.mean(list(macro_scores.values()))))
        with open('results_final/results_hyp_micro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(micro_score))