Example No. 1
import argparse

import torch
from torchtext.vocab import GloVe  # assuming torchtext's pre-trained GloVe loader

# get_labels, LstmCrf, NerDataset, train, and eval are project-local helpers.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default=None, type=str, required=True)
    parser.add_argument('--w2v_path', default=None, type=str, required=True)
    parser.add_argument('--labels', default=None, type=str, required=True)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--logging_steps', default=20, type=int)
    parser.add_argument('--learning_rate', default=5e-3, type=float)
    args = parser.parse_args()

    # Fall back to CPU when no GPU is available
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    labels = get_labels(args.labels)
    glove = GloVe(cache=args.w2v_path)

    # model
    model = LstmCrf(w2v=glove, num_tags=len(labels), hidden_dim=512)
    model.to(args.device)

    # dataset
    train_dataset = NerDataset(args.data_dir, labels, glove, mode='train')
    eval_dataset = NerDataset(args.data_dir, labels, glove, mode='dev')

    # train
    train(args, model, train_dataset)

    # evaluate (the project-local helper is named eval and shadows the built-in)
    result = eval(args, model, eval_dataset, labels)

    print(result)
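
The train and eval helpers above are project-local. As a rough illustration of how the parsed hyperparameters are consumed, here is a minimal sketch of such a training loop, assuming the dataset yields (tokens, tags) tensor batches and the model's forward pass returns the loss (both are assumptions; the real signatures may differ):

from torch.utils.data import DataLoader

def train(args, model, dataset):
    # Hypothetical sketch: the real project may batch and pad differently.
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    model.train()
    step = 0
    for _ in range(args.epochs):
        for tokens, tags in loader:
            tokens, tags = tokens.to(args.device), tags.to(args.device)
            loss = model(tokens, tags)  # assumes forward() returns the loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            step += 1
            if step % args.logging_steps == 0:
                print('step {}: loss {:.4f}'.format(step, loss.item()))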
Example No. 2
def build_tasks_from_file(conf_path, options=None):
    if isinstance(conf_path, str):
        conf = Config.read(conf_path)
    elif isinstance(conf_path, Config):
        conf = conf_path
    else:
        raise TypeError('Unknown configuration type. Expected str or Config.')

    if options:
        # Accept either a mapping or an iterable of (key, value) pairs.
        items = options.items() if isinstance(options, dict) else options
        for k, v in items:
            conf.update_value(k, v)

    # Create data sets
    logger.info('Loading data sets')
    datasets = {}
    lang_datasets = defaultdict(list)
    task_datasets = defaultdict(list)
    for dataset in conf.datasets:
        parser = create_parser(dataset.parser.format, dataset.parser)
        (train_conf, dev_conf,
         test_conf) = dataset.clone(), dataset.clone(), dataset.clone()
        train_conf.update({'path': dataset.files.train, 'parser': parser})
        dev_conf.update({
            'path': dataset.files.dev,
            'parser': parser,
            'sample': None
        })
        train_dataset = create_dataset(dataset.type, train_conf)
        dev_dataset = create_dataset(dataset.type, dev_conf)
        # Initialize to None so a test set from a previous iteration
        # cannot leak into datasets that lack a test split.
        test_dataset = None
        if hasattr(dataset.files, 'test'):
            test_conf.update({
                'path': dataset.files.test,
                'parser': parser,
                'sample': None
            })
            test_dataset = create_dataset(dataset.type, test_conf)
        datasets[dataset.name] = {
            'train': train_dataset,
            'dev': dev_dataset,
            'test': test_dataset,
            'language': dataset.language,
            'task': dataset.task
        }
        lang_datasets[dataset.language].append(dataset.name)
        task_datasets[dataset.task].append(dataset.name)

    # Create vocabs
    # Only words that appear in the data sets are kept, to save memory.
    # If the model will be applied to unseen test data, it is better to
    # keep all words from the pre-trained embeddings.
    logger.info('Creating vocabularies')
    dataset_counts = {}
    lang_token_vocabs = {}
    task_label_vocabs = {}
    for name, ds in datasets.items():
        # Test splits may be None; skip them when counting.
        dataset_counts[name] = compute_metadata(
            [d for d in (ds['train'], ds['dev'], ds['test']) if d is not None])
    for lang, ds in lang_datasets.items():
        counts = [dataset_counts[d][0] for d in ds]
        lang_token_vocabs[lang] = count2vocab(counts,
                                              ignore_case=True,
                                              start_idx=2)
    for task, ds in task_datasets.items():
        counts = [dataset_counts[d][1] for d in ds]
        task_label_vocabs[task] = count2vocab(counts,
                                              ignore_case=False,
                                              start_idx=0,
                                              sort=True)
    char_vocab = count2vocab([c[2] for c in dataset_counts.values()],
                             ignore_case=False,
                             start_idx=1)

    # Report stats
    for lang, vocab in lang_token_vocabs.items():
        logger.info('#{} tokens: {}'.format(lang, len(vocab)))
    for task, vocab in task_label_vocabs.items():
        logger.info('#{} labels: {}'.format(task, len(vocab)))
        logger.info(vocab)

    # Numberize datasets
    logger.info('Numberizing data sets')
    numberize_conf = []
    for ds in datasets.values():
        numberize_conf.append((ds['train'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
        numberize_conf.append((ds['dev'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
        if ds['test'] is not None:
            numberize_conf.append((ds['test'],
                                   lang_token_vocabs[ds['language']],
                                   task_label_vocabs[ds['task']], char_vocab))
    numberize_datasets(numberize_conf,
                       token_ignore_case=True,
                       label_ignore_case=False,
                       char_ignore_case=False)

    # Initialize component confs
    logger.info('Initializing component configurations')
    word_embed_dim = char_embed_dim = lstm_output_dim = 0
    cpnt_confs = {}
    for cpnt in conf.components:
        if cpnt.model == 'embedding':
            cpnt.embedding_dim = cpnt.dimension
            word_embed_dim = cpnt.dimension
        elif cpnt.model == 'char_cnn':
            cpnt.vocab_size = len(char_vocab)
            char_embed_dim = sum([x[1] for x in cpnt.filters])
        elif cpnt.model == 'lstm':
            lstm_output_dim = cpnt.hidden_size * (2 if cpnt.bidirectional else 1)
        cpnt_confs[cpnt.name] = cpnt.clone()

    # Update component configurations
    target_task = ''
    target_lang = ''
    for task_conf in conf.tasks:
        language = task_conf.language
        task = task_conf.task
        if task_conf.get('ref', False):
            target_lang = language
            target_task = task
        model_conf = task_conf.model
        if model_conf.model != 'lstm_crf':
            continue
        # Update word embedding configuration
        cpnt_confs[model_conf.word_embed].num_embeddings = len(
            lang_token_vocabs[language])
        cpnt_confs[model_conf.word_embed].vocab = lang_token_vocabs[language]
        # Update output layer configuration
        cpnt_confs[model_conf.univ_layer].out_features = len(
            task_label_vocabs[task])
        if hasattr(model_conf, 'spec_layer'):
            cpnt_confs[model_conf.spec_layer].out_features = len(
                task_label_vocabs[task])
        # Update CRF configuration
        cpnt_confs[model_conf.crf].label_vocab = task_label_vocabs[task]

    for _, cpnt_conf in cpnt_confs.items():
        if cpnt_conf.model == 'linear' and cpnt_conf.position == 'output':
            cpnt_conf.in_features = lstm_output_dim
        if cpnt_conf.model == 'lstm':
            cpnt_conf.input_size = char_embed_dim + word_embed_dim
        if cpnt_conf.model == 'highway' and cpnt_conf.position == 'char':
            cpnt_conf.size = char_embed_dim

    # Create components
    logger.info('Creating components')
    components = {k: create_module(v.model, v) for k, v in cpnt_confs.items()}

    # Construct models
    tasks = []
    for task_conf in conf.tasks:
        model_conf = task_conf.model
        language = task_conf.language
        task = task_conf.task
        if model_conf.model == 'lstm_crf':
            model = LstmCrf(
                lang_token_vocabs[language],
                task_label_vocabs[task],
                char_vocab,
                word_embedding=components[model_conf.word_embed],
                char_embedding=components[model_conf.char_embed] if hasattr(
                    model_conf, 'char_embed') else None,
                crf=components[model_conf.crf],
                lstm=components[model_conf.lstm],
                input_layer=None,
                univ_fc_layer=components[model_conf.univ_layer],
                spec_fc_layer=components[model_conf.spec_layer] if hasattr(
                    model_conf, 'spec_layer') else None,
                embed_dropout_prob=model_conf.embed_dropout,
                lstm_dropout_prob=model_conf.lstm_dropout,
                linear_dropout_prob=model_conf.linear_dropout,
                char_highway=components[model_conf.char_highway] if hasattr(
                    model_conf, 'char_highway') else None,
                use_char_embedding=model_conf.use_char_embedding if hasattr(
                    model_conf, 'use_char_embedding') else True,
            )
        # elif model_conf.model == 'cbow':
        #     pass
        else:
            raise ValueError('Unknown model: {}'.format(model_conf.model))
        logger.debug(model)

        task_classes = {'ner': NameTagging, 'pos': PosTagging}
        if task in task_classes:
            task_obj = task_classes[task](
                task_conf.name,
                model,
                datasets=datasets[task_conf.dataset],
                vocabs={
                    'token': lang_token_vocabs[language],
                    'label': task_label_vocabs[task],
                    'char': char_vocab
                },
                gpu=task_conf.gpu,
                # TODO: 'gpu' -> global config
                prob=getattr(task_conf, 'prob', 1.0),
                lr=getattr(task_conf, 'learning_rate', .001),
                momentum=getattr(task_conf, 'momentum', .9),
                decay_rate=getattr(task_conf, 'decay_rate', .9),
                decay_step=getattr(task_conf, 'decay_step', 10000),
                gradient_clipping=getattr(task_conf, 'gradient_clipping', 5.0),
                require_eval=getattr(task_conf, 'require_eval', True),
                ref=getattr(task_conf, 'ref', False),
                aux_task=task_conf.task != target_task,
                aux_lang=task_conf.language != target_lang,
            )
        else:
            raise ValueError('Unknown task {}'.format(task))
        tasks.append(task_obj)

    return tasks, {
        'lang_token_vocabs': lang_token_vocabs,
        'task_label_vocabs': task_label_vocabs,
        'components': components
    }
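
A minimal usage sketch; the config path and override key below are hypothetical, and the exact option syntax depends on the surrounding project:

# Hypothetical usage: build every task from a config file and
# override one value programmatically.
tasks, resources = build_tasks_from_file(
    'config/ner_pos.json',                    # hypothetical path
    options=[('training.max_step', 20000)])   # hypothetical override key
for task in tasks:
    logger.info('Built task: {}'.format(task))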
Example No. 3
feat_dim = word_embed.embedding_dim + char_embed.output_size
lstm = LSTM(feat_dim,
            args.lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
            forget_bias=args.lstm_forget_bias)
crf = CRF(label_size=len(label_vocab) + 2)  # +2 for the start/end transition tags
linear = Linears(in_features=lstm.output_size,
                 out_features=len(label_vocab),
                 hiddens=[lstm.output_size // 2])
lstm_crf = LstmCrf(token_vocab,
                   label_vocab,
                   char_vocab,
                   word_embedding=word_embed,
                   char_embedding=char_embed,
                   crf=crf,
                   lstm=lstm,
                   univ_fc_layer=linear,
                   embed_dropout_prob=args.feat_dropout,
                   lstm_dropout_prob=args.lstm_dropout,
                   char_highway=char_hw if args.use_highway else None)
if use_gpu:
    lstm_crf.cuda()
torch.set_num_threads(args.thread)

logger.debug(lstm_crf)

# Task
optimizer = optim.SGD(filter(lambda p: p.requires_grad, lstm_crf.parameters()),
                      lr=args.lr,
                      momentum=args.momentum)
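
With the optimizer covering only the trainable parameters, a single training step then looks roughly like this sketch; the forward call returning the CRF negative log-likelihood is an assumption about LstmCrf's API:

from torch.nn.utils import clip_grad_norm_

def train_step(batch_tokens, batch_chars, batch_tags, clip=5.0):
    # Hypothetical step; argument order and the loss call are assumptions.
    lstm_crf.train()
    optimizer.zero_grad()
    loss = lstm_crf(batch_tokens, batch_tags, batch_chars)
    loss.backward()
    clip_grad_norm_(lstm_crf.parameters(), clip)  # 5.0 matches the default in Example 2
    optimizer.step()
    return loss.item()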
Example No. 4
crf = CRF(Config({
    'label_vocab': label_vocab
}))
output_linear = Linear(Config({
    'in_features': lstm.output_size,
    'out_features': len(label_vocab)
}))

# LSTM CRF Model
lstm_crf = LstmCrf(
    token_vocab=token_vocab,
    label_vocab=label_vocab,
    char_vocab=char_vocab,
    word_embedding=word_embed,
    char_embedding=char_cnn,
    crf=crf,
    lstm=lstm,
    univ_fc_layer=output_linear,
    embed_dropout_prob=args.embed_dropout,
    lstm_dropout_prob=args.lstm_dropout,
    linear_dropout_prob=args.linear_dropout,
    char_highway=char_highway
)

if use_gpu:
    torch.cuda.set_device(args.gpu_idx)
    lstm_crf.cuda()

# Task
optimizer = optim.SGD(filter(lambda p: p.requires_grad, lstm_crf.parameters()),
                      lr=args.lr, momentum=args.momentum)
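
Example 2 gives every task decay_rate and decay_step defaults; paired with this SGD optimizer, the equivalent exponential step decay can be sketched with a LambdaLR (a hypothetical pairing, not taken from the project):

from torch.optim.lr_scheduler import LambdaLR

decay_rate, decay_step = 0.9, 10000  # the defaults used in Example 2
scheduler = LambdaLR(optimizer,
                     lr_lambda=lambda step: decay_rate ** (step / decay_step))
# Call scheduler.step() once per training step, after optimizer.step().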
Example No. 5
spec_linear_1_2 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_1))
# Linear layers for task 2
shared_linear_2 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_2))
spec_linear_2_1 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_2))
spec_linear_2_2 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_2))

lstm_crf_tgt = LstmCrf(token_vocab_1,
                       label_vocab_1,
                       char_vocab,
                       word_embedding=word_embed_1,
                       char_embedding=char_embed,
                       crf=crf_1,
                       lstm=lstm,
                       univ_fc_layer=shared_linear_1,
                       spec_fc_layer=spec_linear_1_1,
                       embed_dropout_prob=args.feat_dropout,
                       lstm_dropout_prob=args.lstm_dropout,
                       char_highway=char_hw if args.use_highway else None)
lstm_crf_cl = LstmCrf(token_vocab_2,
                      label_vocab_1,
                      char_vocab,
                      word_embedding=word_embed_2,
                      char_embedding=char_embed,
                      crf=crf_1,
                      lstm=lstm,
                      univ_fc_layer=shared_linear_1,
                      spec_fc_layer=spec_linear_1_2,
                      embed_dropout_prob=args.feat_dropout,
                      lstm_dropout_prob=args.lstm_dropout,
                      char_highway=char_hw if args.use_highway else None)
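
The target and cross-lingual models deliberately share the LSTM, character embedding, CRF, and universal linear layer while keeping per-language word embeddings and spec layers. A quick sanity check of that sharing, assuming LstmCrf stores its submodules under the constructor argument names (an assumption):

# Shared modules are the same objects, so gradients from either
# model update the same parameters.
assert lstm_crf_tgt.lstm is lstm_crf_cl.lstm
assert lstm_crf_tgt.crf is lstm_crf_cl.crf
assert lstm_crf_tgt.univ_fc_layer is lstm_crf_cl.univ_fc_layer
assert lstm_crf_tgt.spec_fc_layer is not lstm_crf_cl.spec_fc_layer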
Example No. 6
                  layer_num=train_args['charhw_layer'],
                  activation=train_args['charhw_func'])
feat_dim = word_embed.embedding_dim + char_embed.output_size
lstm = LSTM(feat_dim,
            train_args['lstm_hidden_size'],
            batch_first=True,
            bidirectional=True,
            forget_bias=train_args['lstm_forget_bias'])
crf = CRF(label_size=len(label_vocab) + 2)
linear = Linear(in_features=lstm.output_size, out_features=len(label_vocab))
lstm_crf = LstmCrf(token_vocab,
                   label_vocab,
                   char_vocab,
                   word_embedding=word_embed,
                   char_embedding=char_embed,
                   crf=crf,
                   lstm=lstm,
                   univ_fc_layer=linear,
                   embed_dropout_prob=train_args['feat_dropout'],
                   lstm_dropout_prob=train_args['lstm_dropout'],
                   char_highway=char_hw if train_args['use_highway'] else None)

word_embed.load_state_dict(state['model']['word_embed'])
char_embed.load_state_dict(state['model']['char_embed'])
char_hw.load_state_dict(state['model']['char_hw'])
lstm.load_state_dict(state['model']['lstm'])
crf.load_state_dict(state['model']['crf'])
linear.load_state_dict(state['model']['linear'])
lstm_crf.load_state_dict(state['model']['lstm_crf'])

if use_gpu:
    lstm_crf.cuda()
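
The checkpoint layout implied by the load calls above can be produced at save time along these lines; a sketch, and the real checkpoint may carry extra fields such as vocabularies and training args:

# Hypothetical counterpart of the load calls above.
state = {
    'model': {
        'word_embed': word_embed.state_dict(),
        'char_embed': char_embed.state_dict(),
        'char_hw': char_hw.state_dict(),
        'lstm': lstm.state_dict(),
        'crf': crf.state_dict(),
        'linear': linear.state_dict(),
        'lstm_crf': lstm_crf.state_dict(),
    }
}
torch.save(state, 'model.ckpt.tar')  # hypothetical path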
Example No. 7
output_linear = Linear(
    Config({
        'in_features': lstm.output_size,
        'out_features': len(label_vocab)
    }))
word_embed.load_state_dict(state['model']['word_embed'])
char_cnn.load_state_dict(state['model']['char_cnn'])
char_highway.load_state_dict(state['model']['char_highway'])
lstm.load_state_dict(state['model']['lstm'])
crf.load_state_dict(state['model']['crf'])
output_linear.load_state_dict(state['model']['output_linear'])
lstm_crf = LstmCrf(token_vocab=token_vocab,
                   label_vocab=label_vocab,
                   char_vocab=char_vocab,
                   word_embedding=word_embed,
                   char_embedding=char_cnn,
                   crf=crf,
                   lstm=lstm,
                   univ_fc_layer=output_linear,
                   embed_dropout_prob=train_args['embed_dropout'],
                   lstm_dropout_prob=train_args['lstm_dropout'],
                   linear_dropout_prob=train_args['linear_dropout'],
                   char_highway=char_highway)
lstm_crf.load_state_dict(state['model']['lstm_crf'])

if use_gpu:
    torch.cuda.set_device(args.gpu_idx)
    lstm_crf.cuda()
else:
    lstm_crf.cpu()

# Load dataset
logger.info('Loading data')
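
Once the weights are restored and data is loaded, decoding typically runs in eval mode; a minimal sketch, where the loader and the predict call are assumptions about the project's API:

# Hypothetical inference loop; the decode method name is an assumption.
lstm_crf.eval()
with torch.no_grad():
    for tokens, chars, seq_lens in eval_loader:  # hypothetical DataLoader
        best_paths = lstm_crf.predict(tokens, seq_lens, chars)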
Example No. 8
spec_output_linear_2_1 = Linear(
    Config({
        'in_features': lstm.output_size,
        'out_features': len(label_vocab_2)
    }))
spec_output_linear_2_2 = Linear(
    Config({
        'in_features': lstm.output_size,
        'out_features': len(label_vocab_2)
    }))

# LSTM CRF Models
lstm_crf_tgt = LstmCrf(token_vocab=token_vocab_1,
                       label_vocab=label_vocab_1,
                       char_vocab=char_vocab,
                       word_embedding=word_embed_1,
                       char_embedding=char_cnn,
                       crf=crf_1,
                       lstm=lstm,
                       univ_fc_layer=shared_output_linear_1,
                       spec_fc_layer=spec_output_linear_1_1,
                       embed_dropout_prob=args.embed_dropout,
                       lstm_dropout_prob=args.lstm_dropout,
                       linear_dropout_prob=args.linear_dropout,
                       char_highway=char_highway)
lstm_crf_cl = LstmCrf(token_vocab=token_vocab_2,
                      label_vocab=label_vocab_1,
                      char_vocab=char_vocab,
                      word_embedding=word_embed_2,
                      char_embedding=char_cnn,
                      crf=crf_1,
                      lstm=lstm,
                      univ_fc_layer=shared_output_linear_1,
                      spec_fc_layer=spec_output_linear_1_2,