def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default=None, type=str, required=True)
    parser.add_argument('--w2v_path', default=None, type=str, required=True)
    parser.add_argument('--labels', default=None, type=str, required=True)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--logging_steps', default=20, type=int)
    parser.add_argument('--learning_rate', default=5e-3, type=float)
    args = parser.parse_args()
    args.device = torch.device('cuda')

    labels = get_labels(args.labels)
    glove = GloVe(cache=args.w2v_path)

    # Model
    model = LstmCrf(w2v=glove, num_tags=len(labels), hidden_dim=512)
    model.to(args.device)

    # Datasets
    train_dataset = NerDataset(args.data_dir, labels, glove, mode='train')
    eval_dataset = NerDataset(args.data_dir, labels, glove, mode='dev')

    # Train
    train(args, model, train_dataset)

    # Evaluate
    result = eval(args, model, eval_dataset, labels)
    print(result)
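# A minimal sketch of an entry point for the script above. The script name and
# paths in the example invocation are placeholders, not taken from the
# repository:
#
#   python train_ner.py --data_dir ./data/conll2003 \
#       --w2v_path ./embeddings --labels ./data/labels.txt
#
if __name__ == '__main__':
    main()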
def build_tasks_from_file(conf_path, options=None):
    if type(conf_path) is str:
        conf = Config.read(conf_path)
    elif type(conf_path) is Config:
        conf = conf_path
    else:
        raise TypeError('Unknown configuration type. Expect str or Config.')

    if options:
        for k, v in options:
            conf.update_value(k, v)

    # Create data sets
    logger.info('Loading data sets')
    datasets = {}
    lang_datasets = defaultdict(list)
    task_datasets = defaultdict(list)
    for dataset in conf.datasets:
        parser = create_parser(dataset.parser.format, dataset.parser)
        train_conf, dev_conf, test_conf = (
            dataset.clone(), dataset.clone(), dataset.clone())
        train_conf.update({'path': dataset.files.train, 'parser': parser})
        dev_conf.update({
            'path': dataset.files.dev,
            'parser': parser,
            'sample': None
        })
        train_dataset = create_dataset(dataset.type, train_conf)
        dev_dataset = create_dataset(dataset.type, dev_conf)
        if hasattr(dataset.files, 'test'):
            test_conf.update({
                'path': dataset.files.test,
                'parser': parser,
                'sample': None
            })
            test_dataset = create_dataset(dataset.type, test_conf)
        datasets[dataset.name] = {
            'train': train_dataset,
            'dev': dev_dataset,
            'test': test_dataset,
            'language': dataset.language,
            'task': dataset.task
        }
        lang_datasets[dataset.language].append(dataset.name)
        task_datasets[dataset.task].append(dataset.name)

    # Create vocabs
    # Only words that appear in the data sets are kept to save memory.
    # If the model will be applied to an unknown test set, it is better to keep
    # all words in the pre-trained embeddings.
    logger.info('Creating vocabularies')
    dataset_counts = {}
    lang_token_vocabs = {}
    task_label_vocabs = {}
    for name, ds in datasets.items():
        dataset_counts[name] = compute_metadata(
            [ds['train'], ds['dev'], ds['test']])
    for lang, ds in lang_datasets.items():
        counts = [dataset_counts[d][0] for d in ds]
        lang_token_vocabs[lang] = count2vocab(counts, ignore_case=True,
                                              start_idx=2)
    for task, ds in task_datasets.items():
        counts = [dataset_counts[d][1] for d in ds]
        task_label_vocabs[task] = count2vocab(counts, ignore_case=False,
                                              start_idx=0, sort=True)
    char_vocab = count2vocab([c[2] for c in dataset_counts.values()],
                             ignore_case=False, start_idx=1)

    # Report stats
    for lang, vocab in lang_token_vocabs.items():
        logger.info('#{} token: {}'.format(lang, len(vocab)))
    for task, vocab in task_label_vocabs.items():
        logger.info('#{} label: {}'.format(task, len(vocab)))
        logger.info(vocab)

    # Numberize data sets
    logger.info('Numberizing data sets')
    numberize_conf = []
    for ds in datasets.values():
        numberize_conf.append((ds['train'],
                               lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']],
                               char_vocab))
        numberize_conf.append((ds['dev'],
                               lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']],
                               char_vocab))
        numberize_conf.append((ds['test'],
                               lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']],
                               char_vocab))
    numberize_datasets(numberize_conf,
                       token_ignore_case=True,
                       label_ignore_case=False,
                       char_ignore_case=False)

    # Initialize component confs
    logger.info('Initializing component configurations')
    word_embed_dim = char_embed_dim = lstm_output_dim = 0
    cpnt_confs = {}
    for cpnt in conf.components:
        if cpnt.model == 'embedding':
            cpnt.embedding_dim = cpnt.dimension
            word_embed_dim = cpnt.dimension
        elif cpnt.model == 'char_cnn':
            cpnt.vocab_size = len(char_vocab)
            char_embed_dim = sum([x[1] for x in cpnt.filters])
        elif cpnt.model == 'lstm':
            lstm_output_dim = cpnt.hidden_size * (2 if cpnt.bidirectional else 1)
        cpnt_confs[cpnt.name] = cpnt.clone()

    # Update component configurations
    target_task = ''
    target_lang = ''
    for task_conf in conf.tasks:
        language = task_conf.language
        task = task_conf.task
        if task_conf.get('ref', False):
            target_lang = language
            target_task = task
        model_conf = task_conf.model
        if model_conf.model != 'lstm_crf':
            continue
        # Update word embedding configuration
        cpnt_confs[model_conf.word_embed].num_embeddings = len(
            lang_token_vocabs[language])
        cpnt_confs[model_conf.word_embed].vocab = lang_token_vocabs[language]
        # Update output layer configuration
        cpnt_confs[model_conf.univ_layer].out_features = len(
            task_label_vocabs[task])
        if hasattr(model_conf, 'spec_layer'):
            cpnt_confs[model_conf.spec_layer].out_features = len(
                task_label_vocabs[task])
        # Update CRF configuration
        cpnt_confs[model_conf.crf].label_vocab = task_label_vocabs[task]

    for _, cpnt_conf in cpnt_confs.items():
        if cpnt_conf.model == 'linear' and cpnt_conf.position == 'output':
            cpnt_conf.in_features = lstm_output_dim
        if cpnt_conf.model == 'lstm':
            cpnt_conf.input_size = char_embed_dim + word_embed_dim
        if cpnt_conf.model == 'highway' and cpnt_conf.position == 'char':
            cpnt_conf.size = char_embed_dim

    # Create components
    logger.info('Creating components')
    components = {k: create_module(v.model, v) for k, v in cpnt_confs.items()}

    # Construct models
    tasks = []
    for task_conf in conf.tasks:
        model_conf = task_conf.model
        language = task_conf.language
        task = task_conf.task
        if model_conf.model == 'lstm_crf':
            model = LstmCrf(
                lang_token_vocabs[language],
                task_label_vocabs[task],
                char_vocab,
                word_embedding=components[model_conf.word_embed],
                char_embedding=components[model_conf.char_embed]
                if hasattr(model_conf, 'char_embed') else None,
                crf=components[model_conf.crf],
                lstm=components[model_conf.lstm],
                input_layer=None,
                univ_fc_layer=components[model_conf.univ_layer],
                spec_fc_layer=components[model_conf.spec_layer]
                if hasattr(model_conf, 'spec_layer') else None,
                embed_dropout_prob=model_conf.embed_dropout,
                lstm_dropout_prob=model_conf.lstm_dropout,
                linear_dropout_prob=model_conf.linear_dropout,
                char_highway=components[model_conf.char_highway]
                if hasattr(model_conf, 'char_highway') else None,
                use_char_embedding=model_conf.use_char_embedding
                if hasattr(model_conf, 'use_char_embedding') else True,
            )
        # elif model_conf.model == 'cbow':
        #     pass
        else:
            raise ValueError('Unknown model: {}'.format(model_conf.model))
        logger.debug(model)

        task_classes = {'ner': NameTagging, 'pos': PosTagging}
        if task in task_classes:
            task_obj = task_classes[task](
                task_conf.name,
                model,
                datasets=datasets[task_conf.dataset],
                vocabs={
                    'token': lang_token_vocabs[language],
                    'label': task_label_vocabs[task],
                    'char': char_vocab
                },
                gpu=task_conf.gpu,  # TODO: 'gpu' -> global config
                prob=getattr(task_conf, 'prob', 1.0),
                lr=getattr(task_conf, 'learning_rate', .001),
                momentum=getattr(task_conf, 'momentum', .9),
                decay_rate=getattr(task_conf, 'decay_rate', .9),
                decay_step=getattr(task_conf, 'decay_step', 10000),
                gradient_clipping=getattr(task_conf, 'gradient_clipping', 5.0),
                require_eval=getattr(task_conf, 'require_eval', True),
                ref=getattr(task_conf, 'ref', False),
                aux_task=task_conf.task != target_task,
                aux_lang=task_conf.language != target_lang,
            )
        else:
            raise ValueError('Unknown task: {}'.format(task))
        tasks.append(task_obj)

    return tasks, {
        'lang_token_vocabs': lang_token_vocabs,
        'task_token_vocabs': task_label_vocabs,
        'components': components
    }
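# A minimal usage sketch for build_tasks_from_file. The config path is a
# placeholder and the helper below is purely illustrative; the loop that
# actually trains the returned tasks lives elsewhere in the repository.
def _example_build_tasks():
    tasks, resources = build_tasks_from_file('experiment.conf')
    for task in tasks:
        logger.debug(task)
    return tasks, resources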
feat_dim = word_embed.embedding_dim + char_embed.output_size

lstm = LSTM(feat_dim,
            args.lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
            forget_bias=args.lstm_forget_bias)
crf = CRF(label_size=len(label_vocab) + 2)
linear = Linears(in_features=lstm.output_size,
                 out_features=len(label_vocab),
                 hiddens=[lstm.output_size // 2])
lstm_crf = LstmCrf(token_vocab, label_vocab, char_vocab,
                   word_embedding=word_embed,
                   char_embedding=char_embed,
                   crf=crf,
                   lstm=lstm,
                   univ_fc_layer=linear,
                   embed_dropout_prob=args.feat_dropout,
                   lstm_dropout_prob=args.lstm_dropout,
                   char_highway=char_hw if args.use_highway else None)
if use_gpu:
    lstm_crf.cuda()
torch.set_num_threads(args.thread)

logger.debug(lstm_crf)

# Task
optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                             lstm_crf.parameters()),
                      lr=args.lr, momentum=args.momentum)
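# A hedged sketch of a single parameter update with the SGD optimizer above.
# How the loss is computed by lstm_crf is repository-specific, so it is taken
# as an input; the clipping threshold of 5.0 mirrors the default used in the
# task configurations and is otherwise an assumption. The helper name is
# illustrative only.
def sgd_step(loss, model, optimizer, max_grad_norm=5.0):
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(
        [p for p in model.parameters() if p.requires_grad], max_grad_norm)
    optimizer.step()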
crf = CRF(Config({
    'label_vocab': label_vocab
}))
output_linear = Linear(Config({
    'in_features': lstm.output_size,
    'out_features': len(label_vocab)
}))

# LSTM-CRF model
lstm_crf = LstmCrf(
    token_vocab=token_vocab,
    label_vocab=label_vocab,
    char_vocab=char_vocab,
    word_embedding=word_embed,
    char_embedding=char_cnn,
    crf=crf,
    lstm=lstm,
    univ_fc_layer=output_linear,
    embed_dropout_prob=args.embed_dropout,
    lstm_dropout_prob=args.lstm_dropout,
    linear_dropout_prob=args.linear_dropout,
    char_highway=char_highway
)

if use_gpu:
    torch.cuda.set_device(args.gpu_idx)
    lstm_crf.cuda()

# Task
optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                             lstm_crf.parameters()),
                      lr=args.lr, momentum=args.momentum)
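# A hedged sketch of the exponential learning-rate decay suggested by the
# decay_rate / decay_step options used in the task configurations elsewhere in
# this repository; the exact schedule (per-step exponential decay) and the
# helper name are assumptions.
def decay_learning_rate(optimizer, init_lr, global_step,
                        decay_rate=0.9, decay_step=10000):
    lr = init_lr * decay_rate ** (global_step / decay_step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr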
spec_linear_1_2 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_1))
# Linear layers for task 2
shared_linear_2 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_2))
spec_linear_2_1 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_2))
spec_linear_2_2 = Linear(in_features=lstm.output_size,
                         out_features=len(label_vocab_2))

lstm_crf_tgt = LstmCrf(token_vocab_1, label_vocab_1, char_vocab,
                       word_embedding=word_embed_1,
                       char_embedding=char_embed,
                       crf=crf_1,
                       lstm=lstm,
                       univ_fc_layer=shared_linear_1,
                       spec_fc_layer=spec_linear_1_1,
                       embed_dropout_prob=args.feat_dropout,
                       lstm_dropout_prob=args.lstm_dropout,
                       char_highway=char_hw if args.use_highway else None)
lstm_crf_cl = LstmCrf(token_vocab_2, label_vocab_1, char_vocab,
                      word_embedding=word_embed_2,
                      char_embedding=char_embed,
                      crf=crf_1,
                      lstm=lstm,
                      univ_fc_layer=shared_linear_1,
                      spec_fc_layer=spec_linear_1_2,
                      embed_dropout_prob=args.feat_dropout,
                  layer_num=train_args['charhw_layer'],
                  activation=train_args['charhw_func'])
feat_dim = word_embed.embedding_dim + char_embed.output_size
lstm = LSTM(feat_dim,
            train_args['lstm_hidden_size'],
            batch_first=True,
            bidirectional=True,
            forget_bias=train_args['lstm_forget_bias'])
crf = CRF(label_size=len(label_vocab) + 2)
linear = Linear(in_features=lstm.output_size,
                out_features=len(label_vocab))
lstm_crf = LstmCrf(token_vocab, label_vocab, char_vocab,
                   word_embedding=word_embed,
                   char_embedding=char_embed,
                   crf=crf,
                   lstm=lstm,
                   univ_fc_layer=linear,
                   embed_dropout_prob=train_args['feat_dropout'],
                   lstm_dropout_prob=train_args['lstm_dropout'],
                   char_highway=char_hw if train_args['use_highway'] else None)

# Restore the trained parameters
word_embed.load_state_dict(state['model']['word_embed'])
char_embed.load_state_dict(state['model']['char_embed'])
char_hw.load_state_dict(state['model']['char_hw'])
lstm.load_state_dict(state['model']['lstm'])
crf.load_state_dict(state['model']['crf'])
linear.load_state_dict(state['model']['linear'])
lstm_crf.load_state_dict(state['model']['lstm_crf'])

if use_gpu:
    Config({
        'in_features': lstm.output_size,
        'out_features': len(label_vocab)
    }))

# Restore the trained parameters
word_embed.load_state_dict(state['model']['word_embed'])
char_cnn.load_state_dict(state['model']['char_cnn'])
char_highway.load_state_dict(state['model']['char_highway'])
lstm.load_state_dict(state['model']['lstm'])
crf.load_state_dict(state['model']['crf'])
output_linear.load_state_dict(state['model']['output_linear'])

lstm_crf = LstmCrf(token_vocab=token_vocab,
                   label_vocab=label_vocab,
                   char_vocab=char_vocab,
                   word_embedding=word_embed,
                   char_embedding=char_cnn,
                   crf=crf,
                   lstm=lstm,
                   univ_fc_layer=output_linear,
                   embed_dropout_prob=train_args['embed_dropout'],
                   lstm_dropout_prob=train_args['lstm_dropout'],
                   linear_dropout_prob=train_args['linear_dropout'],
                   char_highway=char_highway)
lstm_crf.load_state_dict(state['model']['lstm_crf'])

if use_gpu:
    torch.cuda.set_device(args.gpu_idx)
    lstm_crf.cuda()
else:
    lstm_crf.cpu()

# Load dataset
logger.info('Loading data')
        'out_features': len(label_vocab_2)
    }))
spec_output_linear_2_2 = Linear(
    Config({
        'in_features': lstm.output_size,
        'out_features': len(label_vocab_2)
    }))

# LSTM-CRF models
lstm_crf_tgt = LstmCrf(token_vocab=token_vocab_1,
                       label_vocab=label_vocab_1,
                       char_vocab=char_vocab,
                       word_embedding=word_embed_1,
                       char_embedding=char_cnn,
                       crf=crf_1,
                       lstm=lstm,
                       univ_fc_layer=shared_output_linear_1,
                       spec_fc_layer=spec_output_linear_1_1,
                       embed_dropout_prob=args.embed_dropout,
                       lstm_dropout_prob=args.lstm_dropout,
                       linear_dropout_prob=args.linear_dropout,
                       char_highway=char_highway)
lstm_crf_cl = LstmCrf(token_vocab=token_vocab_2,
                      label_vocab=label_vocab_1,
                      char_vocab=char_vocab,
                      word_embedding=word_embed_2,
                      char_embedding=char_cnn,
                      crf=crf_1,
                      lstm=lstm,
                      univ_fc_layer=shared_output_linear_1,
                      spec_fc_layer=spec_output_linear_1_2,