Example #1
def build_all_vocabs(files, output_dir, prefix=''):
    import os
    from collections import Counter
    from data import ConllParser, NameTaggingDataset
    # C (project constants) and counter_to_vocab are assumed to be module-level
    # imports in the original file.
    # Keep columns 3 (tokens) and -1 (labels); C.TOKEN_PROCESSOR is applied to
    # the first kept column (the tokens).
    parser = ConllParser([3, -1], processor={0: C.TOKEN_PROCESSOR})
    token_counter, char_counter, label_counter = Counter(), Counter(), Counter()
    for file in files:
        dataset = NameTaggingDataset(file, parser)
        tc, cc, lc = dataset.counters
        token_counter.update(tc)
        char_counter.update(cc)
        label_counter.update(lc)
    token_vocab = counter_to_vocab(token_counter, offset=len(C.TOKEN_PADS), pads=C.TOKEN_PADS)
    char_vocab = counter_to_vocab(char_counter, offset=len(C.CHAR_PADS), pads=C.CHAR_PADS)
    label_vocab = counter_to_vocab(label_counter)

    # flatten the vocab dicts into (entry, value) pairs for writing
    token_vocab = list(token_vocab.items())
    char_vocab = list(char_vocab.items())
    label_vocab = list(label_vocab.items())

    with open(os.path.join(output_dir, '{}token.vocab.tsv'.format(prefix)),
              'w', encoding='utf-8') as w:
        for t, c in token_vocab:
            w.write('{}\t{}\n'.format(t, c))
    with open(os.path.join(output_dir, '{}char.vocab.tsv'.format(prefix)),
              'w', encoding='utf-8') as w:
        for t, c in char_vocab:
            w.write('{}\t{}\n'.format(t, c))
    with open(os.path.join(output_dir, '{}label.vocab.tsv'.format(prefix)),
              'w', encoding='utf-8') as w:
        for t, c in label_vocab:
            w.write('{}\t{}\n'.format(t, c))
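
A minimal usage sketch for the function above. The split paths, output directory, and prefix are hypothetical placeholders; counter_to_vocab and C are assumed to be provided by the surrounding project.

# Hypothetical call: merge the train/dev/test splits into shared vocab files.
splits = ['data/conll/train.tsv', 'data/conll/dev.tsv', 'data/conll/test.tsv']
build_all_vocabs(splits, output_dir='data/conll/vocabs', prefix='eng.')
# This would write eng.token.vocab.tsv, eng.char.vocab.tsv and
# eng.label.vocab.tsv into data/conll/vocabs.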
Example #2
report_file.flush()

# Train
for dataset in datasets:
    best_model_file = os.path.join(output_dir,
                                   '{}.model.best.mdl'.format(dataset))
    dev_result_file = os.path.join(output_dir,
                                   '{}.result.dev.bio'.format(dataset))
    test_result_file = os.path.join(output_dir,
                                    '{}.result.test.bio'.format(dataset))
    logger.info('Output directory: {}'.format(output_dir))

    # data sets
    conll_parser = ConllParser(
        # keep columns 3 (tokens) and -1 (labels)
        [3, -1],
        # apply C.TOKEN_PROCESSOR to the first kept column (the tokens)
        processor={0: C.TOKEN_PROCESSOR})
    train_set = NameTaggingDataset(
        os.path.join(args.input, dataset, '{}train.tsv'.format(args.prefix)),
        conll_parser, gpu=use_gpu)
    dev_set = NameTaggingDataset(
        os.path.join(args.input, dataset, '{}dev.tsv'.format(args.prefix)),
        conll_parser, gpu=use_gpu)
    test_set = NameTaggingDataset(
        os.path.join(args.input, dataset, '{}test.tsv'.format(args.prefix)),
        conll_parser, gpu=use_gpu)
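
Example #2 stops right after the datasets are built. Following the pattern of Example #1, the vocabularies would typically be assembled from the datasets' counters; the snippet below is a sketch under that assumption (counter_to_vocab and the C.*_PADS constants are assumed to be available in this codebase).

    # Sketch (not in the original snippet): build vocabs from the dataset counters.
    token_counter, char_counter, label_counter = Counter(), Counter(), Counter()
    for ds in (train_set, dev_set, test_set):
        tc, cc, lc = ds.counters
        token_counter.update(tc)
        char_counter.update(cc)
        label_counter.update(lc)
    token_vocab = counter_to_vocab(token_counter, offset=len(C.TOKEN_PADS), pads=C.TOKEN_PADS)
    char_vocab = counter_to_vocab(char_counter, offset=len(C.CHAR_PADS), pads=C.CHAR_PADS)
    label_vocab = counter_to_vocab(label_counter)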
Example #3
# Logging file
log_writer = None
if args.log:
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger.addHandler(logging.FileHandler(log_file, encoding='utf-8'))
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Data file
logger.info('Loading data sets')
parser = ConllParser(separator='\t',
                     token_col=0,
                     label_col=1,
                     skip_comment=True)
train_set = SeqLabelDataset(args.train, parser=parser)
dev_set = SeqLabelDataset(args.dev, parser=parser)
test_set = SeqLabelDataset(args.test, parser=parser)
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabs')
token_count, char_count, label_count = Counter(), Counter(), Counter()
for ds in datasets.values():
    tc, cc, lc = ds.stats()
    token_count.update(tc)
    char_count.update(cc)
    label_count.update(lc)
token_vocab = count2vocab(token_count,
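
Example #3 is cut off inside the count2vocab call. Based on Example #8 further down, which numberizes its test set with the token, label, and char vocabularies, the built vocabs are typically used to numberize each dataset; the continuation below is a sketch under that assumption.

# Sketch (assumption): once the three vocabs are built, each dataset is
# numberized with them, mirroring the numberize call shown in Example #8.
for ds in datasets.values():
    ds.numberize(token_vocab, label_vocab, char_vocab)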
Example #4
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger = get_logger(__name__, log_file=log_file)
else:
    logger = get_logger(__name__)

logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Parser for CoNLL format file
conll_parser = ConllParser(Config({
    'separator': '\t',
    'token_col': 0,
    'label_col': 1,
    'skip_comment': True,
}))

# Load datasets
logger.info('Loading datasets')
train_set = SequenceDataset(Config({
    'path': args.train, 'parser': conll_parser, 'batch_size': args.batch_size}))
dev_set = SequenceDataset(Config({
    'path': args.dev, 'parser': conll_parser}))
test_set = SequenceDataset(Config({
    'path': args.test, 'parser': conll_parser}))
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabularies')
Example #5
logger.info('Output directory: {}'.format(output_dir))

# deterministic behavior
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# set gpu device
use_gpu = torch.cuda.is_available()
if use_gpu:
    torch.cuda.set_device(args.device)
torch.set_num_threads(args.thread)

# data sets
conll_parser = ConllParser([3, -1], processor={0: C.TOKEN_PROCESSOR})
train_set = NameTaggingDataset(
    os.path.join(args.input, '{}train.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)
dev_set = NameTaggingDataset(
    os.path.join(args.input, '{}dev.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)
test_set = NameTaggingDataset(
    os.path.join(args.input, '{}test.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)

# embedding vocab
if args.embed_vocab:
    embed_vocab = load_vocab(args.embed_vocab)
else:
    embed_vocab = build_embedding_vocab(args.embed)
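
The snippet above ends by either loading an embedding vocabulary or building one from the embedding file. build_embedding_vocab itself is not shown in this listing; the helper below is a rough sketch of what such a function usually does, assuming a GloVe-style whitespace-separated text embedding file, and is not the project's actual implementation.

def build_embedding_vocab_sketch(embed_file):
    # Collect the first field of every line into a token -> index mapping.
    vocab = {}
    with open(embed_file, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.rstrip('\n').split(' ')[0]
            if token and token not in vocab:
                vocab[token] = len(vocab)
    return vocab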
Example #6
# Logging file
log_writer = None
if args.log:
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger.addHandler(logging.FileHandler(log_file, encoding='utf-8'))
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Data file
logger.info('Loading data sets')
ner_parser = ConllParser(skip_comment=True)
pos_parser = ConllParser(token_col=1, label_col=3, skip_comment=True)

train_set_tgt = SeqLabelDataset(args.train_tgt, parser=ner_parser)
dev_set_tgt = SeqLabelDataset(args.dev_tgt, parser=ner_parser)
test_set_tgt = SeqLabelDataset(args.test_tgt, parser=ner_parser)

train_set_cl = SeqLabelDataset(args.train_cl, parser=ner_parser)
dev_set_cl = SeqLabelDataset(args.dev_cl, parser=ner_parser)
test_set_cl = SeqLabelDataset(args.test_cl, parser=ner_parser)

train_set_ct = SeqLabelDataset(args.train_ct, parser=pos_parser)
dev_set_ct = SeqLabelDataset(args.dev_ct, parser=pos_parser)
test_set_ct = SeqLabelDataset(args.test_ct, parser=pos_parser)

train_set_clct = SeqLabelDataset(args.train_clct, parser=pos_parser)
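
Example #6 builds NER (tgt, cl) and POS (ct, clct) splits side by side. Example #7 below groups such splits into a nested dict keyed by corpus and split; a sketch of the same grouping for this snippet would be:

# Sketch: group the splits per corpus, following the dict layout of Example #7.
datasets = {
    'tgt': {'train': train_set_tgt, 'dev': dev_set_tgt, 'test': test_set_tgt},
    'cl': {'train': train_set_cl, 'dev': dev_set_cl, 'test': test_set_cl},
    'ct': {'train': train_set_ct, 'dev': dev_set_ct, 'test': test_set_ct},
}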
Example #7
# Logging file
log_writer = None
if args.log:
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger.addHandler(logging.FileHandler(log_file, encoding='utf-8'))
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Data file
logger.info('Loading data sets')
ner_parser = ConllParser(skip_comment=True, separator='\t')

train_set_tgt = SeqLabelDataset(args.train_tgt, parser=ner_parser)
dev_set_tgt = SeqLabelDataset(args.dev_tgt, parser=ner_parser)
test_set_tgt = SeqLabelDataset(args.test_tgt, parser=ner_parser)

train_set_cl = SeqLabelDataset(args.train_cl, parser=ner_parser)
dev_set_cl = SeqLabelDataset(args.dev_cl, parser=ner_parser)
test_set_cl = SeqLabelDataset(args.test_cl, parser=ner_parser)

datasets = {
    'tgt': {
        'train': train_set_tgt,
        'dev': dev_set_tgt,
        'test': test_set_tgt
    },
Example #8
                   char_highway=char_hw if train_args['use_highway'] else None)

word_embed.load_state_dict(state['model']['word_embed'])
char_embed.load_state_dict(state['model']['char_embed'])
char_hw.load_state_dict(state['model']['char_hw'])
lstm.load_state_dict(state['model']['lstm'])
crf.load_state_dict(state['model']['crf'])
linear.load_state_dict(state['model']['linear'])
lstm_crf.load_state_dict(state['model']['lstm_crf'])

if use_gpu:
    lstm_crf.cuda()

# Load dataset
logger.info('Loading data')
parser = ConllParser()
test_set = SeqLabelDataset(data_file, parser=parser)
test_set.numberize(token_vocab, label_vocab, char_vocab)
idx_token = {v: k for k, v in token_vocab.items()}
idx_label = {v: k for k, v in label_vocab.items()}
processor = SeqLabelProcessor(gpu=use_gpu)

try:
    results = []
    dataset_loss = []
    for batch in DataLoader(test_set,
                            batch_size=50,
                            shuffle=False,
                            collate_fn=processor.process):
        tokens, labels, chars, seq_lens, char_lens = batch
        pred, loss = lstm_crf.predict(tokens, labels, seq_lens, chars,
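
The snippet is cut off inside the lstm_crf.predict call. Since the loop already prepares idx_token and idx_label, the predictions are typically mapped back to label strings afterwards; the continuation below is a sketch under that assumption (pred is assumed to hold one index sequence per sentence).

        # Sketch (assumption): convert predicted index sequences back to label
        # strings, trimming padding with seq_lens, and accumulate the loss.
        for pred_seq, seq_len in zip(pred, seq_lens):
            results.append([idx_label[int(i)] for i in pred_seq[:seq_len]])
        dataset_loss.append(loss)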
Example #9
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger = get_logger(__name__, log_file=log_file)
else:
    logger = get_logger(__name__)

logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Parser for CoNLL format file
name_tagging_parser = ConllParser(
    Config({
        'separator': '\t',
        'token_col': 0,
        'label_col': 1,
        'skip_comment': True,
    }))
pos_tagging_parser = ConllParser(
    Config({
        'separator': '\t',
        'token_col': 1,
        'label_col': 3,
        'skip_comment': True,
    }))

# Load data sets
logger.info('Loading data sets')
datasets = {}
train_set_tgt = SequenceDataset(
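
The listing ends mid-construction. Based on the SequenceDataset/Config usage in Example #4, a completed call would presumably look roughly like the sketch below; the exact argument names in the original script may differ.

# Sketch (assumption, modeled on Example #4's SequenceDataset usage):
train_set_tgt = SequenceDataset(Config({
    'path': args.train_tgt,
    'parser': name_tagging_parser,
    'batch_size': args.batch_size}))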