Example No. 1
    dev_features = []     # per-file feature/label lists used in the loop below
    test_features = []
    dev_labels = []
    test_labels = []
    train_features_tot = []
    test_word = []

    for i in range(file_num):
        dev_features0, dev_labels0 = utils.read_corpus(dev_lines[i])
        test_features0, test_labels0 = utils.read_corpus(test_lines[i])

        dev_features.append(dev_features0)
        test_features.append(test_features0)
        dev_labels.append(dev_labels0)
        test_labels.append(test_labels0)

        if args.output_annotation:  #NEW
            test_word0 = utils.read_features(test_lines[i])
            test_word.append(test_word0)

        if args.load_check_point:
            if os.path.isfile(args.load_check_point):
                print("loading checkpoint: '{}'".format(args.load_check_point))
                checkpoint_file = torch.load(args.load_check_point)
                args.start_epoch = checkpoint_file['epoch']
                f_map = checkpoint_file['f_map']
                l_map = checkpoint_file['l_map']
                c_map = checkpoint_file['c_map']
                in_doc_words = checkpoint_file['in_doc_words']
                train_features, train_labels = utils.read_corpus(lines[i])
            else:
                print("no checkpoint found at: '{}'".format(
                    args.load_check_point))
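Note: the fragment above relies on utils.read_corpus to return parallel lists of token sequences and label sequences for each dev/test file. A minimal sketch of what such a reader might look like, assuming a CoNLL-style format with one "token ... label" line per word and a blank line between sentences (the format and the helper shown here are assumptions, not the project's actual utils.read_corpus):

def read_corpus_sketch(lines):
    # Accumulate one sentence at a time; a blank line closes the sentence.
    features, labels = [], []
    tmp_f, tmp_l = [], []
    for line in lines:
        if not line.strip():
            if tmp_f:
                features.append(tmp_f)
                labels.append(tmp_l)
            tmp_f, tmp_l = [], []
            continue
        cols = line.split()
        tmp_f.append(cols[0])    # surface token (first column)
        tmp_l.append(cols[-1])   # gold label (last column)
    if tmp_f:                    # flush a trailing sentence with no final blank line
        features.append(tmp_f)
        labels.append(tmp_l)
    return features, labels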
Example No. 2
    jd = jd['args']  # keep only the training-time arguments from the loaded arg json

    checkpoint_file = torch.load(args.load_check_point,
                                 map_location=lambda storage, loc: storage)
    f_map = checkpoint_file['f_map']
    l_map = checkpoint_file['l_map']
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    # loading corpus
    print('loading corpus')
    with codecs.open(args.input_file, 'r', 'utf-8') as f:
        lines = f.readlines()

    # converting format
    features = utils.read_features(lines)

    # build model
    print('loading model')
    ner_model = LSTM_CRF(len(f_map),
                         len(l_map),
                         jd['embedding_dim'],
                         jd['hidden'],
                         jd['layers'],
                         jd['drop_out'],
                         large_CRF=jd['small_crf'])

    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if args.gpu >= 0:
        if_cuda = True
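The map_location lambda used here (and in Examples 4 and 5) deserializes every checkpoint tensor onto the CPU, so the model can be moved to the GPU explicitly afterwards. A minimal equivalent sketch, reusing the default checkpoint path from Example 5; passing map_location='cpu' behaves the same way:

import torch

# Load all tensors onto the CPU regardless of the device they were saved from.
checkpoint_file = torch.load('./checkpoint/cwlm_lstm_crf.model', map_location='cpu')
f_map = checkpoint_file['f_map']  # word feature map saved with the model
l_map = checkpoint_file['l_map']  # label map saved with the model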
Example No. 3
        ner_model.cuda()
        packer = CRFRepack_WC(len(l_map), True)
    else:
        if_cuda = False
        packer = CRFRepack_WC(len(l_map), False)

    decode_label = (args.decode_type == 'label')
    predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'],
                           c_map['\n'], l_map['<pad>'], l_map['<start>'],
                           decode_label, args.batch_size, jd['caseless'])

    # loading corpus
    print('loading corpus')
    lines = []
    features = []
    with codecs.open(args.input_file, 'r', 'utf-8') as f:
        for line in f:
            if line == '\n':
                features.append(utils.read_features(lines))  
                lines = []
                continue
            tmp = line.split()
            lines.append(tmp[0])
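    # Illustration added for clarity (not part of the original snippet): the
    # reading loop above keeps only the first whitespace-separated column of
    # each line and starts a new group at every blank line, so an (invented)
    # input file such as
    #
    #     Peter      B-PER
    #     Blackburn  I-PER
    #
    #     BRUSSELS   B-LOC
    #
    # produces two token groups, ['Peter', 'Blackburn'] and ['BRUSSELS'],
    # each passed through utils.read_features before annotation.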

    for idx in range(args.dataset_no):
        print('annotating the entity type', idx)
        with open(args.output_file + str(idx) + '.txt', 'w') as fout:
            for feature in features:
                predictor.output_batch(ner_model, feature, fout, idx)
                fout.write('\n')
Example No. 4
    checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage)
    f_map = checkpoint_file['f_map']
    l_map = checkpoint_file['l_map']
    c_map = checkpoint_file['c_map']
    in_doc_words = checkpoint_file['in_doc_words']
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    # loading corpus
    print('loading corpus')
    with codecs.open(args.input_file, 'r', 'utf-8') as f:
        lines = f.readlines()

    # converting format
    features = utils.read_features(lines)

    # build model
    print('loading model')
    ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'],
                            jd['char_hidden'], jd['char_layers'],
                            jd['word_dim'], jd['word_hidden'],
                            jd['word_layers'], len(f_map), jd['drop_out'],
                            large_CRF=jd['small_crf'], if_highway=jd['high_way'],
                            in_doc_words=in_doc_words,
                            highway_layers=jd['highway_layers'])

    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(args.gpu)
        ner_model.cuda()
    else:
        if_cuda = False

    decode_label = (args.decode_type == 'label')
Example No. 5
def load_pretrain_model(file_path):
    print("CSCI548 model loading")
    parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF')
    parser.add_argument('--load_arg',
                        default='./checkpoint/cwlm_lstm_crf.json',
                        help='path to arg json')
    parser.add_argument('--load_check_point',
                        default='./checkpoint/cwlm_lstm_crf.model',
                        help='path to model checkpoint file')
    parser.add_argument('--gpu', type=int, default=-1, help='gpu id')
    parser.add_argument(
        '--decode_type',
        choices=['label', 'string'],
        default='label',
        help=
        'type of decode function, set `label` to couple the label with the text, or set `string` to insert the label into the text'
    )
    parser.add_argument('--batch_size',
                        type=int,
                        default=50,
                        help='size of batch')
    parser.add_argument('--input_file',
                        default=file_path + "/test.tsv",
                        help='path to input un-annotated corpus')
    parser.add_argument('--output_file',
                        default='annotate/output',
                        help='path to output file')
    parser.add_argument('--dataset_no',
                        type=int,
                        default=1,
                        help='number of datasets')
    args = parser.parse_args()

    print('loading dictionary')
    with open(args.load_arg, 'r') as f:
        jd = json.load(f)
    jd = jd['args']

    checkpoint_file = torch.load(args.load_check_point,
                                 map_location=lambda storage, loc: storage)
    f_map = checkpoint_file['f_map']
    l_map = checkpoint_file['l_map']
    c_map = checkpoint_file['c_map']
    in_doc_words = checkpoint_file['in_doc_words']
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    # build model
    print('loading model')
    ner_model = LM_LSTM_CRF(len(l_map),
                            len(c_map),
                            jd['char_dim'],
                            jd['char_hidden'],
                            jd['char_layers'],
                            jd['word_dim'],
                            jd['word_hidden'],
                            jd['word_layers'],
                            len(f_map),
                            jd['drop_out'],
                            args.dataset_no,
                            large_CRF=jd['small_crf'],
                            if_highway=jd['high_way'],
                            in_doc_words=in_doc_words,
                            highway_layers=jd['highway_layers'])

    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(args.gpu)
        ner_model.cuda()
        packer = CRFRepack_WC(len(l_map), True)
    else:
        if_cuda = False
        packer = CRFRepack_WC(len(l_map), False)

    decode_label = (args.decode_type == 'label')
    predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'],
                           c_map['\n'], l_map['<pad>'], l_map['<start>'],
                           decode_label, args.batch_size, jd['caseless'])
    # evaluator = eval_wc(packer, l_map, args.eva_matrix)  # disabled: the parser
    # above defines no --eva_matrix argument, and the evaluator is unused here

    # loading corpus
    print('loading corpus')
    lines = []
    features = []
    with codecs.open(args.input_file, 'r', 'utf-8') as f:
        for i, line in enumerate(f):
            if i == 2000:
                break
            if line == '\n':
                features.append(utils.read_features(lines))
                lines = []
                continue
            tmp = line.split()
            lines.append(tmp[0])

    for idx in range(args.dataset_no):
        print('annotating the entity type', idx)
        with open(args.output_file + str(idx) + '.txt', 'w') as fout:
            for feature in features:
                predictor.output_batch(ner_model, feature, fout, idx)
                fout.write('\n')

    return args.output_file + str(idx) + '.txt'  # path for the last dataset index from the loop above
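A minimal usage sketch for the function above, assuming the default checkpoint files exist; the directory 'data/my_corpus' is made up for illustration:

if __name__ == '__main__':
    # Annotate data/my_corpus/test.tsv with the pretrained model and report
    # where the annotated output was written.
    output_path = load_pretrain_model('data/my_corpus')
    print('annotations written to', output_path)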
Example No. 6
    def read_dataset(self, file_dict, dataset_name, *args, **kwargs):
        print('loading corpus')
        self.file_num = len(self.args.train_file)
        for i in range(self.file_num):
            with codecs.open(self.args.train_file[i], 'r', 'utf-8') as f:
                lines0 = f.readlines()
                lines0 = lines0[0:2000]
                # print (len(lines0))
            self.lines.append(lines0)
        for i in range(self.file_num):
            with codecs.open(self.args.dev_file[i], 'r', 'utf-8') as f:
                dev_lines0 = f.readlines()
                dev_lines0 = dev_lines0[0:2000]
            self.dev_lines.append(dev_lines0)
        for i in range(self.file_num):
            with codecs.open(self.args.test_file[i], 'r', 'utf-8') as f:
                test_lines0 = f.readlines()
                test_lines0 = test_lines0[0:2000]
            self.test_lines.append(test_lines0)

        for i in range(self.file_num):
            dev_features0, dev_labels0 = utils.read_corpus(self.dev_lines[i])
            test_features0, test_labels0 = utils.read_corpus(
                self.test_lines[i])

            self.dev_features.append(dev_features0)
            self.test_features.append(test_features0)
            self.dev_labels.append(dev_labels0)
            self.test_labels.append(test_labels0)

            if self.args.output_annotation:  # NEW
                test_word0, test_word_tag0 = utils.read_features(
                    self.test_lines[i])
                self.test_word.append(test_word0)
                self.test_word_tag.append(test_word_tag0)
            #print (len(self.test_word), len(self.test_labels))
            if self.args.load_check_point:
                if os.path.isfile(self.args.load_check_point):
                    print("loading checkpoint: '{}'".format(
                        self.args.load_check_point))
                    self.checkpoint_file = torch.load(
                        self.args.load_check_point)
                    self.args.start_epoch = self.checkpoint_file['epoch']
                    self.f_map = self.checkpoint_file['f_map']
                    self.l_map = self.checkpoint_file['l_map']
                    c_map = self.checkpoint_file['c_map']  # note: kept local here, unlike the other maps stored on self
                    self.in_doc_words = self.checkpoint_file['in_doc_words']
                    train_features0, train_labels0 = utils.read_corpus(
                        self.lines[i])  # per-file lists, appended to self.train_features below
                else:
                    print("no checkpoint found at: '{}'".format(
                        self.args.load_check_point))
            else:
                print('constructing coding table')
                train_features0, train_labels0, self.f_map, self.l_map, self.char_count = utils.generate_corpus_char(
                    self.lines[i],
                    self.f_map,
                    self.l_map,
                    self.char_count,
                    c_thresholds=self.args.mini_count,
                    if_shrink_w_feature=False)
            self.train_features.append(train_features0)
            self.train_labels.append(train_labels0)

            self.train_features_tot += train_features0

        shrink_char_count = [
            k for (k, v) in iter(self.char_count.items())
            if v >= self.args.mini_count
        ]
        self.char_map = {
            shrink_char_count[ind]: ind
            for ind in range(0, len(shrink_char_count))
        }

        self.char_map['<u>'] = len(self.char_map)  # unk for char
        self.char_map[' '] = len(self.char_map)  # concat for char
        self.char_map['\n'] = len(self.char_map)  # eof for char

        f_set = {v for v in self.f_map}  # word features seen in training (dict iteration yields keys)
        dt_f_set = f_set
        self.f_map = utils.shrink_features(self.f_map, self.train_features_tot,
                                           self.args.mini_count)
        l_set = set()

        for i in range(self.file_num):
            dt_f_set = functools.reduce(
                lambda x, y: x | y, map(lambda t: set(t),
                                        self.dev_features[i]), dt_f_set)
            dt_f_set = functools.reduce(
                lambda x, y: x | y, map(lambda t: set(t),
                                        self.test_features[i]), dt_f_set)

            l_set = functools.reduce(lambda x, y: x | y,
                                     map(lambda t: set(t), self.dev_labels[i]),
                                     l_set)
            l_set = functools.reduce(
                lambda x, y: x | y, map(lambda t: set(t), self.test_labels[i]),
                l_set)

        if not self.args.rand_embedding:
            print("feature size: '{}'".format(len(self.f_map)))
            print('loading embedding')
            if self.args.fine_tune:  # when set, embeddings are not fine-tuned, so f_map is rebuilt from the embedding file
                self.f_map = {'<eof>': 0}
            self.f_map, self.embedding_tensor, self.in_doc_words = utils.load_embedding_wlm(
                self.args.emb_file,
                ' ',
                self.f_map,
                dt_f_set,
                self.args.caseless,
                self.args.unk,
                self.args.word_dim,
                shrink_to_corpus=self.args.shrink_embedding)
            print("embedding size: '{}'".format(len(self.f_map)))

        for label in l_set:
            if label not in self.l_map:
                self.l_map[label] = len(self.l_map)

        print('constructing dataset')
        for i in range(self.file_num):
            # construct dataset
            dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(
                self.train_features[i], self.train_labels[i], self.l_map,
                self.char_map, self.f_map, self.args.caseless)
            dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
                self.dev_features[i], self.dev_labels[i], self.l_map,
                self.char_map, self.f_map, self.args.caseless)
            test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
                self.test_features[i], self.test_labels[i], self.l_map,
                self.char_map, self.f_map, self.args.caseless)
            self.dataset_loader.append([
                torch.utils.data.DataLoader(tup,
                                            self.args.batch_size,
                                            shuffle=True,
                                            drop_last=False) for tup in dataset
            ])
            self.dev_dataset_loader.append([
                torch.utils.data.DataLoader(tup,
                                            50,
                                            shuffle=False,
                                            drop_last=False)
                for tup in dev_dataset
            ])
            self.test_dataset_loader.append([
                torch.utils.data.DataLoader(tup,
                                            50,
                                            shuffle=False,
                                            drop_last=False)
                for tup in test_dataset
            ])