Example #1
0
def build_dataloader(token_features,
                     labels,
                     batch_size,
                     missing_tagspace,
                     corpus_mask_value,
                     tag2idx,
                     chr2idx,
                     token2idx,
                     caseless,
                     shuffle=False,
                     drop_last=False):
    dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
        token_features, labels, missing_tagspace, tag2idx, chr2idx, token2idx,
        caseless, corpus_mask_value)
    dataloader = [
        torch.utils.data.DataLoader(tup,
                                    batch_size,
                                    shuffle=shuffle,
                                    drop_last=drop_last) for tup in dataset
    ]
    return dataloader
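A runnable toy sketch of the per-bucket loader pattern used above; the three TensorDatasets are hypothetical stand-ins for the bucketed datasets that utils.construct_bucket_mean_vb_wc would return, one per sentence-length bucket.

import torch
from torch.utils.data import TensorDataset, DataLoader
from itertools import chain

# Hypothetical stand-in buckets: 10 "sentences" each of width 5, 10 and 20.
buckets = [TensorDataset(torch.zeros(10, n), torch.zeros(10, n)) for n in (5, 10, 20)]

# Mirrors build_dataloader: one DataLoader per bucket.
loaders = [DataLoader(tup, batch_size=4, shuffle=True, drop_last=False) for tup in buckets]

# One training epoch simply walks every bucket's loader in turn.
for words, labels in chain(*loaders):
    pass  # the forward/backward pass would go here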
Example #2
0
    ALLOW_SPANLEN = checkpoint_file['ALLOW_SPANLEN']

    with codecs.open(args.dev_file, 'r', 'utf-8') as f:
        dev_lines = f.readlines()

    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()

    dev_features, dev_labels = utils.read_corpus(dev_lines)
    test_features, test_labels = utils.read_corpus(test_lines)

    dev_dataset = utils.construct_bucket_mean_vb_wc(
        dev_features,
        dev_labels,
        CRF_l_map,
        SCRF_l_map,
        c_map,
        f_map,
        SCRF_stop_tag=SCRF_l_map['<STOP>'],
        train_set=False)
    test_dataset = utils.construct_bucket_mean_vb_wc(
        test_features,
        test_labels,
        CRF_l_map,
        SCRF_l_map,
        c_map,
        f_map,
        SCRF_stop_tag=SCRF_l_map['<STOP>'],
        train_set=False)

    dev_dataset_loader = [
        torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
        for tup in dev_dataset
    ]
Example #3
0
                                       c_thresholds=args.mini_count,
                                       if_shrink_w_feature=False)

        f_set = {v for v in f_map}

        f_map = utils.shrink_features(f_map, train_features, args.mini_count)
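        # take the union of every token that appears in dev/test so the
        # embedding lookup below also covers words unseen in training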
        dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_features), f_set)
        dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_features), dt_f_set)

        f_map, embedding_tensor, in_doc_words = utils.load_embedding(args.emb_file, ' ', f_map, dt_f_set, args.unk, args.word_embedding_dim, shrink_to_corpus=args.shrink_embedding)

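        # union of every label type observed in dev/test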
        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)

    print('constructing dataset')
    dataset, dataset_onlycrf = utils.construct_bucket_mean_vb_wc(train_features, train_labels, CRF_l_map, SCRF_l_map, c_map, f_map, SCRF_stop_tag=SCRF_l_map['<STOP>'], ALLOW_SPANLEN=args.allowspan, train_set=True)
    dev_dataset = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, CRF_l_map, SCRF_l_map, c_map, f_map, SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
    test_dataset = utils.construct_bucket_mean_vb_wc(test_features, test_labels, CRF_l_map, SCRF_l_map, c_map, f_map, SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)

    dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
    dataset_loader_crf = [torch.utils.data.DataLoader(tup, 3, shuffle=True, drop_last=False) for tup in dataset_onlycrf] if dataset_onlycrf else None
    dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    print('building model')
    model = ner_model(args.word_embedding_dim, args.word_hidden_dim, args.word_lstm_layers, len(f_map),
                      len(c_map), args.char_embedding_dim, args.char_lstm_hidden_dim, args.cnn_filter_num,
                      args.char_lstm_layers, args.char_lstm, args.dropout_ratio, args.high_way, args.highway_layers,
                      CRF_l_map['<start>'], CRF_l_map['<pad>'], len(CRF_l_map), SCRF_l_map, args.scrf_dense_dim,
                      in_doc_words, args.index_embeds_dim, args.allowspan, SCRF_l_map['<START>'], SCRF_l_map['<STOP>'], args.grconv)
Example #4
0
        torch.cuda.set_device(args.gpu)

    # load corpus
    if args.test_file:
        with codecs.open(args.test_file, 'r', 'utf-8') as f:
            test_lines = f.readlines()
    else:
        with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
            test_lines = f.readlines()

    # converting format

    test_features, test_labels = utils.read_corpus(test_lines)

    # construct dataset
    test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
        test_features, test_labels, l_map, c_map, f_map, jd['caseless'])

    test_dataset_loader = [
        torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
        for tup in test_dataset
    ]

    # build model
    ner_model = LM_LSTM_CRF(len(l_map),
                            len(c_map),
                            jd['char_dim'],
                            jd['char_hidden'],
                            jd['char_layers'],
                            jd['word_dim'],
                            jd['word_hidden'],
                            jd['word_layers'],
                            len(f_map),
                            jd['drop_out'],
                            large_CRF=jd['small_crf'],
                            if_highway=jd['high_way'],
                            in_doc_words=in_doc_words,
                            highway_layers=jd['highway_layers'])
Example #5
0
            dt_f_set,
            args.caseless,
            args.unk,
            args.word_dim,
            shrink_to_corpus=args.shrink_embedding)
        print("embedding size: '{}'".format(len(f_map)))

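    # extend the label map with any label not seen during training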
    for label in l_set:
        if label not in l_map:
            l_map[label] = len(l_map)

    print('constructing dataset')
    for i in range(file_num):
        # construct dataset
        dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(
            train_features[i], train_labels[i], l_map, char_map, f_map,
            args.caseless)
        dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
            dev_features[i], dev_labels[i], l_map, char_map, f_map,
            args.caseless)
        test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
            test_features[i], test_labels[i], l_map, char_map, f_map,
            args.caseless)

        dataset_loader.append([
            torch.utils.data.DataLoader(tup,
                                        args.batch_size,
                                        shuffle=True,
                                        drop_last=False) for tup in dataset
        ])
        dev_dataset_loader.append([
            torch.utils.data.DataLoader(tup,
                                        50,
                                        shuffle=False,
                                        drop_last=False) for tup in dev_dataset
        ])
Example #6
0
            print("feature size: '{}'".format(len(f_map)))
            print('loading embedding')
            if args.fine_tune:  # i.e., do not fine-tune the embeddings
                f_map = {'<eof>': 0}
            f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk, args.word_dim, shrink_to_corpus=args.shrink_embedding)
            print("embedding size: '{}'".format(len(f_map)))

        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
        for label in l_set:
            if label not in l_map:
                l_map[label] = len(l_map)
    
    print('constructing dataset')
    # construct dataset
    dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(train_features, train_labels, l_map, c_map, f_map, args.caseless)
    dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, l_map, c_map, f_map, args.caseless)
    test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, args.caseless)
    
    dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
    dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    # build model
    print('building model')
    ner_model = LM_LSTM_CRF(len(l_map), len(c_map), args.char_dim, args.char_hidden, args.char_layers, args.word_dim, args.word_hidden, args.word_layers, len(f_map), args.drop_out, large_CRF=args.small_crf, if_highway=args.high_way, in_doc_words=in_doc_words, highway_layers = args.highway_layers)

    if args.load_check_point:
        ner_model.load_state_dict(checkpoint_file['state_dict'])
    else:
        if not args.rand_embedding:
Example #7
0
    c_map = checkpoint_file['c_map']
    in_doc_words = checkpoint_file['in_doc_words']
    SCRF_l_map = checkpoint_file['SCRF_l_map']
    ALLOW_SPANLEN = checkpoint_file['ALLOW_SPANLEN']

    with codecs.open(args.dev_file, 'r', 'utf-8') as f:
        dev_lines = f.readlines()

    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()


    dev_features, dev_labels = utils.read_corpus(dev_lines)
    test_features, test_labels = utils.read_corpus(test_lines)

    dev_dataset = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, CRF_l_map, SCRF_l_map, c_map, f_map, SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
    test_dataset = utils.construct_bucket_mean_vb_wc(test_features, test_labels, CRF_l_map, SCRF_l_map, c_map, f_map, SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)

    dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    print('build model')
    model = ner_model(jd['word_embedding_dim'], jd['word_hidden_dim'], jd['word_lstm_layers'],
                      len(f_map), len(c_map), jd['char_embedding_dim'], jd['char_lstm_hidden_dim'],
                      jd['cnn_filter_num'], jd['char_lstm_layers'], jd['char_lstm'], jd['dropout_ratio'],
                      jd['high_way'], jd['highway_layers'], CRF_l_map['<start>'], CRF_l_map['<pad>'],
                      len(CRF_l_map), SCRF_l_map, jd['scrf_dense_dim'], in_doc_words,
                      jd['index_embeds_dim'], jd['allowspan'], SCRF_l_map['<START>'], SCRF_l_map['<STOP>'],
                      jd['grconv'])

    print('load model')
Example #8
0
                f_map = {'<eof>': 0}
            
            f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk, args.word_dim, shrink_to_corpus=args.shrink_embedding)
            print("embedding size: '{}'".format(len(f_map)))
            


        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
        l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
        for label in l_set:
            if label not in l_map:
                l_map[label] = len(l_map)
    
    print('constructing dataset')
    # construct dataset
    dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(train_features, train_labels, l_map, c_map, f_map, args.caseless)
    dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, l_map, c_map, f_map, args.caseless)
    test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, args.caseless)
    co_dataset, _, _ = utils.construct_bucket_mean_vb_wc(co_features, co_labels, l_map, c_map, f_map, args.caseless)
    
    dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
    dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 1, shuffle=False, drop_last=False) for tup in test_dataset]
    co_dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in co_dataset]
    
    
    #string match part
    word_to_id, emb_tensor, emb = utils.string_match_embedding(args.emb_file, args.word_dim)
    
    # build model
    print('building model')
Example #9
0
    def read_dataset(self, file_dict, dataset_name, *args, **kwargs):
        print('loading corpus')
        self.file_num = len(self.args.train_file)
        for i in range(self.file_num):
            with codecs.open(self.args.train_file[i], 'r', 'utf-8') as f:
                lines0 = f.readlines()
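                # keep only the first 2000 lines of each corpus (dev and test
                # below are capped the same way)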
                lines0 = lines0[0:2000]
                # print (len(lines0))
            self.lines.append(lines0)
        for i in range(self.file_num):
            with codecs.open(self.args.dev_file[i], 'r', 'utf-8') as f:
                dev_lines0 = f.readlines()
                dev_lines0 = dev_lines0[0:2000]
            self.dev_lines.append(dev_lines0)
        for i in range(self.file_num):
            with codecs.open(self.args.test_file[i], 'r', 'utf-8') as f:
                test_lines0 = f.readlines()
                test_lines0 = test_lines0[0:2000]
            self.test_lines.append(test_lines0)

        for i in range(self.file_num):
            dev_features0, dev_labels0 = utils.read_corpus(self.dev_lines[i])
            test_features0, test_labels0 = utils.read_corpus(
                self.test_lines[i])

            self.dev_features.append(dev_features0)
            self.test_features.append(test_features0)
            self.dev_labels.append(dev_labels0)
            self.test_labels.append(test_labels0)

            if self.args.output_annotation:  # NEW
                test_word0, test_word_tag0 = utils.read_features(
                    self.test_lines[i])
                self.test_word.append(test_word0)
                self.test_word_tag.append(test_word_tag0)
            #print (len(self.test_word), len(self.test_labels))
            if self.args.load_check_point:
                if os.path.isfile(self.args.load_check_point):
                    print("loading checkpoint: '{}'".format(
                        self.args.load_check_point))
                    self.checkpoint_file = torch.load(
                        self.args.load_check_point)
                    self.args.start_epoch = self.checkpoint_file['epoch']
                    self.f_map = self.checkpoint_file['f_map']
                    self.l_map = self.checkpoint_file['l_map']
                    c_map = self.checkpoint_file['c_map']
                    self.in_doc_words = self.checkpoint_file['in_doc_words']
                    # read per-file training data (appended below, as in the else branch)
                    train_features0, train_labels0 = utils.read_corpus(
                        self.lines[i])
                else:
                    print("no checkpoint found at: '{}'".format(
                        self.args.load_check_point))
            else:
                print('constructing coding table')
                train_features0, train_labels0, self.f_map, self.l_map, self.char_count = utils.generate_corpus_char(
                    self.lines[i],
                    self.f_map,
                    self.l_map,
                    self.char_count,
                    c_thresholds=self.args.mini_count,
                    if_shrink_w_feature=False)
            self.train_features.append(train_features0)
            self.train_labels.append(train_labels0)

            self.train_features_tot += train_features0

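        # keep only characters that occur at least mini_count times, then give
        # each surviving character a contiguous index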
        shrink_char_count = [
            k for (k, v) in iter(self.char_count.items())
            if v >= self.args.mini_count
        ]
        self.char_map = {
            shrink_char_count[ind]: ind
            for ind in range(0, len(shrink_char_count))
        }

        self.char_map['<u>'] = len(self.char_map)  # unk for char
        self.char_map[' '] = len(self.char_map)  # concat for char
        self.char_map['\n'] = len(self.char_map)  # eof for char

        f_set = {v for v in self.f_map}
        dt_f_set = f_set
        self.f_map = utils.shrink_features(self.f_map, self.train_features_tot,
                                           self.args.mini_count)
        l_set = set()

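        # accumulate the token and label vocabularies across every corpus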
        for i in range(self.file_num):
            dt_f_set = functools.reduce(
                lambda x, y: x | y, map(lambda t: set(t),
                                        self.dev_features[i]), dt_f_set)
            dt_f_set = functools.reduce(
                lambda x, y: x | y, map(lambda t: set(t),
                                        self.test_features[i]), dt_f_set)

            l_set = functools.reduce(lambda x, y: x | y,
                                     map(lambda t: set(t), self.dev_labels[i]),
                                     l_set)
            l_set = functools.reduce(
                lambda x, y: x | y, map(lambda t: set(t), self.test_labels[i]),
                l_set)

        if not self.args.rand_embedding:
            print("feature size: '{}'".format(len(self.f_map)))
            print('loading embedding')
            if self.args.fine_tune:  # i.e., do not fine-tune the embeddings
                self.f_map = {'<eof>': 0}
            self.f_map, self.embedding_tensor, self.in_doc_words = utils.load_embedding_wlm(
                self.args.emb_file,
                ' ',
                self.f_map,
                dt_f_set,
                self.args.caseless,
                self.args.unk,
                self.args.word_dim,
                shrink_to_corpus=self.args.shrink_embedding)
            print("embedding size: '{}'".format(len(self.f_map)))

        for label in l_set:

            if label not in self.l_map:
                self.l_map[label] = len(self.l_map)

        print('constructing dataset')
        for i in range(self.file_num):
            # construct dataset
            dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(
                self.train_features[i], self.train_labels[i], self.l_map,
                self.char_map, self.f_map, self.args.caseless)
            dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(
                self.dev_features[i], self.dev_labels[i], self.l_map,
                self.char_map, self.f_map, self.args.caseless)
            test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(
                self.test_features[i], self.test_labels[i], self.l_map,
                self.char_map, self.f_map, self.args.caseless)
            self.dataset_loader.append([
                torch.utils.data.DataLoader(tup,
                                            self.args.batch_size,
                                            shuffle=True,
                                            drop_last=False) for tup in dataset
            ])
            self.dev_dataset_loader.append([
                torch.utils.data.DataLoader(tup,
                                            50,
                                            shuffle=False,
                                            drop_last=False)
                for tup in dev_dataset
            ])
            self.test_dataset_loader.append([
                torch.utils.data.DataLoader(tup,
                                            50,
                                            shuffle=False,
                                            drop_last=False)
                for tup in test_dataset
            ])
Example #10
0

    # load corpus
    if args.test_file:
        with codecs.open(args.test_file, 'r', 'utf-8') as f:
            test_lines = f.readlines()
    else:
        with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
            test_lines = f.readlines()

    # converting format

    test_features, test_labels = utils.read_corpus(test_lines)

    # construct dataset
    test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, jd['caseless'])
    
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]

    # build model
    ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'], jd['char_hidden'], jd['char_layers'], jd['word_dim'], jd['word_hidden'], jd['word_layers'], len(f_map), jd['drop_out'], large_CRF=jd['small_crf'], if_highway=jd['high_way'], in_doc_words=in_doc_words, highway_layers = jd['highway_layers'])

    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(args.gpu)
        ner_model.cuda()
        packer = CRFRepack_WC(len(l_map), True)
    else:
        if_cuda = False
        packer = CRFRepack_WC(len(l_map), False)