示例#1
0
def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average", batch_size=0) -> None:
    """
    Read the input files and write the vectors to the output files
    :param elmo: ELMo embedder
    :param infile: input files for the sentences
    :param outfile: output vector files (a pickled list, one entry per sentence)
    :param mode: the mode of elmo vectors, forwarded to ``parse_sentence``
    :param batch_size: number of sentences per batch; any value below 1
        embeds one sentence at a time instead of batching
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    # The output is still opened before the (expensive) embedding so that an
    # unwritable path fails early; the context manager guarantees the handle
    # is closed even if embedding or pickling raises.
    with open(outfile, 'wb') as f:
        all_sents = [inst.input.words for inst in insts]
        if batch_size < 1:  # Not using batch
            elmo_outputs = (elmo.embed_sentence(sent) for sent in all_sents)
        else:  # Batched prediction
            elmo_outputs = elmo.embed_sentences(all_sents, batch_size=batch_size)

        # Both branches produce one ELMo output per sentence, consumed lazily
        # so the progress bar advances as each sentence is embedded.
        all_vecs = [
            parse_sentence(elmo_vecs, mode=mode)
            for elmo_vecs in tqdm(elmo_outputs, desc="Elmo Embedding", total=len(all_sents))
        ]

        print("Finishing embedding ELMo sequences, saving the vector files.")
        pickle.dump(all_vecs, f)
示例#2
0
def read_parse_write(tokenizer: BertTokenizer, bert_client: BertClient,
                     infile: str, outfile: str, mode) -> None:
    """
    Read the input files and write the vectors to the output files
    :param tokenizer: BertTokenizer used to split words into word pieces
    :param bert_client: BertClient
    :param infile: input files for the sentences
    :param outfile: output vector files (a pickled list, one array per sentence)
    :param mode: the mode of bert word piece, forwarded to ``bert_tokenize_words``
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    # The context manager closes the output file even if encoding or pickling
    # raises; the file is still opened before the encoding loop, as before.
    with open(outfile, 'wb') as f:
        all_vecs = []
        all_sents = [inst.input.words for inst in insts]
        for sent in tqdm(all_sents, desc="BERT encoding"):
            word_piece_tokens, word_to_piece_index = bert_tokenize_words(tokenizer,
                                                                         sent,
                                                                         mode=mode)
            # Encode a single pre-tokenized sentence, drop the batch axis, then
            # strip the first and last rows to exclude the [CLS] and [SEP].
            bert_vec = np.squeeze(bert_client.encode([word_piece_tokens],
                                                     is_tokenized=True),
                                  axis=0)[1:-1, :]
            # Keep only the rows selected by word_to_piece_index
            # (presumably one word-piece row per original word -- confirm
            # against bert_tokenize_words).
            bert_vec = bert_vec[word_to_piece_index, :]
            print(bert_vec.shape)
            all_vecs.append(bert_vec)

        print("Finishing embedding BERT sequences, saving the vector files.")
        pickle.dump(all_vecs, f)
示例#3
0
def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average") -> None:
    """
    Read the input files and write the vectors to the output files
    :param elmo: ELMo embedder
    :param infile: input files for the sentences
    :param outfile: output vector files (a pickled list, one entry per sentence)
    :param mode: the mode of elmo vectors, forwarded to ``parse_sentence``
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    # The context manager closes the output file even if embedding or pickling
    # raises; the file is still opened before the embedding loop, as before.
    with open(outfile, 'wb') as f:
        # No pos_tags argument is passed here: this is the variant for the
        # model without the additional embedding for materials.
        all_vecs = [
            parse_sentence(elmo, inst.input.words, mode=mode)
            for inst in tqdm(insts)
        ]
        print("Finishing embedding ELMo sequences, saving the vector files.")
        pickle.dump(all_vecs, f)
示例#4
0
def read_parse_write(bert: DistilBertModel,
                     bert_path: str,
                     infile: str,
                     outfile: str,
                     mode: str = "average",
                     batch_size=0) -> None:
    """
    Read the input files and write the vectors to the output files
    :param bert: Bert embedder
    :param bert_path: path handed to ``CustomDataset`` together with the sentences
    :param infile: input files for the sentences
    :param outfile: output vector files (a pickled list, one array per sentence)
    :param mode: the mode of the vectors, forwarded to ``parse_sentence``
    :param batch_size: sentences per batch; values below 1 are raised to 1
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    # The context manager closes the output file even if the forward pass or
    # pickling raises; the file is still opened before the model runs, as before.
    with open(outfile, 'wb') as f:
        all_vecs = []
        all_sents = [inst.input.words for inst in insts]

        dataset = CustomDataset(all_sents, bert_path)

        batch_size = max(1, batch_size)  # make sure batch_size is gt 0
        dataloader = DataLoader(dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4)

        # Move the model to the target device once instead of on every batch.
        bert = bert.cuda() if CUDA else bert

        with torch.no_grad():
            for batch, n_pads in tqdm(dataloader):
                batch = batch.cuda() if CUDA else batch

                bert_batch_vecs = bert(batch)[0].cpu().numpy()
                vectors = parse_sentence(bert_batch_vecs, mode=mode)
                padded_len = vectors.shape[1]
                for j in range(vectors.shape[0]):
                    # Use an explicit end index: the previous `:-n_pads[j]`
                    # slice collapsed to `:0` (an empty array) whenever a
                    # sentence had no padding. Clamp at 0 so an oversized pad
                    # count still yields an empty slice, as it did before.
                    seq_len = max(0, padded_len - int(n_pads[j]))
                    all_vecs.append(vectors[j, :seq_len, :])

        print("Finishing embedding Bert sequences, saving the vector files.")
        pickle.dump(all_vecs, f)
示例#5
0
def main():
    """
    Entry point with two consecutive stages.

    Stage 1 imports each task module listed in ``TASKS`` by name, builds a
    character index shared across the tasks, and reads the word, character
    and (for ``'ner'``) list-based data of every task. None of the arrays
    read in this stage are used later in this function.

    Stage 2 parses the command-line options, reads the train/dev/test
    instances, prepares labels, vocabulary and embeddings on the config, and
    then either trains or tests the model depending on ``opt.mode``.
    """
    TASKS = ['ner_german', 'ner']
    USE_DEV = True

    # ---- Stage 1: per-task data -------------------------------------------
    # Union of the characters reported by every task's own character index.
    char_set = set()
    for task_name in TASKS:
        module = __import__(task_name)
        task_char_index, _ = module.create_char_index(
            [module.TRAIN_DATA, module.DEV_DATA, module.TEST_DATA])
        char_set.update(key for key, _ in task_char_index.items())
    # Assign consecutive ids in the set's iteration order.
    char_index = {char: idx for idx, char in enumerate(char_set)}

    for task_name in TASKS:
        module = __import__(task_name)
        # For the 'ner' task the TEST_DATA split is stacked onto the training
        # arrays, while DEV_DATA is read into the t*-prefixed variables.
        merge_extra_split = USE_DEV and task_name == 'ner'

        word_index, word_cnt = module.create_word_index(
            [module.TRAIN_DATA, module.DEV_DATA, module.TEST_DATA])

        wx, y, m = module.read_data(module.TRAIN_DATA, word_index)
        if merge_extra_split:
            extra_wx, extra_y, extra_m = module.read_data(
                module.TEST_DATA, word_index)
            wx = np.vstack((wx, extra_wx))
            y = np.vstack((y, extra_y))
            m = np.vstack((m, extra_m))
        twx, ty, tm = module.read_data(module.DEV_DATA, word_index)

        x, cm = module.read_char_data(module.TRAIN_DATA, char_index)
        if merge_extra_split:
            extra_x, extra_cm = module.read_char_data(
                module.TEST_DATA, char_index)
            x = np.vstack((x, extra_x))
            cm = np.vstack((cm, extra_cm))
        tx, tcm = module.read_char_data(module.DEV_DATA, char_index)

        if task_name != 'ner':
            gaze, tgaze = None, None
        else:
            list_prefix = module.read_list()
            gaze = module.read_list_data(module.TRAIN_DATA, list_prefix)
            tgaze = module.read_list_data(module.DEV_DATA, list_prefix)
            if USE_DEV:
                extra_gaze = module.read_list_data(module.TEST_DATA, list_prefix)
                gaze = np.vstack((gaze, extra_gaze))

    # ---- Stage 2: LSTM-CRF train / test -----------------------------------
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)
    # NOTE(review): the attribute name `train_target_file_file` looks like it
    # may be a typo -- confirm against the Config class.
    trains_target = reader.read_txt(conf.train_target_file_file,
                                    conf.train_num, True)

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # Only the train call's return value is kept, as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file + vec_suffix, trains)
        reader.load_elmo_vec(conf.dev_file + vec_suffix, devs)
        reader.load_elmo_vec(conf.test_file + vec_suffix, tests)

    for split in (trains, devs, tests):
        conf.use_iobes(split)
    conf.build_label_idx(trains)
    conf.use_iobes(trains_target)
    conf.build_label_idx_target(trains_target)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    ids_train, ids_dev, ids_test = [
        conf.map_insts_ids(split) for split in (trains, devs, tests)
    ]

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode != "train":
        # Any mode other than "train" goes through test_model.
        test_model(conf, tests)
    else:
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)

    print(opt.mode)