Example #1
def main():
    fdir = config.data_dir
    train_data, _ = data_load.load_dataset(f'{fdir}/train/in.txt',
                                           f'{fdir}/train/out.txt')
    test_data, _ = data_load.load_dataset(f'{fdir}/test/in.txt',
                                          f'{fdir}/test/out.txt')
    print(cal_total_max_len([train_data, test_data]))
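This snippet relies on the project's `config` and `data_load` modules, and `cal_total_max_len` is not shown. Below is a minimal sketch of what such a helper presumably computes, assuming `load_dataset` returns a list of token sequences (hypothetical, not the project's implementation):

def cal_total_max_len(datasets):
    # Hypothetical sketch: length of the longest sequence across all datasets,
    # assuming each dataset is an iterable of token sequences.
    return max(len(seq) for dataset in datasets for seq in dataset)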
Example #2
File: train.py  Project: zxlzr/CoupletAI
from torch.utils.data import DataLoader, TensorDataset


def init_dataset(seq_path, tag_path, word_to_ix, max_seq_len, batch_size):
    seqs, tags = load_dataset(seq_path, tag_path)
    seqs, masks, tags = create_dataset(seqs, tags, word_to_ix, max_seq_len,
                                       word_to_ix['[PAD]'])
    extended_attention_mask = create_attention_mask(masks)
    dataset = TensorDataset(seqs, extended_attention_mask, tags)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
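A usage sketch (assumed, not part of train.py) showing how the returned DataLoader would typically be consumed; `config` and `load_vocab` are the project helpers that appear in Example #5:

# Usage sketch: build the loader and iterate over batches.
word_to_ix = load_vocab(f'{config.data_dir}/vocabs')
loader = init_dataset(f'{config.data_dir}/train/in.txt',
                      f'{config.data_dir}/train/out.txt',
                      word_to_ix, max_seq_len=32, batch_size=64)
for seqs, attention_mask, tags in loader:
    pass  # each item is one shuffled batch drawn from the TensorDataset above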
Example #3
from sklearn.model_selection import StratifiedKFold


def process_data(hp):
    tokenizer = create_tokenizer_from_hub_module(hp)

    train = load_dataset('./input_data/train.csv')
    test = load_dataset('./input_data/test.csv')
    # train = train.sample(5000)
    # test = test.sample(5000)

    sfolder = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
    for kfold_num, (train_idx, eval_idx) in enumerate(
            sfolder.split(train[hp.DATA_COLUMN], train[hp.polarity])):
        # Use the InputExample class from BERT's run_classifier code to create examples from the data
        train_InputExamples = train.loc[train_idx].apply(
            lambda x: run_classifier_custom.InputExample(
                # guid: globally unique ID for bookkeeping, unused in this example
                guid=None,
                text_a=x[hp.DATA_COLUMN],
                selected_text=x[hp.selected_text],
                text_b=x[hp.sentiment],
                sentiment=x[hp.polarity]),
            axis=1)

        eval_InputExamples = train.loc[eval_idx].apply(
            lambda x: run_classifier_custom.InputExample(
                guid=None,
                text_a=x[hp.DATA_COLUMN],
                selected_text=x[hp.selected_text],
                text_b=x[hp.sentiment],
                sentiment=x[hp.polarity]),
            axis=1)
        break  # only the first of the five folds is used for the train/eval split

    # print(tokenizer.tokenize("This here's an example of using the BERT tokenizer"))

    # Convert our train and test features to InputFeatures that BERT understands.
    train_features = run_classifier_custom.convert_examples_to_features(
        train_InputExamples, hp.MAX_SEQ_LENGTH, tokenizer, is_predicting=False)
    eval_features = run_classifier_custom.convert_examples_to_features(
        eval_InputExamples, hp.MAX_SEQ_LENGTH, tokenizer, is_predicting=False)

    return train_features, eval_features
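An illustration (independent of the project) of the StratifiedKFold pattern used above: split yields stratified train/eval index arrays, and taking only the first yield mirrors the `break` in process_data:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(10)
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
sfolder = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
train_idx, eval_idx = next(sfolder.split(X, y))
# 8 training indices and 2 evaluation indices, with the class ratio preserved.
print(train_idx, eval_idx)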
Example #4
def process_test_data(hp):
    tokenizer = create_tokenizer_from_hub_module(hp)
    test = load_dataset('./input_data/test.csv')
    test_InputExamples = test.apply(
        lambda x: run_classifier_custom.InputExample(guid=None,
                                                     text_a=x[hp.DATA_COLUMN],
                                                     selected_text=None,
                                                     text_b=x[hp.sentiment],
                                                     sentiment=x[hp.polarity]),
        axis=1)
    test_features = run_classifier_custom.convert_examples_to_features(
        test_InputExamples, hp.MAX_SEQ_LENGTH, tokenizer, is_predicting=True)
    return test_features
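A toy illustration (not from the project) of the DataFrame.apply(axis=1) pattern used in Examples #3 and #4: each row is turned into one example object, and the result is a pandas Series of those objects. `ToyExample` below is a hypothetical stand-in for run_classifier_custom.InputExample:

import pandas as pd
from dataclasses import dataclass

@dataclass
class ToyExample:  # stand-in for run_classifier_custom.InputExample
    text_a: str
    text_b: str

df = pd.DataFrame({'text': ['good movie', 'bad plot'],
                   'sentiment': ['positive', 'negative']})
examples = df.apply(lambda x: ToyExample(text_a=x['text'], text_b=x['sentiment']),
                    axis=1)
print(examples.iloc[0])  # ToyExample(text_a='good movie', text_b='positive')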
Example #5
    """Convert mask to attention mask.
    """
    extended_attention_mask = raw_mask.unsqueeze(1).unsqueeze(2)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask.float()


def create_transformer_attention_mask(raw_mask: torch.Tensor) -> torch.Tensor:
    """Convert mask to transformer attention mask.
    """
    return (1 - raw_mask).bool()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", default='tensor_dataset', type=str)
    parser.add_argument("--max_len", default=32, type=int)

    args = parser.parse_args()
    seq_path = f'{config.data_dir}/train/in.txt'
    tag_path = f'{config.data_dir}/train/out.txt'
    vocab_path = f'{config.data_dir}/vocabs'
    max_seq_len = args.max_len

    word_to_ix = load_vocab(vocab_path)
    vocab_size = len(word_to_ix)
    seqs, tags = load_dataset(seq_path, tag_path)
    seqs, masks, tags = create_dataset(seqs, tags, word_to_ix, max_seq_len,
                                       word_to_ix['[PAD]'])
    save_dataset(seqs, masks, tags, args.dir)
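A small demonstration (not from the repository) of what create_attention_mask produces for a toy padding mask: real-token positions stay at 0 and padded positions become -10000, in a shape that broadcasts over attention scores of shape [batch, heads, seq_len, seq_len]:

import torch

raw_mask = torch.tensor([[1, 1, 1, 0, 0]])   # 1 = real token, 0 = padding
ext = raw_mask.unsqueeze(1).unsqueeze(2)     # shape [1, 1, 1, 5]
ext = (1.0 - ext) * -10000.0
print(ext.float())  # real tokens stay at 0 (printed as -0.), padding becomes -10000.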