def prepare_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True, \
                         to_lower=True, sort_by_len=False, vocab_path=None, score_index=3):
    assert len(datapaths) == 3, "data paths should include train, dev and test path"
    (train_x, train_y), (dev_x, dev_y), (test_x, test_y), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)

    X_train, y_train, mask_train = padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = padding_sentence_sequences(dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = padding_sentence_sequences(test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' % 
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None
    
    return (X_train, Y_train, mask_train), (X_dev, Y_dev, mask_dev), (X_test, Y_test, mask_test), vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
def prepare_sentence_data(datapath, vocab_path, embedding_path=None, embedding='glove', embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True, to_lower=True, sort_by_len=False):

    assert len(datapath) == 1, "data paths should include train, dev and test path"
    (train_x, train_y, train_prompts), vocab, overal_maxlen, overal_maxnum = reader.get_data(datapath, vocab_path, prompt_id, vocab_size, tokenize_text=True, to_lower=True, sort_by_len=False)
        
        

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    
        
    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    
    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    #######################################
    #这里我改了   prompt_id ->train_prompts
    #######################################
    if prompt_id ==-1:
        Y_train = reader.get_model_friendly_scores(y_train, train_prompts)
        scaled_train_mean = Y_train.mean()
    elif prompt_id!=-1:
        Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
        scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, mask_train), vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
示例#3
0
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True, \
                         to_lower=True, sort_by_len=False, vocab_path=None, score_index=6):

    assert len(
        datapaths) == 4, "data paths should include train, dev and test path"
    (train_x, train_y, train_prompts, train_ids), (dev_x, dev_y, dev_prompts, dev_ids), (test_x, test_y, test_prompts, test_ids), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)

    train_d, max_sentnum = reader.read_description(datapaths[3],
                                                   vocab,
                                                   len(train_x),
                                                   tokenize_text=True,
                                                   to_lower=True)
    dev_d, max_sentnum = reader.read_description(datapaths[3],
                                                 vocab,
                                                 len(dev_x),
                                                 tokenize_text=True,
                                                 to_lower=True)
    test_d, max_sentnum = reader.read_description(datapaths[3],
                                                  vocab,
                                                  len(test_x),
                                                  tokenize_text=True,
                                                  to_lower=True)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(
        train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(
        dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(
        test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    D_train, mask_d_train = utils.padding_des_sequences(train_d,
                                                        max_sentnum,
                                                        overal_maxlen,
                                                        post_padding=True)
    D_dev, mask_d_dev = utils.padding_des_sequences(dev_d,
                                                    max_sentnum,
                                                    overal_maxlen,
                                                    post_padding=True)
    D_test, mask_d_test = utils.padding_des_sequences(test_d,
                                                      max_sentnum,
                                                      overal_maxlen,
                                                      post_padding=True)

    if prompt_id:
        train_pmt = np.array(train_prompts, dtype='int32')
        dev_pmt = np.array(dev_prompts, dtype='int32')
        test_pmt = np.array(test_prompts, dtype='int32')

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())

    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info(
        '  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
        (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab,
                                                 embedd_dict,
                                                 embedd_dim,
                                                 logger,
                                                 caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, D_train, mask_train, train_ids), (X_dev, Y_dev, D_dev, mask_dev, dev_ids), (X_test, Y_test, D_test ,mask_test, test_ids), \
            vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, max_sentnum, scaled_train_mean