Example #1
def bert_service(input, output, lang, prefix, n_col):
    # Read the TSV: column 1 holds the tense label, column n_col the
    # text (assumes n_col > 1, so pandas keeps this column order).
    df = pd.read_csv(input, delimiter='\t', usecols=[1, n_col], header=None)
    df.columns = ['tense', 'text_' + lang]
    X_list = df['text_' + lang].values.tolist()
    Y = np.array(df['tense'])
    # Encode the sentences with the global bert-as-service client `bc`.
    list_vec = bc.encode(X_list)
    # Persist features and labels as blosc-compressed arrays.
    bp.pack_ndarray_to_file(list_vec, output + '/' + prefix + "_X.blp")
    bp.pack_ndarray_to_file(Y, output + '/' + prefix + "_Y.blp")
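The saved .blp files round-trip through bloscpack's unpack call. A minimal reload sketch, assuming the files written above with output='.' and prefix='train' (hypothetical values):

import bloscpack as bp

# Reload the blosc-compressed feature and label arrays.
X = bp.unpack_ndarray_from_file('./train_X.blp')
Y = bp.unpack_ndarray_from_file('./train_Y.blp')
print(X.shape, Y.shape)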
Example #2
def make_SVM_examples(dataID, n_features, prefix="train"):
    n_samples = len(dataID)
    X_matrix = np.zeros((n_samples, n_features))
    Y = []
    id_sample = 0
    for doc_idx in dataID:
        for deptree in zh_trees[doc_idx]:
            # Zero-pad the sentence ID so the document-local part is
            # always the last two digits.
            sentID_str = str(deptree.sentence_id).zfill(3)
            # Ignore the first sentence (the title) of every document.
            if sentID_str[-2:] == "00":
                continue
            xfeatures, y = tree2features(deptree)
            X_matrix[id_sample] = xfeatures
            id_sample += 1
            Y.append(y)
    bp.pack_ndarray_to_file(X_matrix, args.featOutput + '/' + prefix + "_X.blp")
    bp.pack_ndarray_to_file(np.array(Y), args.featOutput + '/' + prefix + "_Y.blp")
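Note that X_matrix gets one row per entry of dataID but is filled per non-title sentence, so the filled count id_sample need not match n_samples. A hedged sketch of a defensive trim just before the two pack calls, reusing the function's own variables:

    # Keep only the rows actually filled; otherwise skipped titles can
    # leave trailing all-zero rows in the packed file.
    X_matrix = X_matrix[:id_sample]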
Example #3
        for player_num in range(0, 4):
            X, Y = generate_training_set(file_name, player_num, history_num=4)
            master_X += X
            master_Y += Y

        # Once enough samples have accumulated, flush the chunk to
        # blosc-compressed files.
        if len(master_X) > 100000:
            import bloscpack as bp
            master_X = np.array(master_X)
            master_Y = np.array(master_Y)
            saving_file_count += len(master_X)
            bp.pack_ndarray_to_file(
                master_X,
                f'processed_data_blp/input_X_{file_count}_{saving_file_count}.nosync.blp'
            )
            bp.pack_ndarray_to_file(
                master_Y,
                f'processed_data_blp/input_Y_{file_count}_{saving_file_count}.nosync.blp'
            )
            file_count += len(master_X)
            print('Saved with the file size', saving_file_count)

            master_X = []
            master_Y = []
            break
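The chunked writes above produce many paired input_X_*/input_Y_* files. A sketch of reassembling them, assuming sorted filenames keep the X and Y chunks aligned:

import glob
import numpy as np
import bloscpack as bp

# Load every X chunk in name order and stack into one array;
# Y chunks are reassembled the same way.
x_files = sorted(glob.glob('processed_data_blp/input_X_*.nosync.blp'))
X = np.concatenate([bp.unpack_ndarray_from_file(f) for f in x_files], axis=0)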
Example #4
    # Bring the arrays into right-side rank order.
    sort_R = np.argsort(rank_R)
    dat_R = dat_L[sort_R]
    lbl_R = lbl_L[sort_R]

    del extracted

    # ------------------------
    # some checks
    assert np.all(lbl_L == lbl_R)
    feat = np.array(feat_L + feat_R)
    dat = np.concatenate([dat_L, dat_R], axis=1)
    lbl = lbl_L

    # ------------------------
    # save and clean up
    bp.pack_ndarray_to_file(
        feat, '../input/trn_feat_g{:d}_w{:d}_fix.bp'.format(i, wndw))
    bp.pack_ndarray_to_file(
        dat, '../input/trn_dat_g{:d}_w{:d}_fix.bp'.format(i, wndw))
    bp.pack_ndarray_to_file(
        lbl, '../input/trn_lbl_g{:d}_w{:d}_fix.bp'.format(i, wndw))

    del feature_extraction_ndcs
    del sgnl_L, sgnl_R
    del feat_L, dat_L, lbl_L, rank_L, sort_L
    del feat_R, dat_R, lbl_R, rank_R, sort_R
    del feat, dat, lbl

    gc.collect()

    elapsed = (datetime.now() - start_time).seconds
    print('total time elapsed {:d} minutes {:d} seconds.'.format(
        elapsed // 60, elapsed % 60))
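A short sketch for reading one group's triple back, assuming the same naming scheme and that i and wndw match the values used at save time (the ones below are hypothetical):

import bloscpack as bp

i, wndw = 0, 11  # hypothetical group index and window size
feat = bp.unpack_ndarray_from_file('../input/trn_feat_g{:d}_w{:d}_fix.bp'.format(i, wndw))
dat = bp.unpack_ndarray_from_file('../input/trn_dat_g{:d}_w{:d}_fix.bp'.format(i, wndw))
lbl = bp.unpack_ndarray_from_file('../input/trn_lbl_g{:d}_w{:d}_fix.bp'.format(i, wndw))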
Example #5
                xfeatures, y = tree2features(deptree)
                X_matrix[id_sample] = xfeatures
                id_sample += 1
                Y.append(y)
    return X_matrix, np.array(Y)


X_train, y_train = make_samples(train_idx, n_train_samples, n_features)
X_dev, y_dev = make_samples(dev_idx, n_dev_samples, n_features)
X_test, y_test = make_samples(test_idx, n_test_samples, n_features)

tsizeMB = sum(i.size * i.itemsize for i in (X_train, X_dev, X_test)) / 2**20.
#blosc_args = bp.DEFAULT_BLOSC_ARGS
#blosc_args['clevel'] = 6
t = time.time()
# blosc_args could be passed to the X_* calls below (see the commented
# lines above).
bp.pack_ndarray_to_file(X_train, '../data/X_train.blp')
bp.pack_ndarray_to_file(y_train, '../data/y_train.blp')
bp.pack_ndarray_to_file(X_dev, '../data/X_dev.blp')
bp.pack_ndarray_to_file(y_dev, '../data/y_dev.blp')
bp.pack_ndarray_to_file(X_test, '../data/X_test.blp')
bp.pack_ndarray_to_file(y_test, '../data/y_test.blp')
print(y_test)
print(type(y_test))
print(y_test.shape)

t1 = time.time() - t
print("store time = %.2f (%.2f MB/s)" % (t1, tsizeMB / t1))
"""
print(X_train.shape)
print(len(y_train))
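The commented-out blosc_args lines in this example hint at compression tuning. A sketch of enabling them in the snippet's own (older bloscpack) style; newer releases expose a BloscArgs class instead:

import bloscpack as bp

# Raise the compression level before packing, as the commented
# lines above suggest (older-API dict style).
blosc_args = bp.DEFAULT_BLOSC_ARGS
blosc_args['clevel'] = 6
bp.pack_ndarray_to_file(X_train, '../data/X_train.blp', blosc_args=blosc_args)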
Example #6
    return X_matrix, np.array(Y)


X_train, y_train = make_samples(train_treebank, len(train_treebank),
                                n_features)
#X_dev,y_dev = make_samples(dev_idx,n_dev_samples,n_features)
X_test, y_test = make_samples(test_treebank, len(test_treebank), n_features)
#no_pres_X_train,no_pres_y_train = make_samples(no_pres_train_treebank,len(no_pres_train_treebank),n_features,Pres=False)
#no_pres_X_test,no_pres_y_test = make_samples(no_pres_test_treebank,len(no_pres_test_treebank),n_features,Pres=False)
#tsizeMB = sum(i.size*i.itemsize for i in (X_train,X_test))/2**20.

#blosc_args = bp.DEFAULT_BLOSC_ARGS
#blosc_args['clevel'] = 6
t = time.time()

bp.pack_ndarray_to_file(y_train, '../data/y_train.blp')
bp.pack_ndarray_to_file(y_test, '../data/y_test.blp')
#print(y_test)
#print(type(y_test))
#print(y_test.shape)

#t1 = time.time() - t
#print("store time = %.2f (%.2f MB/s)" % (t1, tsizeMB/t1))
#t = time.time()
X_train = sp.csr_matrix(X_train)
X_test = sp.csr_matrix(X_test)
sp.save_npz('../data/X_train.npz', X_train)
sp.save_npz('../data/X_test.npz', X_test)
t1 = time.time() - t
print("store time = %.2f" % t1)
Example #7
                              n_features,
                              Pres=True,
                              y_test_Pres=True)
no_pres_X_train, no_pres_y_train = make_samples(no_pres_train_samples,
                                                len(no_pres_train_samples),
                                                n_features,
                                                Pres=False,
                                                y_test_Pres=False)
#no_pres_X_test,no_pres_y_test = make_samples(no_pres_test_treebank,len(no_pres_test_treebank),n_features,Pres=False,y_test_Pres=False)
#tsizeMB = sum(i.size*i.itemsize for i in (X_train,X_test))/2**20.

#blosc_args = bp.DEFAULT_BLOSC_ARGS
#blosc_args['clevel'] = 6
t = time.time()

bp.pack_ndarray_to_file(y_train, '../data/binM_y_train.blp')
bp.pack_ndarray_to_file(y_test, '../data/binM_y_test.blp')
bp.pack_ndarray_to_file(no_pres_y_train, '../data/binM_no_pres_y_train.blp')
#bp.pack_ndarray_to_file(y_test, '../data/p2_y_test.blp')
#print(y_test)
#print(type(y_test))
#print(y_test.shape)

#t1 = time.time() - t
#print("store time = %.2f (%.2f MB/s)" % (t1, tsizeMB/t1))
#t = time.time()
X_train = sp.csr_matrix(X_train)
X_test = sp.csr_matrix(X_test)
sp.save_npz('../data/binM_X_train.npz', X_train)
sp.save_npz('../data/binM_X_test.npz', X_test)
no_pres_X_train = sp.csr_matrix(no_pres_X_train)
Example #8
#-*-coding:utf-8-*-
import numpy as np
import pandas as pd
from bert_serving.client import BertClient
import time
import bloscpack as bp
#import pkuseg

t = time.time()
bc = BertClient()  # requires a running bert-serving server

df_test = pd.read_csv('../data/test.tsv', delimiter='\t',header=0,names=['sentID','text_zh'])
#df_test = pd.read_csv('../data/test.tsv', delimiter='\t',usecols=[0,3],header=None)
#df_test.columns=['sentID','text_zh']
test_X_list = df_test['text_zh'].values.tolist()
test_idx = np.array(df_test['sentID'])
#print(test_X_list[:2])
#print(type(dev_X_list[:2]))

list_vec = bc.encode(test_X_list)
print(list_vec.shape)
#print(test_idx.shape)
bp.pack_ndarray_to_file(list_vec, '../data/fine_bert_test_X.blp')
bp.pack_ndarray_to_file(test_idx, '../data/fine_bert_test_idx.blp')
t1 = time.time() - t
print("conversion time:", t1)
Example #9
    #bp.pack_ndarray_to_file(dat, '../input/tst_dat_neighbour_quantile_g{:d}_w{:d}.bp'.format(i, wndw))

    del feature_extraction_ndcs
    del sgnl_L, sgnl_R
    del feat_L, dat_L, rank_L, sort_L
    del feat_R, dat_R, rank_R, sort_R
    del feat  #, dat

    gc.collect()

    elapsed = (datetime.now() - start_time).seconds
    print('total time elapsed {:d} minutes {:d} seconds.'.format(
        elapsed // 60, elapsed % 60))

bp.pack_ndarray_to_file(
    np.concatenate(tst_dat_collection, 0),
    '../input/tst_dat_neighbour_quantile_all_w{:d}.bp'.format(wndw))

# ==================================================================

trn_dat_collection = []

for i in range(10):
    print('============================')
    print('processing group #{:d}...'.format(i))
    sgnl_ndcs = batch_id_trn[i]
    sgnl = pdf_trn['signal'].iloc[sgnl_ndcs]
    trgt = pdf_trn['open_channels'].iloc[sgnl_ndcs]

    # Pad with NaNs so every original position has a full left/right
    # context window of length wndw.
    sgnl_L = pd.concat([pd.Series([np.nan] * (wndw - 1)), sgnl])
    sgnl_R = pd.concat([sgnl, pd.Series([np.nan] * (wndw - 1))])