def bert_service(input, output, lang, prefix, n_col):
    """Encode one text column of a TSV file and store encodings and labels.

    Columns 1 and ``n_col`` of the header-less, tab-separated file ``input``
    are loaded and named ``tense`` and ``text_<lang>``.  The text column is
    sent to the module-level client ``bc`` (presumably a BERT client -- verify
    against the caller) and two bloscpack files are written:

    * ``<output>/<prefix>_X.blp`` -- the array returned by ``bc.encode``
    * ``<output>/<prefix>_Y.blp`` -- the ``tense`` column as a NumPy array

    Args:
        input: Path of the TSV file to read.
        output: Directory that receives the two ``.blp`` files.
        lang: Language tag used only to build the text column name.
        prefix: File-name prefix of the two output files.
        n_col: Index of the text column inside the TSV file.
    """
    text_column = 'text_' + lang

    frame = pd.read_csv(input, delimiter='\t', usecols=[1, n_col], header=None)
    # NOTE(review): the two names are assigned positionally, so this relies on
    # column 1 coming before column n_col in the loaded frame -- confirm
    # n_col > 1 for every caller.
    frame.columns = ['tense', text_column]

    sentences = frame[text_column].values.tolist()
    labels = np.array(frame['tense'])
    encoded = bc.encode(sentences)

    target_stem = output + '/' + prefix
    bp.pack_ndarray_to_file(encoded, target_stem + "_X.blp")
    bp.pack_ndarray_to_file(labels, target_stem + "_Y.blp")
def make_SVM_examples(dataID, n_features, prefix="train"):
    """Extract features for every non-title sentence and store them on disk.

    For each document index in ``dataID`` the dependency trees found in the
    module-level ``zh_trees[doc_idx]`` are visited.  Trees whose sentence id
    ends in ``"00"`` (after left-padding ids shorter than three characters
    with ``'00'``) are skipped as document titles.  Every remaining tree is
    converted with ``tree2features`` into a feature row and a label.

    The results are written with bloscpack to
    ``<args.featOutput>/<prefix>_X.blp`` (feature matrix, one row per kept
    sentence, ``n_features`` columns) and
    ``<args.featOutput>/<prefix>_Y.blp`` (labels).

    The feature matrix is sized from the number of sentences actually kept.
    Previously it was preallocated with ``len(dataID)`` rows, i.e. one row
    per *document*, which raised ``IndexError`` as soon as the documents
    contained more kept sentences than there were documents, and otherwise
    left trailing all-zero rows that had no matching label.

    Args:
        dataID: Iterable of document indices usable as keys of ``zh_trees``.
        n_features: Number of columns of the feature matrix.
        prefix: File-name prefix of the two output files.
    """
    feature_rows = []
    labels = []

    for doc_idx in dataID:
        for deptree in zh_trees[doc_idx]:
            sent_id = str(deptree.sentence_id)
            if len(sent_id) < 3:
                sent_id = '00' + sent_id

            # ignore the first sentence (the title) of every document
            if sent_id[-2:] == "00":
                continue

            xfeatures, y = tree2features(deptree)

            # Copy into a fresh float row right away: same casting and
            # broadcasting as assigning into a preallocated matrix row, and
            # safe even if tree2features reuses its output buffer.
            row = np.zeros(n_features)
            row[:] = xfeatures
            feature_rows.append(row)
            labels.append(y)

    # reshape keeps the (0, n_features) shape when no sentence was kept.
    X_matrix = np.array(feature_rows, dtype=float).reshape(
        len(feature_rows), n_features)

    bp.pack_ndarray_to_file(
        X_matrix, args.featOutput + '/' + prefix + "_X.blp")
    bp.pack_ndarray_to_file(
        np.array(labels), args.featOutput + '/' + prefix + "_Y.blp")
# NOTE(review): the statements below were collapsed onto one physical line, so
# their original line breaks and indentation are lost; read literally, everything
# after the first '#' on that line is a comment. The nesting is therefore unknown.
# Visible logic: for player_num in 0..3, generate_training_set(file_name,
# player_num, history_num=4) returns (X, Y), which are added to master_X and
# master_Y with +=. When master_X holds more than 100000 entries, both
# accumulators are converted to NumPy arrays, saving_file_count is increased by
# len(master_X), the arrays are written with bloscpack to
# processed_data_blp/input_X_<file_count>_<saving_file_count>.nosync.blp and the
# matching input_Y_... file, file_count is then increased by len(master_X), and
# both accumulators are reset to empty lists.
# The trailing `break` needs an enclosing loop that is not visible here, and it is
# unclear whether the `if` sits inside the player loop -- TODO confirm.
for player_num in range(0, 4): X, Y = generate_training_set(file_name, player_num, history_num=4) # print('initial x',X[0][0]) # print('initial y',Y[0]) # print('shape of X', np.array(X).shape) master_X += X master_Y += Y # print(np.array(master_X).shape) # print('mx',master_X[0][0:2]) # print('my',master_Y[0:2]) if len(master_X) > 100000: import bloscpack as bp master_X = np.array(master_X) master_Y = np.array(master_Y) saving_file_count += len(master_X) bp.pack_ndarray_to_file( master_X, f'processed_data_blp/input_X_{file_count}_{saving_file_count}.nosync.blp' ) bp.pack_ndarray_to_file( master_Y, f'processed_data_blp/input_Y_{file_count}_{saving_file_count}.nosync.blp' ) file_count += len(master_X) print('Saved with the file size', saving_file_count) master_X = [] master_Y = [] break
# NOTE(review): fragment of a larger per-group loop body (uses i and wndw, which
# are defined outside the visible text); line breaks were lost and the final
# print(...) call is cut off.
# Visible logic: sort_R = argsort(rank_R); dat_R and lbl_R are built by indexing
# dat_L and lbl_L with sort_R. NOTE(review): the "right" arrays are derived from
# the "left" ones here -- confirm this is intended.
# After asserting lbl_L == lbl_R element-wise, feat is np.array(feat_L + feat_R),
# dat is dat_L and dat_R concatenated along axis 1, and lbl is lbl_L. The three
# arrays are stored with bloscpack as ../input/trn_{feat,dat,lbl}_g<i>_w<wndw>_fix.bp.
# All temporaries are then deleted and gc.collect() is called before the elapsed
# time since start_time is reported.
# NOTE(review): timedelta.seconds is only the seconds component of the
# difference (days are not included) -- confirm runs stay below 24 hours.
sort_R = np.argsort(rank_R) dat_R = dat_L[sort_R] lbl_R = lbl_L[sort_R] del extracted # ------------------------ # some checks assert np.all(lbl_L == lbl_R) feat = np.array(feat_L + feat_R) dat = np.concatenate([dat_L, dat_R], axis=1) lbl = lbl_L # ------------------------ # save and clean up bp.pack_ndarray_to_file( feat, '../input/trn_feat_g{:d}_w{:d}_fix.bp'.format(i, wndw)) bp.pack_ndarray_to_file( dat, '../input/trn_dat_g{:d}_w{:d}_fix.bp'.format(i, wndw)) bp.pack_ndarray_to_file( lbl, '../input/trn_lbl_g{:d}_w{:d}_fix.bp'.format(i, wndw)) del feature_extraction_ndcs del sgnl_L, sgnl_R del feat_L, dat_L, lbl_L, rank_L, sort_L del feat_R, dat_R, lbl_R, rank_R, sort_R del feat, dat, lbl gc.collect() elapsed = (datetime.now() - start_time).seconds print('total time elapsed {:d} minutes {:d} seconds.'.format(
# NOTE(review): fragment with lost line breaks. It starts inside the body of a
# function (presumably make_samples, which is called right after -- confirm) and
# ends inside a triple-quoted string whose closing quotes are not visible.
# Visible logic of the function tail: the features of deptree are written into
# row id_sample of X_matrix, the label is appended to Y, and
# (X_matrix, np.array(Y)) is returned.
# Visible logic of the script part: train/dev/test sets are built with
# make_samples; tsizeMB is the summed byte size (size * itemsize) of the three
# feature arrays divided by 2**20; the six arrays are written with bloscpack to
# ../data/{X,y}_{train,dev,test}.blp; y_test, its type and its shape are printed;
# finally the elapsed store time and tsizeMB / elapsed are printed.
xfeatures, y = tree2features(deptree) X_matrix[id_sample] = xfeatures id_sample += 1 Y.append(y) return X_matrix, np.array(Y) X_train, y_train = make_samples(train_idx, n_train_samples, n_features) X_dev, y_dev = make_samples(dev_idx, n_dev_samples, n_features) X_test, y_test = make_samples(test_idx, n_test_samples, n_features) tsizeMB = sum(i.size * i.itemsize for i in (X_train, X_dev, X_test)) / 2**20. #blosc_args = bp.DEFAULT_BLOSC_ARGS #blosc_args['clevel'] = 6 t = time.time() bp.pack_ndarray_to_file(X_train, '../data/X_train.blp') #, blosc_args=blosc_args) bp.pack_ndarray_to_file(y_train, '../data/y_train.blp') bp.pack_ndarray_to_file(X_dev, '../data/X_dev.blp') #, blosc_args=blosc_args) bp.pack_ndarray_to_file(y_dev, '../data/y_dev.blp') bp.pack_ndarray_to_file(X_test, '../data/X_test.blp') #, blosc_args=blosc_args) bp.pack_ndarray_to_file(y_test, '../data/y_test.blp') print(y_test) print(type(y_test)) print(y_test.shape) t1 = time.time() - t print("store time = %.2f (%.2f MB/s)" % (t1, tsizeMB / t1)) """ print(X_train.shape) print(len(y_train))
# NOTE(review): fragment with lost line breaks; it starts at the return statement
# of a function whose header is not visible (presumably make_samples -- confirm).
# Visible logic: after returning (X_matrix, np.array(Y)) from that function, the
# script builds the train and test sets from train_treebank / test_treebank,
# passing each treebank's length as the sample count. The label arrays are
# written with bloscpack to ../data/y_train.blp and ../data/y_test.blp. The
# feature matrices are converted to scipy CSR matrices and saved with
# sp.save_npz to ../data/X_train.npz and ../data/X_test.npz. The time measured
# from t (taken just before the label files are written) is printed at the end.
return X_matrix, np.array(Y) X_train, y_train = make_samples(train_treebank, len(train_treebank), n_features) #X_dev,y_dev = make_samples(dev_idx,n_dev_samples,n_features) X_test, y_test = make_samples(test_treebank, len(test_treebank), n_features) #no_pres_X_train,no_pres_y_train = make_samples(no_pres_train_treebank,len(no_pres_train_treebank),n_features,Pres=False) #no_pres_X_test,no_pres_y_test = make_samples(no_pres_test_treebank,len(no_pres_test_treebank),n_features,Pres=False) #tsizeMB = sum(i.size*i.itemsize for i in (X_train,X_test))/2**20. #blosc_args = bp.DEFAULT_BLOSC_ARGS #blosc_args['clevel'] = 6 t = time.time() bp.pack_ndarray_to_file(y_train, '../data/y_train.blp') bp.pack_ndarray_to_file(y_test, '../data/y_test.blp') #print(y_test) #print(type(y_test)) #print(y_test.shape) #t1 = time.time() - t #print("store time = %.2f (%.2f MB/s)" % (t1, tsizeMB/t1)) #t = time.time() X_train = sp.csr_matrix(X_train) X_test = sp.csr_matrix(X_test) sp.save_npz('../data/X_train.npz', X_train) sp.save_npz('../data/X_test.npz', X_test) t1 = time.time() - t print("store time = %.2f " % (t1))
# NOTE(review): fragment with lost line breaks; it begins in the middle of a call
# whose opening is not visible and ends before the last CSR matrix is saved.
# Visible logic: a second sample set (no_pres_X_train, no_pres_y_train) is built
# by make_samples with Pres=False and y_test_Pres=False. y_train, y_test and
# no_pres_y_train are written with bloscpack to ../data/binM_y_train.blp,
# ../data/binM_y_test.blp and ../data/binM_no_pres_y_train.blp. X_train and
# X_test are converted to scipy CSR matrices and saved with sp.save_npz to
# ../data/binM_X_train.npz and ../data/binM_X_test.npz. no_pres_X_train is
# converted to CSR as well; where it is saved is not visible here.
n_features, Pres=True, y_test_Pres=True) no_pres_X_train, no_pres_y_train = make_samples(no_pres_train_samples, len(no_pres_train_samples), n_features, Pres=False, y_test_Pres=False) #no_pres_X_test,no_pres_y_test = make_samples(no_pres_test_treebank,len(no_pres_test_treebank),n_features,Pres=False,y_test_Pres=False) #tsizeMB = sum(i.size*i.itemsize for i in (X_train,X_test))/2**20. #blosc_args = bp.DEFAULT_BLOSC_ARGS #blosc_args['clevel'] = 6 t = time.time() bp.pack_ndarray_to_file(y_train, '../data/binM_y_train.blp') bp.pack_ndarray_to_file(y_test, '../data/binM_y_test.blp') bp.pack_ndarray_to_file(no_pres_y_train, '../data/binM_no_pres_y_train.blp') #bp.pack_ndarray_to_file(y_test, '../data/p2_y_test.blp') #print(y_test) #print(type(y_test)) #print(y_test.shape) #t1 = time.time() - t #print("store time = %.2f (%.2f MB/s)" % (t1, tsizeMB/t1)) #t = time.time() X_train = sp.csr_matrix(X_train) X_test = sp.csr_matrix(X_test) sp.save_npz('../data/binM_X_train.npz', X_train) sp.save_npz('../data/binM_X_test.npz', X_test) no_pres_X_train = sp.csr_matrix(no_pres_X_train)
#-*-coding:utf-8-*-
import numpy as np
import pandas as pd
from bert_serving.client import BertClient
import time
import bloscpack as bp

# Script: encode the sentences of ../data/test.tsv through a running
# bert-serving server and store the encodings together with the sentence ids
# as bloscpack files.  The whole run is timed and the duration is printed.

t = time.time()
bc = BertClient()

# header=0 together with names= discards the first row of the file and
# replaces it by the two column names given here.
df_test = pd.read_csv(
    '../data/test.tsv',
    delimiter='\t',
    header=0,
    names=['sentID', 'text_zh'],
)

test_X_list = df_test['text_zh'].values.tolist()
test_idx = np.array(df_test['sentID'])

list_vec = bc.encode(test_X_list)
print(list_vec.shape)

# Encodings first, then the matching sentence ids.
bp.pack_ndarray_to_file(list_vec, '../data/fine_bert_test_X.blp')
bp.pack_ndarray_to_file(test_idx, '../data/fine_bert_test_idx.blp')

t1 = time.time() - t
print("#conversion time: ", t1)
# NOTE(review): fragment with lost line breaks; it starts at the end of one
# per-group loop body and stops inside the next one. Read literally, the whole
# line is a comment because it begins with '#'.
# Visible logic, first part: per-group temporaries are deleted, gc.collect() is
# called and the elapsed time since start_time is printed as minutes and seconds.
# All entries of tst_dat_collection are then concatenated along axis 0 and
# written with bloscpack to ../input/tst_dat_neighbour_quantile_all_w<wndw>.bp.
# Visible logic, second part: trn_dat_collection is reset to an empty list and a
# loop over 10 groups begins. For group i the rows batch_id_trn[i] of the
# 'signal' and 'open_channels' columns of pdf_trn are selected by position.
# sgnl_L is the signal with wndw - 1 NaN values prepended; sgnl_R is the signal
# with wndw - 1 NaN values appended.
#bp.pack_ndarray_to_file(dat, '../input/tst_dat_neighbour_quantile_g{:d}_w{:d}.bp'.format(i, wndw)) del feature_extraction_ndcs del sgnl_L, sgnl_R del feat_L, dat_L, rank_L, sort_L del feat_R, dat_R, rank_R, sort_R del feat #, dat gc.collect() elapsed = (datetime.now() - start_time).seconds print('total time elapsed {:d} minutes {:d} seconds.'.format( elapsed // 60, elapsed % 60)) bp.pack_ndarray_to_file( np.concatenate(tst_dat_collection, 0), '../input/tst_dat_neighbour_quantile_all_w{:d}.bp'.format(wndw)) # ================================================================== trn_dat_collection = [] for i in range(10): print('============================') print('processing group #{:d}...'.format(i)) sgnl_ndcs = batch_id_trn[i] sgnl = pdf_trn['signal'].iloc[sgnl_ndcs] trgt = pdf_trn['open_channels'].iloc[sgnl_ndcs] sgnl_L = pd.concat([pd.Series([np.nan] * (wndw - 1)), sgnl]) sgnl_R = pd.concat([sgnl, pd.Series([np.nan] * (wndw - 1))])