def save_pred_as_submit_format(pred_path, output_file, col_name=('ID', "TARGET")): print ('writing prediction as submission format') print ('read prediction <{}>'.format(pred_path)) pred = paratext.load_csv_to_pandas(pred_path, allow_quoted_newlines=True).values #(((test.mean(1) - test.mean(1).mean())/test.mean(1).std()/100. + 0.5).values + pred)/2.0 submission = pd.read_csv(INPUT_PATH+SUBMIT_FORMAT) submission[col_name[1]] = pred submission.to_csv( output_file, columns = col_name, index = None ) print ('done writing') return
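# A minimal usage sketch; the file names below are hypothetical, and TEMP_PATH
# is assumed to be configured like the other path globals:
if __name__ == '__main__':
    save_pred_as_submit_format(TEMP_PATH + 'v1_stage1_test.csv',
                               TEMP_PATH + 'submission.csv')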
import os
import timeit

import paratext

# CSVParser, BaseParser, callback, and READ_CHUNK_SIZE come from the
# surrounding benchmark module.

def run(parallel=False, pool_size=-1, base_parser=BaseParser.pandas):
    file_name = '../../../ATTIC/data/lineitem.csv'
    expected_row_count = 6001216
    # file_name = '../../../ATTIC/data/customer.csv'
    # expected_row_count = 150001
    file_size = os.path.getsize(file_name)

    if base_parser is not BaseParser.paratext:
        # Stream the file through CSVParser in fixed-size chunks.
        parser = CSVParser(callback, parallel, pool_size, base_parser)
        start_time = timeit.default_timer()
        with open(file_name) as f:
            while True:
                chunk = f.read(READ_CHUNK_SIZE)
                if not chunk:
                    break
                parser.pump(chunk)
        parser.close()
        row_count = parser.line_count
        stop_time = timeit.default_timer()
    else:
        # paratext reads and parses the whole file itself, multi-threaded.
        start_time = timeit.default_timer()
        df = paratext.load_csv_to_pandas(file_name, num_threads=pool_size)
        stop_time = timeit.default_timer()
        row_count = len(df) + 1  # +1 for the header row

    elapsed = stop_time - start_time
    print("Time: {}, Rows: {}, Size: {}, MB/Sec {}".format(
        elapsed, row_count, file_size, (file_size / elapsed) / 1000 / 1000))
    assert expected_row_count == row_count
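# A hedged driver for the benchmark above; the pool size and parser choices
# are illustrative, and BaseParser.paratext exercises the whole-file path:
if __name__ == '__main__':
    run(parallel=False, pool_size=-1, base_parser=BaseParser.pandas)
    run(parallel=True, pool_size=8, base_parser=BaseParser.paratext)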
import pandas as pd
import paratext

# PATH, TEMP_PATH, and INPUT_PATH are module-level path globals.

def load_data(flist, drop_duplicates=False):
    '''
    Usage: set train, target, and test keys and their feature files.

    FEATURE_LIST_stage2 = {
        'train': (
            TEMP_PATH + 'v1_stage1_all_fold.csv',
            TEMP_PATH + 'v2_stage1_all_fold.csv',
            TEMP_PATH + 'v3_stage1_all_fold.csv',
        ),  # target is not in 'train'
        'target': (
            INPUT_PATH + 'target.csv',
        ),  # target is in 'target'
        'test': (
            TEMP_PATH + 'v1_stage1_test.csv',
            TEMP_PATH + 'v2_stage1_test.csv',
            TEMP_PATH + 'v3_stage1_test.csv',
        ),
    }
    '''
    if (len(flist['train']) == 0) or (len(flist['target']) == 0) or (len(flist['test']) == 0):
        raise Exception("Each of 'train', 'target', and 'test' must list at least one file.")

    X_train = pd.DataFrame()
    test = pd.DataFrame()

    print('Reading train dataset')
    # Each train file holds one block of features; concatenate them column-wise.
    for i in flist['train']:
        X_train = pd.concat([X_train,
                             paratext.load_csv_to_pandas(PATH + i, allow_quoted_newlines=True)],
                            axis=1)
    print('train dataset is created')

    print('Reading target data')
    y_train = paratext.load_csv_to_pandas(PATH + flist['target'][0],
                                          allow_quoted_newlines=True)['target']

    print('Reading test dataset')
    for i in flist['test']:
        test = pd.concat([test,
                          paratext.load_csv_to_pandas(PATH + i, allow_quoted_newlines=True)],
                         axis=1)
    #del test['t_id']
    #print(X_train.columns)
    #print(test.columns)

    # train and test must expose the same feature columns in the same order.
    assert all(X_train.columns == test.columns)
    print('train shape :{}'.format(X_train.shape))

    if drop_duplicates:
        # delete identical columns
        unique_col = X_train.T.drop_duplicates().T.columns
        X_train = X_train[unique_col]
        test = test[unique_col]
        assert all(X_train.columns == test.columns)
        print('train shape after concat and drop_duplicates :{}'.format(X_train.shape))

    # drop constant features
    #X_train = X_train.loc[:, (X_train != X_train.ix[0]).any()]
    #test = test.loc[:, (test != test.ix[0]).any()]
    #common_col = list(set(X_train.columns.tolist()) & set(test.columns.tolist()))
    #X_train = X_train[common_col]
    #test = test[common_col]
    #print('shape after dropping constant features: {}'.format(X_train.shape))

    return X_train, y_train, test
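# A usage sketch following the docstring above; the entries are the stage-2
# feature files it describes, and PATH is assumed to be set so that
# PATH + <entry> resolves to each CSV:
FEATURE_LIST_stage2 = {
    'train': (TEMP_PATH + 'v1_stage1_all_fold.csv',),
    'target': (INPUT_PATH + 'target.csv',),
    'test': (TEMP_PATH + 'v1_stage1_test.csv',),
}
X_train, y_train, test = load_data(FEATURE_LIST_stage2, drop_duplicates=True)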
# Scikit learn
import sklearn as sk
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import (precision_score, recall_score, confusion_matrix,
                             classification_report, accuracy_score, f1_score)

# IO
import paratext


# ## Read Data

# In[125]:

print('Reading data')
#df_all = pd.read_csv('./casted_data_norm.csv', encoding='utf-8')
df_all = paratext.load_csv_to_pandas('data/casted_data_norm.csv', in_encoding='utf-8')
print('(Read data) End')


# In[3]:

df_all = df_all.assign(RSP_FLG_N=1 - df_all.RSP_FLG)
print(df_all.shape)
df_all.head()


# In[4]:

print(len(df_all.columns.values) - 4)  # 600 = 12 * 50


# In[5]:
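# A minimal sketch, assuming the remaining columns of df_all are features and
# RSP_FLG is the binary target, of how the imported train_test_split would
# carve out a held-out evaluation set:
feature_cols = [c for c in df_all.columns if c not in ('RSP_FLG', 'RSP_FLG_N')]
X_tr, X_te, y_tr, y_te = train_test_split(
    df_all[feature_cols], df_all.RSP_FLG, test_size=0.2, random_state=0)
print(X_tr.shape, X_te.shape)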