def read_feather(path, nthreads=1):
    """
    Load a feather-format object from the file path

    .. versionadded 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

        .. versionadded 0.21.0

    Returns
    -------
    type of object stored in file
    """
    feather = _try_import()
    path = _stringify_path(path)

    if LooseVersion(feather.__version__) < LooseVersion('0.4.0'):
        return feather.read_dataframe(path)

    return feather.read_dataframe(path, nthreads=nthreads)
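# Hedged usage sketch (not part of the original source): pd.read_feather is the
# public pandas entry point wrapping the function above. The file name is made
# up, and nthreads is only forwarded when feather-format >= 0.4.0 is installed
# (the keyword itself belongs to the pandas 0.21-0.23 era shown here).
import pandas as pd

frame = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
frame.to_feather('example.feather')                      # write a feather file
roundtrip = pd.read_feather('example.feather', nthreads=2)
assert frame.equals(roundtrip)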
def _ft(self, tblname, dbname=None, type=None, df=None):
    if type is None:
        type = self.type
    if dbname is None:
        dbname = self.name
    if df is None:
        # return the dataframe if it exists
        df = ft.read_dataframe(
            os.path.expanduser(
                os.path.join(cf.options.basedir, "databases",
                             "{}.{}.{}.ft".format(type, dbname, tblname))
            )
        )
        if "idx" in df.columns.values:
            df.set_index("idx", drop=True, inplace=True)
            df.index.name = None
        return df
    else:
        if not (df.index.dtype_str == "int64") and not (df.empty):
            df = df.copy()
            df["idx"] = df.index
        ft.write_dataframe(
            df,
            os.path.expanduser(
                os.path.join(cf.options.basedir, "databases",
                             "{}.{}.{}.ft".format(type, dbname, tblname))
            ),
        )
        if "idx" in df.columns.values:
            del df
        return
def test_integer_with_nulls(self):
    # pandas requires upcast to float dtype
    path = random_path()
    self.test_files.append(path)

    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
    num_values = 100

    writer = FeatherWriter(path)

    null_mask = np.random.randint(0, 10, size=num_values) < 3

    expected_cols = []
    for name in int_dtypes:
        values = np.random.randint(0, 100, size=num_values)
        writer.write_array(name, values, null_mask)

        expected = values.astype('f8')
        expected[null_mask] = np.nan
        expected_cols.append(expected)

    ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                            columns=int_dtypes)

    writer.close()

    result = feather.read_dataframe(path)
    assert_frame_equal(result, ex_frame)
def test_float_nulls(self):
    num_values = 100

    path = random_path()
    self.test_files.append(path)
    writer = FeatherWriter(path)

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    dtypes = ['f4', 'f8']
    expected_cols = []
    for name in dtypes:
        values = np.random.randn(num_values).astype(name)
        writer.write_array(name, values, null_mask)

        values[null_mask] = np.nan
        expected_cols.append(values)
    writer.close()

    ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
                            columns=dtypes)

    result = feather.read_dataframe(path)
    assert_frame_equal(result, ex_frame)
def mergeFeathers(files, mergedFilename, writeCSV, deleteSource=True):
    data = [feather.read_dataframe(f) for f in files if not f == '']
    if len(data) > 0:
        df = pd.concat(data, sort=False, axis=0, ignore_index=True, copy=False)
    else:
        print('mergeFeathers: No files to merge!')
        return ''
    if writeCSV:
        df.to_csv(mergedFilename)
    else:
        try:
            feather.write_dataframe(df, mergedFilename)
        except:
            print('Error writing merged feather: Trying CSV')
            print(df.shape)
            traceback.print_exc()
            try:
                df.to_csv(mergedFilename.replace('.feather', '.csv'))
            except:
                print('Error writing merged CSV: Writing list of unmerged temp files.')
                with open(mergedFilename.replace('.feather', '.csv'), 'w') as fh:
                    for f in files:
                        fh.write(f + '\n')
                deleteSource = False
    if deleteSource:
        for f in files:
            if not f == '':
                try:
                    os.remove(f)
                except:
                    print('Could not delete merged temp file: %s' % f)
    return mergedFilename
def test_factor_rep():
    fpath1 = util.random_path()
    fpath2 = util.random_path()

    rcode = """
library(feather)

iris <- read_feather("{0}")
iris$Species <- as.factor(as.character(iris$Species))
write_feather(iris, "{1}")
""".format(fpath1, fpath2)

    tmp_paths = []
    try:
        iris = pd.read_csv('iris.csv')
        levels = ['setosa', 'versicolor', 'virginica']
        iris['Species'] = pd.Categorical(iris['Species'], categories=levels)

        feather.write_dataframe(iris, fpath1)
        util.run_rcode(rcode)

        result = feather.read_dataframe(fpath2)
        tmp_paths.extend([fpath1, fpath2])

        assert_frame_equal(result, iris)
    finally:
        util.remove_paths(tmp_paths)
def __getitem__(self, key):
    fn = self._fn_cache[key]
    ret = feather.read_dataframe(fn)
    self._heap_map[key][0] = time.time()
    # ensure the heap invariant
    heapq.heapify(self._heap)
    return ret
def load_df(path):
    if file_format(path) != 'feather':
        return default_csv_loader(path)
    elif featherpmm and feather:
        ds = featherpmm.read_dataframe(path)
        return ds.df
    elif feather:
        return feather.read_dataframe(path)
    else:
        raise Exception('The Python feather module is not installed.\n'
                        'Use:\n    pip install feather-format\n'
                        'to add capability.\n')
def main():
    path = os.path.expanduser(sys.argv[1])
    ratings_df = feather.read_dataframe(path)
    num_ratings = ratings_df.shape[0]
    ratings = np.concatenate(
        (np.array(ratings_df['user_id'], dtype=pd.Series).reshape(num_ratings, 1),
         np.array(ratings_df['item_id'], dtype=pd.Series).reshape(num_ratings, 1),
         np.array(ratings_df['rating'], dtype=pd.Series).reshape(num_ratings, 1)),
        axis=1)
    global_mean = mean(ratings[:, 2])
    np.random.seed(12)
    ratings_tr, ratings_val = train_test_split(ratings, train_size=.7)
    max_iter = int(sys.argv[2])
    to_learn = sys.argv[3]
    num_users = np.unique(ratings[:, 0]).shape[0]
    num_items = np.unique(ratings[:, 1]).shape[0]
    if to_learn == "user_bias_lda":
        lda = learn_bias_lda(ratings_tr, 4, [2, 4, 6, 8, 10], num_users,
                             num_items, global_mean, max_iter)
        print("Best lambda for user bias is %s" % (lda))
    elif to_learn == "item_bias_lda":
        lda = learn_bias_lda(ratings_tr, 4, [2, 4, 6, 8, 10], num_users,
                             num_items, global_mean, max_iter, False)
        print("Best lambda for item bias is %s" % (lda))
    elif to_learn == "user_bias":
        lda = float(sys.argv[4])
        user_bias = get_user_bias(ratings_tr, ratings_val, lda, num_users,
                                  num_items, global_mean, max_iter)
        np.save("user_bias", user_bias)
    elif to_learn == "item_bias":
        lda = float(sys.argv[4])
        item_bias = get_item_bias(ratings_tr, ratings_val, lda, num_users,
                                  num_items, global_mean, max_iter)
        np.save("item_bias", item_bias)
    elif to_learn == "item_bias_fixed_user":
        lda = float(sys.argv[4])
        user_bias = np.load("user_bias.npy")
        tr, val, finalw, finalh = learn_item_bias_from_fixed_user_bias(
            ratings_tr, ratings_val, np.load("user_bias.npy"), num_items, lda,
            global_mean, max_iter)
        print("Final training RMSE %s" % (tr))
        print("Final validation RMSE %s" % (val))
        np.save("item_bias_fixed_user", finalh[1, :].reshape(num_items,))
    elif to_learn == "features":
        lda = float(sys.argv[4])
        rank = int(sys.argv[5])
        user_bias = np.load("user_bias.npy").reshape(num_users, 1)
        item_bias = np.load("item_bias.npy").reshape(1, num_items)
        W, H, reg = create_factors_with_biases(user_bias, item_bias, rank, lda)
        tr, val, finalw, finalh = mf(ratings_tr, ratings_val, W, H, reg,
                                     global_mean, max_iter, 1.0, True)
        print("Final training RMSE %s" % (tr))
        print("Final validation RMSE %s" % (val))
        np.save("final_w", finalw)
        np.save("final_h", finalh)
    elif to_learn == "features-only":
        lda = float(sys.argv[4])
        rank = int(sys.argv[5])
        W, H, reg = create_factors_without_biases(num_users, num_items, rank, lda)
        tr, val, finalw, finalh = mf(ratings_tr, ratings_val, W, H, reg,
                                     global_mean, max_iter, 1.0, True)
        print("Final training RMSE %s" % (tr))
        print("Final validation RMSE %s" % (val))
        np.save("final_w", finalw)
        np.save("final_h", finalh)
def maybe_parse(path):
    feather_file = path + ".feather"
    if os.path.exists(feather_file):
        print("loading %s from cache" % path)
        df = feather.read_dataframe(feather_file)
        df = df.set_index("ut_ms")
        return df
    else:
        print("parsing %s" % path)
        df = parse(path)
        feather.write_dataframe(df.reset_index(), feather_file)
        return df
def _check_pandas_roundtrip(self, df, expected=None):
    path = random_path()
    self.test_files.append(path)
    feather.write_dataframe(df, path)
    if not os.path.exists(path):
        raise Exception('file not written')
    result = feather.read_dataframe(path)
    if expected is None:
        expected = df
    assert_frame_equal(result, expected)
def matchSamples(batchFolder, matchStr='*.feather', test=False):
    """Match each row of the metadata with each feather file (sample) in the batch folder"""
    mDf = pd.read_csv(opj(batchFolder, 'metadata.csv'))
    featherList = glob(opj(batchFolder, matchStr))
    if len(featherList) == 0:
        print('No feather files matching "%s" in "%s"' % (matchStr, batchFolder))
        return {}

    featherLU = {sample_name: [fn for fn in featherList if sample_name in fn]
                 for sample_name in mDf.sample_name}
    fallback = False
    if not len(featherLU) == mDf.shape[0]:
        print('Could not match all samples in the metadata.')
        fallback = True

    L = pd.Series({k: len(v) for k, v in featherLU.items()})
    if not (L == 1).all():
        print('Some samples in metadata matched to >1 feather file:')
        for k, v in featherLU.items():
            if len(v) > 1:
                print('\t%s: %s' % (k, v[:2]))
        fallback = True

    if fallback:
        featherLU = {}
        print('Attempting to use sample order with check on total event count.')
        for i, sample_name in enumerate(mDf.sample_name):
            events = int(sample_name.split('_')[-1])
            fn = [f for f in featherList if 'gs_%d_' % (i + 1) in f][0]
            f = feather.read_dataframe(opj(batchFolder, fn))
            if events == f.shape[0]:
                featherLU.update({sample_name: fn})
                print('Matched %s to %s. (%d of %d)' % (sample_name, fn, i + 1, mDf.shape[0]))
                if test and (i + 1) >= 2:
                    break
            else:
                print('Sample order strategy not working.')
                break
    else:
        featherLU = {k: v[0] for k, v in featherLU.items()}

    if not len(featherLU) == mDf.shape[0]:
        print('Could not match all samples in the metadata.')

    if test:
        out = {}
        i = 0
        for k, v in featherLU.items():
            out.update({k: v})
            i += 1
            if i >= 2:
                break
        featherLU = out
    return featherLU
def mergeSamples(batchFolder, extractionFunc, extractionKwargs, matchStr='*.feather',
                 test=False, metaCols=None, filters=None):
    """Go through each feather file (sample) in a batch folder,
    apply the analysis function, and merge together."""
    mDf = pd.read_csv(opj(batchFolder, 'metadata.csv'))
    featherList = glob(opj(batchFolder, matchStr))

    featherLU = matchSamples(batchFolder, matchStr=matchStr, test=test)

    if not metaCols is None:
        if not 'sample_name' in metaCols:
            metaCols.append('sample_name')
        mDf = mDf[metaCols]
    mDf = mDf.set_index('sample_name')

    feathers = []
    i = 1
    print('Extracting from batch %s (%s)' % (batchFolder, time.ctime()))
    sttime = time.time()
    for sample_name, fn in featherLU.items():
        filterOut = False
        if not filters is None:
            """Keep only samples whose meta data matches all of the filters"""
            filterOut = False
            for col, valList in filters.items():
                if not mDf.loc[sample_name, col] in valList:
                    filterOut = True
                    break
        if not filterOut:
            f = feather.read_dataframe(fn)
            # print('Extracting from sample %s (%d of %d)' % (sample_name, i, len(featherLU)))
            try:
                x = extractionFunc(f, **extractionKwargs)
                x.loc[:, 'sample_name'] = sample_name
            except:
                print('Error extracting from batch %s, sample %s (%d)' % (batchFolder, sample_name, i))
                print(x.shape)
                print(x.head())
                traceback.print_exc()
            feathers.append(x)
            i += 1
    if len(feathers) > 0:
        outDf = pd.merge(pd.concat(feathers, axis=0),
                         mDf.reset_index(),
                         how='left',
                         left_on='sample_name',
                         right_on='sample_name')
        print('Finished batch %s (%1.0f minutes)' % (batchFolder, (time.time() - sttime) / 60), flush=True)

        """Write to a temporary merge file and return filename"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.feather',
                                         prefix='merged_tmp_', dir=batchFolder,
                                         delete=False) as fh:
            tmpFilename = fh.name
        feather.write_dataframe(outDf, tmpFilename)
    else:
        tmpFilename = ''
    return tmpFilename
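# Hedged usage sketch (not part of the original source): one plausible way the
# helpers above chain together. The batch folder names, the toy extraction
# function, and the merged output file name are all hypothetical stand-ins.
import pandas as pd

def countEvents(f):
    # toy extraction function: one summary row per sample dataframe
    return pd.DataFrame({'n_events': [f.shape[0]]})

tmpFiles = [mergeSamples(batchFolder, countEvents, {}, matchStr='*.feather')
            for batchFolder in ['batch_01', 'batch_02']]
merged = mergeFeathers(tmpFiles, 'merged_all.feather', writeCSV=False)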
def main(argv):
    args = _argument_parser().parse_args(argv)
    if args.data_frame is not None and os.path.exists(args.data_frame):
        df = feather.read_dataframe(args.data_frame)
    else:
        from . import parsers
        parser = getattr(parsers, args.format).parser
        print('reading network data')
        network = parser(args.datafile, max_num_nodes=args.max_num_nodes)
        print('extracting data')
        df = network_properties(network,
                                in_degree_threshold=args.in_degree_threshold,
                                pagerank_threshold=args.pagerank_threshold,
                                damping=args.damping)
        if args.data_frame is not None:
            feather.write_dataframe(df, args.data_frame)
    print('preparing plots')
    bokeh_plot(df, output=args.output_file, loglog=args.loglog)
def read_feather(path):
    """
    Load a feather-format object from the file path

    .. versionadded 0.20.0

    Parameters
    ----------
    path : string
        File path

    Returns
    -------
    type of object stored in file
    """
    feather = _try_import()
    return feather.read_dataframe(path)
def _check_pandas_roundtrip(self, df, expected=None, path=None,
                            columns=None, null_counts=None):
    if path is None:
        path = random_path()

    self.test_files.append(path)
    feather.write_dataframe(df, path)
    if not os.path.exists(path):
        raise Exception("file not written")

    result = feather.read_dataframe(path, columns)
    if expected is None:
        expected = df

    assert_frame_equal(result, expected)

    if null_counts is None:
        null_counts = np.zeros(len(expected.columns))

    np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                  null_counts)
def test_boolean_nulls(self):
    # pandas requires upcast to object dtype
    path = random_path()
    self.test_files.append(path)

    num_values = 100
    np.random.seed(0)

    writer = FeatherWriter(path)

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5
    writer.write_array('bools', values, mask)

    expected = values.astype(object)
    expected[mask] = None

    writer.close()

    ex_frame = pd.DataFrame({'bools': expected})

    result = feather.read_dataframe(path)
    assert_frame_equal(result, ex_frame)
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 13 15:50:38 2016

@author: Mia
"""
import pandas as pd
import feather

# Read review data into a pandas DataFrame
# (feather.read_dataframe takes no file-mode argument; the stray 'rb' is dropped)
review_df = feather.read_dataframe('../parsed_data/filtered_review_data.feather')
review_grouped = review_df.groupby(['city'], sort=True).count()
review_cities = review_grouped.sort_values('text', ascending=False)

# Read tip data into a pandas DataFrame
tip_df = feather.read_dataframe('../parsed_data/filtered_tip_data.feather')
tip_grouped = tip_df.groupby(['city'], sort=True).count()
tip_cities = tip_grouped.sort_values('text', ascending=False)
# checking mtv features data
'''
print('Checking mtv features data...')

ok = True

for f in features:
    for name in ['0', '1', 'test', 'rank_0', 'rank_1', 'rank_test']:
        filename = 'features/mtv/%s_pred_%s.npy' % (f, name)
        if not os.path.isfile(filename):
            print(' + Missing %s!' % filename)
            ok = False

if not ok:
    sys.exit(1)
'''

df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather')
df_test = feather.read_dataframe('tmp/clicks_test.feather')

df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1)
df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1)

del df_train_0['fold'], df_train_1['fold'], df_all
gc.collect()

# training a small model to select best features
# first, load the data

df_train = df_train_0[:2000000].copy()
df_val = df_train_1[:1000000].copy()

for f in features:
    print('loading data for %s...' % f)
import feather
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

app_trn = feather.read_dataframe('../input/application_train.feather')


def target_bar(col, title):
    df0 = app_trn[app_trn["TARGET"] == 0]
    df1 = app_trn[app_trn["TARGET"] == 1]
    t0 = df0[col].value_counts().rename(col + '0')
    t1 = df1[col].value_counts().rename(col + '1')
    t = pd.concat([t0, t1], axis=1).fillna(0).astype(int)
    t['total'] = t.sum(axis=1)
    t.sort_values('total', inplace=True, ascending=False)
    t.drop(columns=['total'], inplace=True)

    idx = np.arange(len(t))
    width = 0.35
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(idx, t[col + '0'], width)
    ax.bar(idx + width, t[col + '1'], width)
    ax.set_title(title)  # use the passed-in title rather than a hard-coded placeholder
import os

import feather
import pandas as pd

# Convert Ohio voter data from csv to feather dataframes
data_path = '/Volumes/FileStorage/Insight_data/Ohio_data/data_csv/'
output_path = '/Volumes/FileStorage/Insight_data/Ohio_data/data_feather/'

oh_f_1 = 'SWVF_1_22'
oh_f_2 = 'SWVF_23_44'
oh_f_3 = 'SWVF_45_66'
oh_f_4 = 'SWVF_67_88'

#save2feather(data_path,oh_f_1,output_path)
#save2feather(data_path,oh_f_2,output_path)
#save2feather(data_path,oh_f_3,output_path)
#save2feather(data_path,oh_f_4,output_path)

df_oh1 = feather.read_dataframe(output_path + oh_f_1 + '.feather')
df_oh2 = feather.read_dataframe(output_path + oh_f_2 + '.feather')
df_oh3 = feather.read_dataframe(output_path + oh_f_3 + '.feather')
df_oh4 = feather.read_dataframe(output_path + oh_f_4 + '.feather')

oh_df = pd.concat([df_oh1, df_oh2, df_oh3, df_oh4])
#oh_df = feather.read_dataframe(output_path+oh_f_1+'.feather')
#sub_df = df.iloc[:150]
#feather.write_dataframe(sub_df,output_path+'subset_oh.feather')

# Deduplicate on residential address fields (RESIDENTIAL_ADDRESS1,
# RESIDENTIAL_SECONDARY_ADDR, RESIDENTIAL_CITY, RESIDENTIAL_STATE, RESIDENTIAL_ZIP)
oh_unique = oh_df.drop_duplicates(subset=[
    'RESIDENTIAL_ADDRESS1', 'RESIDENTIAL_CITY', 'RESIDENTIAL_STATE',
    'RESIDENTIAL_ZIP'
])
# reading the leaked documents

docs_size = {}
leak_uuid_dict = {}

with open("tmp/leaked_docs.csv") as f:
    reader = csv.DictReader(f)
    leak_uuid_dict = {}

    for row in reader:
        doc_id = int(row['document_id'])
        uuids = row['uuids'].split(' ')
        leak_uuid_dict[doc_id] = set(uuids)
        docs_size[doc_id] = len(uuids)

df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather')
df_test = feather.read_dataframe('tmp/clicks_test.feather')

# getting user ids and document ids
df_events = pd.read_csv('data/events.csv.zip', usecols=['uuid'])
df_ads = pd.read_csv('data/promoted_content.csv.zip', usecols=['ad_id', 'document_id'])

# joining doc_id and ad_id
ad_to_idx = dict(zip(df_ads.ad_id, df_ads.index))
ad_idx = df_all.ad_id.apply(ad_to_idx.get)
ad_document_id = df_ads.document_id.iloc[ad_idx].reset_index(drop=1)

df_all['ad_document_id'] = ad_document_id
import os

import pandas as pd
import feather

os.getcwd()
fp = os.getcwd().replace("feature_eng", "")

train = feather.read_dataframe(fp + "data/train.feather")

df = pd.pivot_table(
    train,
    values="Demanda_uni_equil",
    index=[
        "Cliente_ID",
        "Producto_ID",
        "Agencia_ID",
        "Canal_ID",
        "Ruta_SAK",
        "Venta_uni_hoy",
        "Venta_hoy",
        "Dev_uni_proxima",
        "Dev_proxima",
    ],
    columns="Semana",
)
df = df.reset_index()

feather.write_dataframe(df, fp + "data/week_split_train.feather")
#!/usr/bin/env python

import pandas as pd
import numpy as np
import random
import pylab as pl
from scipy import optimize
import time
import feather
import csv

df = feather.read_dataframe('../data/seguimiento_audiencias_val.feather')

# Manually fix two small errors
df[['erd3_litigios', 'era3_litigios']] = df[['erd3_litigios', 'era3_litigios']].apply(
    pd.to_numeric, errors='coerce')

df['id'] = df['junta'].map(str) + '_' + df['expediente'].map(str) + '_' + df['anio'].map(str)

# Create two validation functions. For each variable, validate it and then tag
# it with the name of the failing variable if it does not meet the criterion.


def valida_cat(var, rango):
    df[var][(~df[var].isin(rango)) & (~pd.isnull(df[var]))] = var + '_rango'
    return [df[var]]


def valida_na(var):
    df[var][pd.isnull(df[var])] = var + '_na'
try:
    freqs = np.array(p['freqs']) / 3600
except KeyError:
    T = (3600 * 23) if (p['time_stop'] > 3600 * 24) else p['time_stop']
    ff = 1 / T
    fn = 1 / (2 * p['time_step']) if (1 / (2 * p['time_step']) < (4 / 3600)) else (4 / 3600)
    freqs = np.arange(ff, fn, ff)

"""
The wavenumber range is determined from the frequency range.
 - Compute wave directions from the dk resolution at the Nyquist frequency.
"""
depth = np.linspace(0, p['depth_end'], p['depth_res'])
N2 = feather.read_dataframe(p['envfile'])['strat']

K = []
for i in range(len(freqs)):
    iwm = InternalWaveModes(depth, N2, freq=freqs[i])
    K.append([iwm.get_hwavenumber(m) for m in p['modes']])

K = np.array(K).flatten()
print(K)
p['K'] = list(K.real)

# Fundamental wavenumber
dk = K[0].real

headings = []
for kmag in K:
jobs_red = []
pq_red = mp.Queue()

jobs_blue = []
pq_blue = mp.Queue()

# MC calculation
for i in range(Nbins):
    # for i in range(1):

    # simulated data
    inpathSim_whole = inDirSim + "SimCatSelec_tomo" + str(i+1) + '.feather'
    inpathSim_red = inDirSim + "SimCatSelec_tomo" + str(i+1) + '_TB9_in_less3.feather'
    inpathSim_blue = inDirSim + "SimCatSelec_tomo" + str(i+1) + '_TB9_in_greater3.feather'
    #
    dataSim_whole = feather.read_dataframe(inpathSim_whole)
    dataSim_red = feather.read_dataframe(inpathSim_red)
    dataSim_blue = feather.read_dataframe(inpathSim_blue)

    # real data
    inpathReal_whole = inDirReal + 'tomo/all_tomo' + str(i+1) + '.feather'
    inpathReal_red = inDirReal + 'split/all_tomo' + str(i+1) + '_T_B_less3.feather'
    inpathReal_blue = inDirReal + 'split/all_tomo' + str(i+1) + '_T_B_greater3.feather'
    #
    dataReal_whole = feather.read_dataframe(inpathReal_whole)
    dataReal_red = feather.read_dataframe(inpathReal_red)
    dataReal_blue = feather.read_dataframe(inpathReal_blue)

    p_whole = mp.Process(target=mCalFunc,
                         args=(i+1, dataSim_whole, dataReal_whole, Nbin1, Nbin2, pq_whole))
    p_red = mp.Process(target=mCalFunc,
                       args=(i+1, dataSim_red, dataReal_red, Nbin1, Nbin2, pq_red))
    p_blue = mp.Process(target=mCalFunc,
                        args=(i+1, dataSim_blue, dataReal_blue, Nbin1, Nbin2, pq_blue))
def __init__(self, name, comment=None, remove_columns=None, param=None,
             xgb_seed=None, n_estimators=1000, log=None, predict_feats=False,
             debug=True):
    self.name = name
    self.comment = comment
    if log is None:
        self.logfile = open('../output/log/{}.txt'.format(name), 'w')
    else:
        self.logfile = open('../output/log/{}.txt'.format(log), 'w')
    if param is None:
        self.param = {
            'objective': 'reg:linear',
            'metric': 'rmse',
            'booster': 'gbtree',
            'learning_rate': 0.02,
            'max_depth': 22,
            'min_child_weight': 57,
            'gamma': 1.45,
            'alpha': 0.0,
            'lambda': 0.0,
            'subsample': 0.67,
            'colsample_bytree': 0.054,
            'colsample_bylevel': 0.50,
            'n_jobs': -1,
            'random_state': 456,
            'seed': 6  # fixed typo: was 'sead'
            #'verbose': 100,
        }
    else:
        self.param = param
    if xgb_seed is not None:
        self.param['seed'] = xgb_seed
    self.param['n_estimators'] = n_estimators
    self.feature_importance_df = None
    self.regressors = []

    self.x = feather.read_dataframe(BASE_X_PATH)
    self.trn_preds_feats = np.load(TRN_PRED_FEATS)
    self.tes_preds_feats = np.load(TES_PRED_FEATS)

    if remove_columns is not None:
        drop_features = [_f for _f in self.x.columns if _f in remove_columns]
        self.x.drop(drop_features, axis=1, inplace=True)
        del drop_features
        gc.collect()

    # read & prepare datasets
    print('read & prepare datasets shape: {}'.format(self.x.shape))

    # split train & test sets
    self.x_train, self.y_train, self.x_test, self.y_train_ag = prep_and_split(self.x)

    # debug
    if debug:
        x_train_s = self.x_train.sample(frac=0.3)
        x_test_s = self.x_test.sample(frac=0.3)
        y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(x_train_s.index)]
    else:
        x_train_s = self.x_train.sample(frac=1)
        x_test_s = self.x_test.sample(frac=1)
        y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(x_train_s.index)]

    if predict_feats:
        self.x_train, trn_feats = add_pred_feats(x_train_s, self.trn_preds_feats, None)
        self.x_test, _ = add_pred_feats(x_test_s, self.tes_preds_feats, trn_feats)
        self.y_train = y_train_s.groupby('fullVisitorId').sum()
        del x_train_s, x_test_s, y_train_s
        gc.collect()
    else:
        self.x_train.reset_index(drop=True, inplace=True)
        self.x_test.reset_index(drop=True, inplace=True)
# In[1]:

import pandas as pd
import numpy as np
import feather
from tqdm import tqdm

# In[2]:

from outliers import remove_outliers

# In[3]:

df_pays = feather.read_dataframe('data/df_pays_na_test.feather')

# In[4]:

shops = df_pays.shop_id.unique()
shops = sorted(shops)

# In[5]:

from fbprophet import Prophet

# In[7]:

def add_prophet_features(df_shop):
    df = df_shop[['day', 'pays_count']].rename(columns={
""" import os os.chdir('D:/yh_min-mfactors') from alphaFuncs_min_240 import * from address_data import * import pandas as pd import feather as ft ################ 因子计算结果没有 '600485.SH'(停牌) # 先验证分钟行情数据的时间范围,分钟和日范围 files = os.listdir(add_min_file) res = [] for filename in files: df = ft.read_dataframe(add_min_file+filename, nthreads=100) ls = list(df['date']) res = list(set(res+ls)) #验证结果,每个股票的时间都是完整的,468480条分钟数据 datetimel = sorted(res)[408000:] #从2017年开始,60480条分钟线数据,每个股票的时间也是完整的 datel = list(set(map(lambda x : x[:10],datetimel))) # 日数据252条,20170103-20180115 # 沪深300指数成分股 code_HS300 = pd.read_excel(add_gene_file + 'data_mkt.xlsx',sheetname='HS300') stockList = list(code_HS300['code'][:]) # 分钟线:从2017-01-03 09:31:00 至 2018-01-15 15:00:00 dateList = open(add_mintime_SerialFile).read().split('\n') alpha_all(stockList, dateList, savepath=add_alpha_min_expand_file)
# -*- coding: utf-8 -*-
"""
Created on Sun May 29 21:34:10 2016

@author: mariaathena
"""

# Prepare environment and load data ------------------------------------------

import pandas as pd
import numpy as np
import feather

cosim_df = feather.read_dataframe('../parsed_data/event_cosine_sim.feather')


# Modify data for easy visualisation -----------------------------------------

# Set cos_sim below certain threshold equal to zero
# threshold for each topic == topic's 75th percentile cosine sim
cosim_df2 = cosim_df.copy()

# cosim_df2.ix[:,3:] = cosim_df2.ix[:,3:].applymap(lambda x: round(x, 2) if x > 0.01 else 0)

cosim_df2.benghazi = cosim_df2.benghazi.apply(
    lambda x: round(x, 3) if x > np.percentile(cosim_df2.benghazi, 75) else 0)
cosim_df2.wiki_leak = cosim_df2.wiki_leak.apply(
    lambda x: round(x, 3) if x > np.percentile(cosim_df2.wiki_leak, 75) else 0)
cosim_df2.doctrine = cosim_df2.doctrine.apply(
    lambda x: round(x, 3) if x > np.percentile(cosim_df2.doctrine, 75) else 0)
cosim_df2.arab_spring = cosim_df2.arab_spring.apply(
    lambda x: round(x, 3) if x > np.percentile(cosim_df2.arab_spring, 75) else 0)
cosim_df2.russian_reset = cosim_df2.russian_reset.apply(
    lambda x: round(x, 3) if x > np.percentile(cosim_df2.russian_reset, 75) else 0)
cosim_df2.cancer = cosim_df2.cancer.apply(
    lambda x: round(x, 3) if x > np.percentile(cosim_df2.cancer, 75) else 0)

## Set emails topic == event with the highest cosine similarity to
out_dict = {
    "model_vars_": model_files,
    "com_formed_": com_files,
    #"agent_vars_":ag_files
}

# Loop through container dictionaries
for key, val in out_dict.items():
    for input_file in glob.glob(join(data_dir, '*' + key + '.feather')):

        # Create a label from the name of the experiment that created the data
        label = "_".join(input_file.split("_")[7:8])
        print(label)

        # Read data and store it in the container dictionary
        with open(input_file, "r") as mydata:
            val[label] = feather.read_dataframe(input_file)

# List of parameters included in the scenario labels
cal_pars = [
    "w_econ", "w_swn", "w_att", "w_subplot", "threshold", "reduction",
    "awareness_mean", "awareness_stdev", "awareness_minergie"
]

pars_d = {"cal_label": cal_pars}

# Rename second column in analysed files to "variable"
for key, df in out_dict["model_vars_"].items():
    df.rename(columns={"Unnamed: 0": 'sim_year'}, inplace=True)

# Put all data frames into one
model_df = pd.concat(out_dict["model_vars_"])
import numpy as np
import pandas as pd
import feather  # added: used below but missing from the original imports

from sklearn.preprocessing import LabelEncoder  # added: used below

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adadelta

from itertools import product
import datetime

# In[8]:

train = feather.read_dataframe("../data/train_set.feather")
test = feather.read_dataframe("../data/test_set.feather")
result = test[['id']].copy()

train_label = train['class'].values
lb = LabelEncoder()
lb.fit(train['class'].values)

# In[9]:

nb_classes = 19
dims = x_train.shape[1]
epochs = 15

# parameter grids
param_grid = [
### Importing required packages

import pandas as pd
import feather
from pyproj import Proj, transform
import calendar as cldr
from geopy.geocoders import Nominatim
import datetime
import numpy as np

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

# Importing feather datasets
FL_df = feather.read_dataframe("feather_files\FL_raw.feather")
VA_df = feather.read_dataframe("feather_files\VA_raw.feather")
PA_df = feather.read_dataframe("feather_files\PA_raw.feather")
OR_df = feather.read_dataframe("feather_files\OR_raw.feather")
OR_loc_df = feather.read_dataframe("feather_files\OR_locations_raw.feather")
NJ_df = feather.read_dataframe(r"feather_files\NJ_raw.feather")
NJ_loc_df = feather.read_dataframe(r"feather_files\NJ_locations_raw.feather")
MD_df = feather.read_dataframe("feather_files\MD_raw.feather")
ID_df = feather.read_dataframe("feather_files\ID_raw.feather")

# Importing data Scott Worland compiled for PA, VA, and FL
PA_VA_FL_df = pd.read_csv("PA_VA_FL\public_supply_data_pa_va_fl_rev2.csv")

# Defining 'uid' column as a string to export to feather
def import_data(data_path, use_pandas=False, intercept=True,
                valid_fraction=0.2, classification=True):
    """Import Data for H2O GPU Edition

    This function will read in data and prepare it for H2O4GPU's GLM solver.

    Note, the data is assumed to be all numeric, i.e.,
    categoricals are one hot encoded, etc.

    :param data_path : str
        A path to a dataset (The dataset needs to be all numeric)
    :param use_pandas : bool
        Indicate if Pandas should be used to parse
    :param intercept : bool
        Indicate if intercept term is needed
    :param valid_fraction : float
        Percentage of dataset reserved for a validation set
    :param classification : bool
        Classification problem?
    :returns
        If valid_fraction > 0 it will return the following:
            train_x: numpy array of train input variables
            train_y: numpy array of y variable
            valid_x: numpy array of valid input variables
            valid_y: numpy array of valid y variable
            family : string that would either be "logistic" if classification
                     is set to True, otherwise "elasticnet"
        If valid_fraction == 0 it will return the following:
            train_x: numpy array of train input variables
            train_y: numpy array of y variable
            family : string that would either be "logistic" if classification
                     is set to True, otherwise "elasticnet"
    """
    # Can import data using pandas or feather.
    use_pandas = use_pandas

    data_file = data_path

    # If importing using pandas
    if use_pandas:
        print("Reading Data with Pandas")
        data = pd.read_csv(data_file)
    else:
        print("Reading Data with Feather")
        data = feather.read_dataframe(data_file)
    print(data.shape)
    data_x = np.array(
        data.iloc[:, :data.shape[1] - 1],
        dtype='float32', order='C', copy=False)
    data_y = np.array(
        data.iloc[:, data.shape[1] - 1],
        dtype='float32', order='C', copy=False)

    # Setup train / validation set split
    # (assuming form of mxn where m = row count and n = col count)
    morig = data_x.shape[0]
    norig = data_x.shape[1]
    print("Original m=%d n=%d" % (morig, norig))
    sys.stdout.flush()

    # Do train / valid split
    if valid_fraction > 0:
        valid_fraction = valid_fraction
        HO = int(valid_fraction * morig)
        H = morig - HO
        print("Size of Train rows=%d & valid rows=%d" % (H, HO))
        sys.stdout.flush()
        train_x = data_x[0:H, :]
        train_y = data_y[0:H]
        valid_x = data_x[H:morig, :]
        valid_y = data_y[H:morig]
        print("Size of Train cols=%d valid cols=%d" % (train_x.shape[1],
                                                       valid_x.shape[1]))
    else:
        train_x = data_x
        train_y = data_y

    # Using intercept
    if intercept:
        train_x = np.hstack(
            [train_x, np.ones((train_x.shape[0], 1), dtype=train_x.dtype)])
        if valid_fraction > 0:
            valid_x = np.hstack(
                [valid_x, np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)])
            print("Size of Train cols=%d & valid cols=%d after adding "
                  "intercept column" % (train_x.shape[1], valid_x.shape[1]))
        else:
            print("Size of Train cols=%d after adding intercept column" %
                  (train_x.shape[1]))

    if classification:
        family = "logistic"
    else:
        family = "elasticnet"
    if valid_fraction > 0:
        return train_x, train_y, valid_x, valid_y, family
    return train_x, train_y, family
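# Hedged usage sketch (not part of the original source): the feather file name
# and the 80/20 split are illustrative; the function above assumes the last
# column of the all-numeric dataset is the target.
train_x, train_y, valid_x, valid_y, family = import_data(
    'creditcard_numeric.feather',  # hypothetical all-numeric dataset
    use_pandas=False,              # read with feather instead of pandas
    intercept=True,
    valid_fraction=0.2,
    classification=True)
print(family)  # "logistic" because classification=True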
import os

import pandas as pd
import feather
import numpy as np
import pickle

from sklearn.impute import SimpleImputer

path_wd = os.getenv("PATH_WD")
path_input_data = os.getenv("PATH_INPUT_DATA")
path_output_data = os.getenv("PATH_OUTPUT_DATA")
path_output_artifacts = os.getenv("PATH_OUTPUT_ARTIFACTS")

path_airlines = os.path.join(path_input_data, "airlines_small_target_selected.feather")
pd_airlines = feather.read_dataframe(path_airlines)

list_num = [
    line.rstrip('\n') for line in open(
        os.path.join(path_input_data, "airlines_impute_num_vars.txt"), "r")
]
list_cat = [
    line.rstrip('\n') for line in open(
        os.path.join(path_input_data, "airlines_impute_cat_vars.txt"), "r")
]

imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
np_airlines_num = imp_mean.fit_transform(pd_airlines[list_num])
pd_airlines_num = pd.DataFrame(np_airlines_num)
pd_airlines_num.columns = list_num

print(pd_airlines_num.head())
print(pd_airlines_num.isnull().sum(axis=0))
def backtest(y_pred, y_pred_prob, model_name):
    process = False
    dir = '../data/Basketball/Team/gamelog/'
    odds_data_path = '../data/scraped_odds_data.csv'
    teams = [
        'ATL', 'BOS', 'BRK', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
        'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
        'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS'
    ]
    mapping = {
        'Philadelphia 76ers': 'PHI',
        'Denver Nuggets': 'DEN',
        'Golden State Warriors': 'GSW',
        'Milwaukee Bucks': 'MIL',
        'Toronto Raptors': 'TOR',
        'Los Angeles Clippers': 'LAC',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Portland Trail Blazers': 'POR',
        'Utah Jazz': 'UTA',
        'Detroit Pistons': 'DET',
        'Oklahoma City Thunder': 'OKC',
        'Orlando Magic': 'ORL',
        'Indiana Pacers': 'IND',
        'Brooklyn Nets': 'BRK',
        'Boston Celtics': 'BOS',
        'Charlotte Hornets': 'CHO',
        'Los Angeles Lakers': 'LAL',
        'Sacramento Kings': 'SAC',
        'Phoenix Suns': 'PHO',
        'Dallas Mavericks': 'DAL',
        'New Orleans Pelicans': 'NOP',
        'Atlanta Hawks': 'ATL',
        'Miami Heat': 'MIA',
        'Washington Wizards': 'WAS',
        'Minnesota Timberwolves': 'MIN',
        'New York Knicks': 'NYK',
        'Chicago Bulls': 'CHI',
        'Memphis Grizzlies': 'MEM',
        'Cleveland Cavaliers': 'CLE',
    }
    X_test = feather.read_dataframe('../data/X_test_df.feather')
    df = X_test
    y_test = feather.read_dataframe('../data/y_test_df.feather')
    X_test['predictions'] = y_pred
    X_test['pred_prob'] = y_pred_prob
    dodd = pd.read_csv('../data/scraped_odds_data.csv', header=0)
    monthmap = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
    }
    odds_data = baseline.process_odds_data(
        dodd, X_test[[
            'Date', 'Location', 'Home', 'Away', 'target', 'predictions',
            'unique_id', 'W/L', 'Tm_score', 'Opp_score'
        ]])
    test_df = X_test[[
        'Date', 'Location', 'Home', 'Away', 'target', 'pred_prob',
        'predictions', 'unique_id', 'W/L', 'Tm_score', 'Opp_score'
    ]].copy()
    odds_data = odds_data.assign(
        unique_id=lambda x: x['team1'] + '_' + x['team2'])
    test_df = test_df.assign(home_score=lambda x: x['Tm_score'],
                             away_score=lambda x: x['Opp_score'])
    test_df.loc[(test_df['Location'] == 'Away'), 'home_score'] = \
        test_df.loc[(test_df['Location'] == 'Away'), 'Opp_score']
    test_df.loc[(test_df['Location'] == 'Away'), 'away_score'] = \
        test_df.loc[(test_df['Location'] == 'Away'), 'Tm_score']
    test_df['home_score'] = test_df['home_score'].astype(int).astype(str)
    test_df['away_score'] = test_df['away_score'].astype(int).astype(str)
    test_df = test_df.assign(
        score=lambda x: x['home_score'] + ':' + x['away_score'])
    odds_data['new_score'] = odds_data['score'].apply(
        lambda x: x.replace('OT', ''))
    df_rets = test_df.merge(odds_data,
                            how='left',
                            left_on=['unique_id', 'score'],
                            right_on=['unique_id', 'new_score'])
    df_rets_drop = df_rets.dropna(subset=['odds1', 'odds2'])
    # df_rets_drop = df_rets_drop.loc[(df_rets_drop['odds1'] > -500)
    #                                 & (df_rets_drop['odds2'] > -500)
    #                                 & (df_rets_drop['odds1'] != 0)
    #                                 & (df_rets_drop['odds2'] != 0), :].reset_index(drop = True)
    # df_rets_drop = df_rets_drop.loc[(df_rets_drop['pred_prob'] > 0.8) | (df_rets_drop['pred_prob'] < 0.2), :].reset_index(drop = True)
    mark = 0
    profit = []
    date = []
    for i in df_rets_drop.index:
        if ((df_rets_drop.loc[i]['predictions'] == True)
                and (df_rets_drop.loc[i]['pred_prob'] > 0.7)
                and (df_rets_drop.loc[i]['odds1'] > -100)):
            # if (df_rets_drop.loc[i]['predictions'] == True):
            mark += 100
            if df_rets_drop.loc[i]['target'] == True:
                if df_rets_drop.loc[i]['odds1'] < 0:
                    earned = -100 * 100 / df_rets_drop.loc[i]['odds1']
                    # profit += -100 * 100 / df_rets_drop.loc[i]['odds1']
                else:
                    earned = df_rets_drop.loc[i]['odds1']
            else:
                earned = -100
                # profit -= 100
            date.append(df_rets_drop.loc[i]['Date'])
            profit.append(earned)
        if ((df_rets_drop.loc[i]['predictions'] == False)
                and (df_rets_drop.loc[i]['pred_prob'] < 0.3)
                and (df_rets_drop.loc[i]['odds2'] > -100)):
            # if (df_rets_drop.loc[i]['predictions'] == False):
            mark += 100
            if df_rets_drop.loc[i]['target'] == True:
                earned = -100
                # profit -= 100
            else:
                if df_rets.loc[i]['odds2'] < 0:
                    earned = -100 * 100 / df_rets_drop.loc[i]['odds2']
                    # profit += -100 * 100 / df_rets_drop.loc[i]['odds2']
                else:
                    earned = df_rets_drop.loc[i]['odds2']
            date.append(df_rets_drop.loc[i]['Date'])
            profit.append(earned)
    profit = np.array(profit)
    cum_profit = np.array(profit).cumsum()
    print('Model = {} : Total Profit Is {}'.format(model_name, profit.sum()))
    result = pd.DataFrame({'profit': cum_profit}, index=date)
    result.plot(y='profit', title='Cumulative Profit From $100', figsize=(8, 8))
    plt.savefig('../output/{}_prob_0.3_0.7.jpg'.format(model_name), format='jpg')
    plt.show()
    return profit, result
amount_cumsum = sort_valid_data['amount'].cumsum()
all_amount_cumsum = amount_cumsum.values[-1]
ax2.plot(ax.get_xticks(), amount_cumsum.values / all_amount_cumsum * 100, c='g')
# ax4 = ax3.twinx()
ax2.plot(ax2.get_xticks(), [trade_cum_ratio * 100] * len(ax.get_xticks()), c='y')
x_ids = np.where((amount_cumsum.values / all_amount_cumsum) <= trade_cum_ratio)[0]
ax2.plot([ax2.get_xticks()[x_ids[-1]]] * 2, [0, 100], 'm--')
ax.grid(True)
ax2.grid(True)
plt.show()


if __name__ == '__main__':
    # read stock data
    path = './SH600000.feather'
    df = feather.read_dataframe(path)
    # feather.write_dataframe(df, output_path)

    start_date = "2016-09-30 14:30"
    end_date = "2016-09-30 15:00"
    valid_data = extract_valid_data_range(df, start_date, end_date)

    S = calculate_S(valid_data)

    # sort data by "S" -- descending
    S_sort_ids = np.argsort(S.values)
    S_arr = S.values[S_sort_ids[::-1]]
    sort_valid_data = valid_data.iloc[S_sort_ids[::-1]]

    # visualize S
    visualize_S(sort_valid_data)

    Q = calculate_Q(valid_data, S)
df = df.merge(bwd, 'left', ['Date', 'Store'], suffixes=['', '_bw'])
df = df.merge(fwd, 'left', ['Date', 'Store'], suffixes=['', '_fw'])

df.columns.is_unique  # True
len(df.columns)       # 17
df.drop(columns, 1, inplace=True)  # drop these columns (along the column axis)
len(df.columns)       # 14

# Large intermediate results are best saved to disk
df.to_feather('{}df'.format(PATH))

#temp = df
#df = pd.read_feather('{}df'.format(PATH))
# pd.read_feather raises an error here, so read the file this way instead
import feather
df = feather.read_dataframe('{}df'.format(PATH))

type(df.Date)  # after reading back from the file, Date is a plain Series
df['Date'] = pd.to_datetime(df.Date)
type(df.Date)

joined = join_df(joined, df, ['Store', 'Date'])
joined_test = join_df(joined_test, df, ['Store', 'Date'])

# After removing some rows, reset the index again
joined.reset_index(inplace=True)
joined_test.reset_index(inplace=True)
joined[:10]

# Save again
joined.to_feather('{}joined2'.format(PATH))
def main():
    ''' Run ARD NMF'''
    torch.multiprocessing.set_start_method('spawn')
    parser = argparse.ArgumentParser(
        description='NMF with some sparsity penalty described https://arxiv.org/pdf/1111.6085.pdf')
    parser.add_argument('--data', help='Data Matrix', required=True)
    parser.add_argument('--feather', help='Input in feather format',
                        required=False, default=False, action='store_true')
    parser.add_argument('--parquet', help='Input in parquet format',
                        required=False, default=False, action='store_true')
    parser.add_argument('--K0', help='Initial K parameter',
                        required=False, default=None, type=int)
    parser.add_argument('--max_iter', help='maximum iterations',
                        required=False, default=10000, type=int)
    parser.add_argument('--del_', help='Early stop condition based on lambda change',
                        required=False, default=1, type=int)
    parser.add_argument('--tolerance', help='Early stop condition based on max lambda entry',
                        required=False, default=1e-6, type=float)
    parser.add_argument('--phi',
                        help='dispersion parameter; see paper for discussion of choosing phi. '
                             'default = 1',
                        required=False, default=1.0, type=float)
    parser.add_argument('--a',
                        help='Hyperparameter for lambda. We recommend trying various values of a. '
                             'Smaller values will result in sparser results; a good starting point '
                             'might be a = log(F+N)',
                        required=False, default=10.0, type=float)
    parser.add_argument('--b',
                        help='Hyperparameter for lambda. Default used is as recommended in Tan and Fevotte 2012',
                        required=False, type=float, default=None)
    parser.add_argument('--objective',
                        help='Defines the data objective. Choose between "poisson" or "gaussian". '
                             'Defaults to Poisson',
                        required=False, default='poisson', type=str)
    parser.add_argument('--prior_on_W',
                        help='Prior on W matrix "L1" (exponential) or "L2" (half-normal)',
                        required=False, default='L1', type=str)
    parser.add_argument('--prior_on_H',
                        help='Prior on H matrix "L1" (exponential) or "L2" (half-normal)',
                        required=False, default='L1', type=str)
    parser.add_argument('--output_dir',
                        help='output_file_name; if run in array mode this corresponds to the output directory',
                        required=True)
    parser.add_argument('--labeled', help='Input has row and column labels',
                        required=False, default=False, action='store_true')
    parser.add_argument('--report_frequency',
                        help='Number of iterations between progress reports',
                        required=False, default=100, type=int)
    parser.add_argument('--dtype', help='Floating point accuracy',
                        required=False, default='Float32', type=str)
    parser.add_argument('--parameters_file',
                        help='allows running many different configurations of the NMF method on a '
                             'multi-GPU system. To run in this mode provide this argument with a '
                             'text file with the following headers: '
                             '(a,phi,b,prior_on_W,prior_on_H,Beta,label). label indicates the '
                             'output stem of the results from each run.',
                        required=False, default=None)
    args = parser.parse_args()

    print('Reading data frame from ' + args.data)

    if args.dtype == 'Float32':
        args.dtype = torch.float32
    elif args.dtype == 'Float16':
        args.dtype = torch.float16

    if args.parquet:
        dataset = pd.read_parquet(args.data)
    elif args.feather:
        print('loading feather...')
        dataset = feather.read_dataframe(args.data)
    else:
        if args.labeled:
            dataset = pd.read_csv(args.data, sep='\t', header=0, index_col=0)
        else:
            dataset = pd.read_csv(args.data, sep='\t', header=None)

    if args.objective.lower() == 'poisson':
        Beta = 1
    elif args.objective.lower() == 'gaussian':
        Beta = 2
    else:
        print('objective parameter should be one of "gaussian" or "poisson"')
        sys.exit()

    data = ARD_NMF(dataset, args.objective)

    if args.parameters_file != None:
        parameters = pd.read_csv(args.parameters_file, sep='\t')
        run_parameter_sweep(parameters, data, args, Beta)
    else:
        W, H, cost = run_method_engine(data, args.a, args.phi, args.b, Beta,
                                       args.prior_on_W, args.prior_on_H,
                                       args.K0, args.tolerance, args.max_iter)
        nsig = write_output(W, H, data.channel_names, data.sample_names,
                            args.output_dir, args.output_dir)
    return intersection_cardinality / float(x_cardinality)


print(a.c.BOLD + 'Extracting set3d JSON features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug

if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
del df
gc.collect()

train = train.fillna('')

ftrs = []

print('Calculating features ...')
t0 = time.time()
for i in range(0, len(train.index)):
import collections, itertools

import feather          # added: used below but missing from the original imports
import pandas as pd     # added: used below but missing from the original imports
from elasticsearch import Elasticsearch, helpers  # added: used below

if __name__ == '__main__':
    import argparse, os, json
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--index', default="eurodata", help="index name")
    parser.add_argument('--meta', default="{}", help="metadata")
    parser.add_argument('paths', type=argparse.FileType())
    args = parser.parse_args()

    es = Elasticsearch()
    meta = json.loads(args.meta)
    for fpath in args.paths:
        fpath = fpath.strip()
        df = feather.read_dataframe(fpath)
        df = df.where((pd.notnull(df)), None)  # convert NaN to None
        name = os.path.basename(fpath)
        res = es.index(index=args.index, doc_type='schema', id=name,
                       body={
                           'schema': [s.decode('utf8', 'ignore') for s in df.columns],
                           'scrape_meta': meta
                       })
        try:
            it = ({
                '_index': args.index,
                '_type': 'row',
                '_id': '%s-%s' % (name, i),
                '_source': {'row': [str(r).decode('utf8') for r in row]}
            } for i, row in df.iterrows())
            print(name, '\t', sum(1 for _ in helpers.streaming_bulk(es, it)))
def main(**opt):
    # Setup
    gc.enable()
    np.random.seed(123)

    # Get the optimized parameters
    n_folds = opt.pop('n_folds', 5)
    tag = opt.pop('tag', '')
    tmt = datetime.now().strftime('%Y%m%d_%H%M')
    tag += '_' + tmt + '_'
    clf_name = opt.get('model', 'GBMClassifier')
    tag += clf_name + '_'
    clf = getattr(models, clf_name)(opt)
    assert clf is not None

    # data directory
    cur_dir = op.dirname(__file__)
    data_dir = op.join(cur_dir, '../data')
    train_cache_file = op.join(data_dir, 'train_feat_cache.feather')
    test_cache_file = op.join(data_dir, 'test_feat_cache.feather')
    useless_feat_file = op.join(data_dir, '../stat/dump_feat.txt')
    useless_feat = load_useless_feat(useless_feat_file)
    # print(useless_feat)
    if op.exists(train_cache_file) and op.exists(test_cache_file):
        print("Loading train and test feathers cache file ...")
        train = feather.read_dataframe(train_cache_file)
        test = feather.read_dataframe(test_cache_file)
    else:
        train, test = create_features(data_dir, useless_feat)

    train, y = train.iloc[:, :-1], train['TARGET']
    subm = test[['SK_ID_CURR']]
    print("Feature added train shape: {}".format(train.shape))
    train = exclude_column_df(train, useless_feat)
    test = exclude_column_df(test, useless_feat)

    if clf_name in ['RFClassifier', 'ETClassifier', 'XGB_Classifier']:
        print("One hot encoding variables ...")
        train_size = train.shape[0]
        data = pd.concat([train, test])
        del train, test
        obj_cols = [
            c for c in data.columns.tolist()[1:]
            if data[c].dtype == 'object' or data[c].dtype.name == 'category'
        ]
        # print(obj_cols)
        not_obj_cols = [c for c in data.columns.tolist() if c not in obj_cols]
        one_hot_data = pd.get_dummies(data[obj_cols])
        # print(one_hot_data.shape, type(one_hot_data))
        data = pd.concat([data[not_obj_cols], one_hot_data], axis=1)
        data = exclude_column_df(data, useless_feat)
        test = data.iloc[train_size:, :].reset_index(drop=True)
        train = data.iloc[:train_size, :].reset_index(drop=True)
        del data
        print("Encoding done!")

    # may do some tweak using feature importance
    feat_selected = train.columns.tolist()[1:]
    print("Used features count: {}".format(len(feat_selected)))

    # do stacking.
    print("Begin to do cross validation to model data ...")
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=521)
    train_pred = np.zeros(train.shape[0])
    test_pred = np.zeros((test.shape[0], n_folds))
    feat_imp = pd.DataFrame(np.zeros((len(feat_selected), n_folds)))
    feat_imp['features'] = feat_selected
    for k, (trn_idx, val_idx) in enumerate(cv.split(train, y)):
        trn_x, trn_y = train[feat_selected].iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = train[feat_selected].iloc[val_idx], y.iloc[val_idx]
        clf.fit(train_set=(trn_x, trn_y), valid_set=(val_x, val_y))
        train_pred[val_idx] = clf.predict_proba(val_x)
        test_pred[:, k] = clf.predict_proba(test[feat_selected])
        stat = roc_auc_score(val_y, train_pred[val_idx])
        print("K={}, AUC: {:.3f}".format(k + 1, stat))
        # collect importance info
        feat_imp.iloc[:, k] = clf.get_feat_imp()

    total_auc = roc_auc_score(y, train_pred)
    print("CV-{} had been done! Total train auc is: {:.4f}".format(n_folds, total_auc))

    feat_imp['imp_mean'] = feat_imp.iloc[:, :n_folds].mean(axis=1)
    feat_imp['imp_std'] = feat_imp.iloc[:, :n_folds].std(axis=1)
    feat_imp['imp_cv'] = feat_imp['imp_std'] / feat_imp['imp_mean']
    feat_imp = feat_imp.iloc[:, n_folds:].sort_values('imp_cv',
                                                      ascending=True,
                                                      na_position='last')
    ind1 = feat_imp['imp_cv'].isnull()
    ind2 = feat_imp['imp_cv'] > 0.5
    ind3 = feat_imp['imp_mean'] < 10
    ind = ind1 | (ind2 & ind3)
    feat_imp['should_filter'] = ind.astype('int')

    # save to files
    tag += 'kfold_{}_auc_{:.4f}_'.format(n_folds, total_auc)
    print("Begin to save statistic into files")
    stat_dir = op.join(cur_dir, '../stat')
    feat_imp_file = op.join(stat_dir, tag + 'feat_imp.csv')
    feat_imp.to_csv(feat_imp_file, index=False)
    train_pred_ = train[['SK_ID_CURR']]
    train_pred_.loc[:, 'TARGET_PRED'] = train_pred
    train_pred_file = op.join(stat_dir, tag + 'train_cv_pred.csv')
    train_pred_.to_csv(train_pred_file, index=False)

    print("Saving test prediction to files ...")
    subm['TARGET'] = np.mean(test_pred, axis=1)
    subm_file = op.join(cur_dir, '../sub', tag + 'subm.csv.gz')
    subm.to_csv(subm_file, index=False, compression='gzip')
    print("All prediction done!")
import feather
import numpy as np
import pandas as pd

# Import feather files that were downloaded and saved using TCGA2STAT package in R:
gene_counts = feather.read_dataframe('Gene_counts.feather')

# Check data import:
if np.isfinite(gene_counts.shape[0]):
    print("Gene Counts data set imported!")
else:
    print("Error in Gene_counts import")

"""Gene Counts dataframe are formatted with patient ID as the index and gene names
as columns. Each value represents the abundance estimate for the gene in each
particular RNA-seq run. This value is 'raw' in that it is not normalized by the
total number of reads made for the sample. Thus the first step in data clean-up
is to transform these values to be normalized by the total number of reads for
the sample (in Millions)."""

gene_counts.set_index(['gc_index'], inplace=True)  # set the index as the TCGA ID codes
# print(gene_counts.index[1:5]) [Debug]
print("\nDimension of DataFrame:", gene_counts.shape, "\n")


def transformation(dataset):
    read_count = dataset.sum(axis=1)  # get the total reads for each sample
    for r in range(0, dataset.shape[0]):
        # transform each read abundance (rsem) by the sample reads / million
        dataset.iloc[r] = 1000000 * dataset.iloc[r] / read_count.iloc[r]
    # the sum of each row in the transformed df should be 1000000;
    # if every row is transformed correctly, print statement
    if sum(round(dataset.sum(axis=1)) == 1e6) == dataset.shape[0]:
        print("Transformation Successful!\n")
        print(dataset.shape[0], 'Gene count estimate profiles have been transformed '
              'from transcript abundance estimates to transcripts per million reads (TPM)')
def read_df(filename, index_col='date'):
    import feather
    return feather.read_dataframe(filename).set_index(index_col)
zd1 = hos_dic.ix[0, :]
ttl_fee.head(10)
ttl_fee['hosname'] = ttl_fee['x5'].map(hos_dic15)

hos_dic = pd.read_hdf("/mnt/e/pyr/data/hdf5/R_fee_15.h5", 'hos_dic')
hos_dic.columns = ['code', 'name']
hos_dic15 = dict(zip(hos_dic.code, hos_dic.name))

ttlfee = ft.read_dataframe('/mnt/e/pyr/data/y2015/2015x/2015_x229.pyr')
veri = ft.read_dataframe('/mnt/e/pyr/data/y2015/2015x/2015_x262.pyr')
nm = ft.read_dataframe('/mnt/e/pyr/data/y2015/2015x/2015_x32.pyr')
gender = ft.read_dataframe('/mnt/e/pyr/data/y2015/2015x/2015_x33.pyr')
birthdate = ft.read_dataframe('/mnt/e/pyr/data/y2015/2015x/2015_x34.pyr')
record = ft.read_dataframe(
def read_feather_dask(filepath):
    df = feather.read_dataframe(filepath, columns=p.columns)
    return dd.from_pandas(df, npartitions=p.n_workers)
def export_data_set(name=None):
    if name is None:
        data = feather.read_dataframe(join(out_path, "iris.data"))
    else:
        data = feather.read_dataframe(join(out_path, name))
    return data
# input directory
indir = "/disks/shear15/ssli/KV450/split/all_tomo"
# input postfix
inP_r = "_T_B_less3"
inP_b = "_T_B_greater3"
inPs = [inP_r, inP_b]

area = 341.3 * 3600.  # 1/arcmin^2

outdir = "/disks/shear15/ssli/CosmicShear/covariance/prepare/Ndensity_sigmae"
# output postfix
outPs = ["_red", "_blue"]

for k in range(len(inPs)):
    WorA = 'w'
    for i in range(5):
        inpath = indir + str(i+1) + inPs[k] + '.feather'
        indata = feather.read_dataframe(inpath)

        outpath = outdir + outPs[k] + '.txt'

        id_zbin = i + 1
        e1 = indata['bias_corrected_e1']
        e2 = indata['bias_corrected_e2']
        wg = indata['recal_weight']

        NeffSigmaeFunc(id_zbin, e1, e2, wg, area, outpath, WorA)
        WorA = 'a'
        print("Finished in", id_zbin, inPs[k])
import os       # added: used below but missing from the original imports
import uuid

import numpy as np  # added: used below but missing from the original imports
import pandas as pd
from pandas.util.testing import assert_frame_equal

import feather

nrows = 4000000
ncols = 100

data = np.random.randn(nrows)

df = pd.DataFrame({'c{0}'.format(i): data for i in range(ncols)})


def guid():
    return uuid.uuid4().hex


path = 'test_{0}.feather'.format(guid())

try:
    feather.write_dataframe(df, path)
    df2 = feather.read_dataframe(path)
    assert_frame_equal(df, df2)
finally:
    try:
        os.remove(path)
    except os.error:
        pass
import argparse
import feather
import magic
#import os

parser = argparse.ArgumentParser(description='wrapper for magic')
parser.add_argument('--matx', dest='matx', help='Matx path')
parser.add_argument('--out', dest='out', help='Output path')
#parser.add_argument('--maxCellSize', dest='maxCS',type=int,default=1000000,help='Max num of reads allow in a cell')
#parser.add_argument('--minCellSize', dest='minCS',type=int,default=1,help='Min num of reads allow in a cell')
args = parser.parse_args()

# Load single-cell RNA-seq data
df = feather.read_dataframe(args.matx)
scdata = magic.mg.SCData(df, 'sc-seq')

# MAGIC
scdata.run_magic(n_pca_components=15, random_pca=True, t=6, k=30, ka=10,
                 epsilon=1, rescale_percent=99)

# output
feather.write_dataframe(scdata.magic.data, args.out)
def fix_test(df):
    """
    test has missing values for ci and co.
    ci is, on average, 35 days after date_time
    co is, on average, 2.4 days after ci
    """
    df.date_time = pd.to_datetime(df.date_time, errors='coerce')
    df.srch_ci = pd.to_datetime(df.srch_ci, errors='coerce')
    df.srch_co = pd.to_datetime(df.srch_co, errors='coerce')
    df.srch_ci = df.srch_ci.fillna(df.date_time + timedelta(days=35))
    df.srch_co = df.srch_co.fillna(df.srch_ci + timedelta(days=2))
    return df


print(78 * '=')
print("Reading train...")
df_train = feather.read_dataframe('../data/train_only_booked.feather')
print("Creating Features for Train...")
df_train_features = create_features(df_train, train=True)
print("Writing Feather...")
feather.write_dataframe(df_train_features, '../data/train_only_booked_features.feather')
gc.collect()

print(78 * '=')
print("Reading holdout...")
df_hold = feather.read_dataframe('../data/holdout.feather')
print("Munging Holdout")
df_hold_feat = create_features(df_hold)
print("Writing Feather...")
feather.write_dataframe(df_hold_feat, '../data/holdout_features.feather')
gc.collect()
def from_uri(cls, uri: str, source: Optional[DataObject] = None,
             **kwargs) -> "FeatherFile":
    data = feather.read_dataframe(uri)
    result = cls(inner_data=data, uri=uri, source=source, **kwargs)
    return result
]
plot_excl = ["BARW_8"]

# The palette with black:
cbbPalette = [
    "#000000", "#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7"
]

site_data = pd.read_excel(
    "/mnt/win/UMoncton/OneDrive - Université de Moncton/Data/sites_deployment_2018.xlsx")
# site_data = pd.read_excel(
#     "C:\\UMoncton\\Doctorat\\data\\datasheet\\2018\\sites_deployment_2018.xlsx")

# aci = feather.read_dataframe("src/plots/ACI.feather")
aci = feather.read_dataframe("data/ACI.feather")
print(aci)
# aci.date = aci.date.dt.tz_localize("UTC")

aci = aci.loc[aci.site.isin(sites)]
aci = aci.loc[~aci["plot"].isin(plot_excl)]
aci["julian"] = aci["date"].dt.dayofyear
aci["hour"] = aci["date"].dt.hour
aci = aci.sort_values(["site", "plot", "julian", "date"])
aci = aci.loc[(aci["julian"] > 155) & (aci["julian"] < 220)]
aci = aci.loc[(aci["ACI"] < 50000)]
aci = aci.loc[aci["denoised"] == False]
print(aci.loc[aci["plot"] == "EABA_1"])

aci = aci.reset_index()

res = aci.groupby(["plot"], as_index=False).apply(check_dates, site_data)
print(res)
def load_sample_data(sample, filename):
    tbl = pd.read_csv(filename, index_col=0)
    tbl.columns = ['{}:{}'.format(sample, c) for c in tbl.columns]
    return tbl


def column_sortkey(name):
    sample, c = name.split(':', 1)
    return '{}:{}'.format(GENELEVEL_STATS_COLUMN_ORDER.get(c, c), sample)


# Load all tables
samples = sm.params.samples_by_genome[sm.wildcards.genome]
alldatatbl = pd.concat(list(map(load_sample_data, samples, sm.input)), axis=1)

# Merge selected columns from gene annotations
genes = feather.read_dataframe(os.path.join(sm.params.genomedir,
                                            'annotations-gene.feather'))
for alias in GENETYPE_ALIASES:
    if alias in genes.columns:
        genes['gene_type'] = genes[alias]
genes = genes.set_index('gene_id')[GENEINFO_COLUMNS]

# Determine orders of columns and rows
cols_order = GENEINFO_COLUMNS + sorted(alldatatbl.columns, key=column_sortkey)
alldatatbl['__total_tags'] = alldatatbl[[c for c in alldatatbl.columns
                                         if c.endswith(':polyA_tag_count')]].sum(axis=1)

finaltbl = pd.merge(genes, alldatatbl, how='right', left_index=True,
                    right_index=True).sort_values(
                        by='__total_tags', ascending=False)[cols_order]
del alldatatbl, genes

# Write out in the csv format
# -*- coding: utf-8 -*-

import feather as ft
import pandas as pd
import pickle

daily = ft.read_dataframe(r'E:\marketData.feather')
daily.head()

daily_2017 = daily[daily['date'] >= '2017-01-01']
daily_2017 = daily_2017[['date', 'symbol', 'close', 'preClose']]
daily_2017['daily_return'] = (daily_2017['close'] - daily_2017['preClose']) * 100 / daily_2017['preClose']

output = open(r'C:\Users\wuwangchuxin\Desktop\dailyreturn.pickle', 'wb')
pickle.dump(daily_2017, output)
output.close()
#print(datetime.datetime.now())

#### val
#path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_val_fold3_blind1_trX.feather'
#train_x = feather.read_dataframe(path)
#
#path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_val_fold3_blind1_trY.feather'
#train_y = feather.read_dataframe(path)
#
#path_write = 'D:\\workspace_R\\thalas\\20180427\\result\\chi\\chi_val_fold3_blind1_k.csv'
### val

#### test
path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_trainX_test5.feather'
train_x = feather.read_dataframe(path)

path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_trainY_test5.feather'
train_y = feather.read_dataframe(path)

path_write = 'D:\\workspace_R\\thalas\\20180427\\result\\chi\\chi_test_fold5.csv'
### test

chi2, pval = chi2(X=train_x, y=train_y)
x2 = np.asarray(pval)
#np.savetxt("fold1_score.csv", x2, delimiter=",")
np.savetxt(path_write, x2, delimiter=",")
def read_dataframe(fn):
    df = feather.read_dataframe(fn)
    df.index = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    return df
""" # Prepare environment and load and prepare data ----------------------------- import pandas as pd import nltk import feather import re, math from collections import Counter #from sklearn.feature_extraction.text import TfidfVectorizer # Feather formatted dataframes import directly into pandas dataframes # New module/package collaboration for easy trasnfer R <--> python event_dict = feather.read_dataframe('../parsed_data/parsed_dict.feather') email_df = feather.read_dataframe('../parsed_data/simplified_email.feather') ## Prepare event dictionary dataframe event_dict.drop(['NA'], inplace=True, axis=1) event_dict = event_dict.transpose() event_dict.columns = event_dict.loc['event'] event_dict = event_dict.reindex(event_dict.index.drop(['event'])) # remove occurrences of "x...#" stemming from wikipedia using regular expression event_dict = event_dict.applymap(lambda z: re.sub(r'(^|\s)x(\w+,)', r'', z)) ## Prepare email dataframe # Convert string in email_raw column to list of strings #email_df.email_raw = email_df.email_raw.apply(lambda x: x.split(","))
# output variables
cell_score_output = snakemake.output["cell_score"]
gene_score_output = snakemake.output["gene_score"]
maximum_overlap_output = snakemake.output["maximum_overlaps"]
ranked_genes_output = snakemake.output["ranked_genes"]
model_output = snakemake.output["model_output"]
metrics_output = snakemake.output["metrics_output"]

# params
seed = int(snakemake.wildcards["seed"])
k = int(snakemake.wildcards["k"])
n_trials = int(snakemake.params["n_trials"])

# set seed
np.random.seed(seed)

# read file
df = feather.read_dataframe(counts_input)

# start analysis
sparse_arr = scipy.sparse.coo_matrix(df.to_numpy())

model = hpf.run_trials(sparse_arr, nfactors=k, ntrials=n_trials, validation_data=None)

metrics = pd.DataFrame()
metrics["loss"] = model.loss[-10:]
metrics["k"] = np.repeat(k, 10)
metrics["seed"] = np.repeat(seed, 10)

cell_score = model.cell_score()
gene_score = model.gene_score()

table = hpf.max_pairwise_table(gene_score,
from dataprocess import data
import featureengineer
from featureengineer import persona_features
import pandas as pd
import feather
import gc

appsnum, tags_nums = featureengineer.appsnum, featureengineer.tags_nums
lgbOut_Features = persona_features.main()
afterExpand_df_path = persona_features.afterExpand_df_path
noExpand_dfPath = data.noExpand_dfPath

data = feather.read_dataframe(afterExpand_df_path)
persona_df = pd.read_pickle(noExpand_dfPath)
data = data.merge(persona_df, how='left', on='guid')
del persona_df
gc.collect()

sparse_features = [
    'app_version', 'guid', 'netmodel', 'newsid', 'geohash', 'ts_hour',
    'device_info', 'gender'
]
dense_features = [
    'pos', 'level', 'personidentification', 'followscore', 'personalscore'
]
var_features = ['applist', 'new_tag']