def createDatarankWebCorpus(wordcloud=False, num_threads=15):
    path = '/home/arya/PubMed/GEO/Datasets/'
    dpp = pd.read_pickle(path + 'DPP.df')[['accession', 'pmid', 'cites_pmid']].drop_duplicates()
    d = pd.read_pickle(path + 'D.All.df')
    count = pd.read_pickle(path + 'DCC.df')[['accession', 'cpcc']]
    m = pd.read_pickle(path + 'M.df')
    pm = pd.read_pickle(path + 'PM.df')
    pm.index = pm.pmid
    pm = pm.loc[dpp.cites_pmid.unique()]
    pm = pd.merge(pm, m, left_on='muid', right_on='uid')[['pmid', 'name']]
    pmg = pm.groupby('pmid')
    CP = pm.pmid.unique()
    pm = pd.DataFrame(map(lambda p: (p, pmg.get_group(p).name.unique().tolist()), CP),
                      columns=['pmid', 'mesh'])
    dppm = pd.merge(dpp, pm, left_on='cites_pmid', right_on='pmid')
    G = dppm.groupby('accession')
    A = dppm.accession.unique()
    dm = pd.DataFrame([(a, [m for v in G.get_group(a).mesh.values for m in v]) for a in A],
                      columns=['accession', 'mesh'])
    if wordcloud:
        # map(word_cloud, dm.iterrows())
        from multiprocessing import Pool
        pool = Pool(num_threads)
        pool.map(word_cloud, dm.iterrows())
    d = pd.merge(dm, d, on='accession')
    d = pd.merge(d, count, on='accession')
    d.to_pickle(path + 'D.Web.df')
def main():
    """Main function to initialize the databases used to analyze Yelp data."""
    import random

    # ------------ Save Yelp Data as Pandas DataFrames to pickle ------------
    # Save all Yelp restaurant data in Arizona (Phoenix area)
    # restaurant_data = read_yelp('business', state=['AZ'], open=[True], categories='restaurants')
    # review_data = read_yelp('review', business_id=restaurant_data.business_id.unique())
    restaurant_data = pd.read_pickle('../data/pandas/business.pkl')
    review_data = pd.read_pickle('../data/pandas/review.pkl')
    result = save2pickle(restaurant_data, review_data)
    result = py2mysql(restaurant_data, review_data)

    # Save information for Mexican restaurants only
    restaurant_data = restaurant_data[restaurant_data['categories'].map(
        lambda x: 'mexican' in [cat.lower() for cat in x])]
    review_data = review_data[review_data['business_id'].isin(restaurant_data.business_id.unique())]
    result = save2pickle(restaurant_data, review_data, append_string='_mexican')
    result = py2mysql(restaurant_data, review_data, append_string='_mexican')

    # Segment some data for training
    random.seed(1234)
    trainids = random.sample(restaurant_data.business_id, 20)
    restaurant_data = restaurant_data[restaurant_data['business_id'].isin(trainids)]
    review_data = review_data[review_data['business_id'].isin(trainids)]
    result = save2pickle(restaurant_data, review_data, append_string='_mexican_train')

    # Make database of individual sentences from review data
    sentences = process_text.reviews_to_sentences(review_data)
    sentences = process_text.add_training_label(sentences, review_data)
    sentences.to_pickle('../data/pandas/sentences_mexican.pkl')
    result = sentence2mysql(sentences, review_data, append_string='_mexican')
def loadFromDb(identificador):
    import sys
    if not sys.version_info[:2] == (3, 4):
        print('You are an idiot, but a far-sighted one')
        print('This code is meant to run on Python 3.4')
    import os
    import pandas as pd
    from IPython.display import display

    display('Checking for previously saved data')
    filenameTouchs = 'dbTouchs'
    filenameSounds = 'dbSounds'
    if os.path.isfile(filenameTouchs):
        touchsLoad = pd.read_pickle(filenameTouchs)
        display('Previous touch data loaded')
    else:
        display('No previous touch data found')
        touchsLoad = pd.DataFrame()
    if os.path.isfile(filenameSounds):
        soundsLoad = pd.read_pickle(filenameSounds)
        display('Previous sound data loaded')
    else:
        display('No previous sound data found')
        soundsLoad = pd.DataFrame()
    if identificador != 0:
        touchsLoad = touchsLoad[touchsLoad['identificador'] == identificador]
        soundsLoad = soundsLoad[soundsLoad['identificador'] == identificador]
    return touchsLoad, soundsLoad
def test_parse_rna_seq_metrics(self):
    metrics, hist = cpb.picard.parse_rna_seq_metrics(
        add_root('rna_seq_metrics.txt'))
    metrics2 = pd.read_pickle(add_root('rna_seq_metrics_metrics.pickle'))
    hist2 = pd.read_pickle(add_root('rna_seq_metrics_hist.pickle'))
    assert_series_equal(metrics, metrics2)
    assert_series_equal(hist, hist2)
def get_subjects_list_adults_fct(df_path, df_qc_path, subjects_list):
    '''
    excludes kids and subjects with missing sex or age
    '''
    import pandas as pd
    import numpy as np

    df = pd.read_pickle(df_path)
    df_qc = pd.read_pickle(df_qc_path)
    df = pd.merge(df, df_qc, left_index=True, right_index=True)
    pd.to_pickle(df, 'testdf.pkl')
    df['subject_id'] = df.subject_id_x

    # exclude subjects younger than 18 or with mean_FD > .1
    subjects_list_exclude = df[(df.age < 18) | (df.mean_FD_Power > .1)].index
    subjects_list_adults = subjects_list
    for exclude_subject in subjects_list_exclude:
        if exclude_subject in subjects_list_adults:
            subjects_list_adults.remove(exclude_subject)

    missing_info = df[(df.age == 999) | ((np.logical_or(df.sex == 'M', df.sex == 'F')) == False)].index
    for missing in missing_info:
        if missing in subjects_list_adults:
            subjects_list_adults.remove(missing)

    # remove subjects from subjects_list_adults for which no entry exists in df;
    # iterate over a copy so that removals do not skip elements
    for subject in list(subjects_list_adults):
        if subject not in df.index:
            subjects_list_adults.remove(subject)

    return subjects_list_adults
def plotPowerCLR(recompute=False):
    if recompute:
        mc = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'MarkovChain'))
        hmm = f(pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')))
        a = pd.concat([mc, hmm])
        print a
        a = a[a.index.get_level_values('coverage') != np.inf]
        df = pd.DataFrame(a.groupby(level=range(6)).apply(
            lambda x: x[x >= x.quantile(Qcoverage[x.name[0]])].mean()))[0]
        # df = pd.DataFrame(a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(0.99)].mean()))
        df = getPower(df, groupbyLevels=range(4))
        df.to_pickle(utl.outpath + 'ROC/PowerCLR.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/PowerCLR.df')
    reload(pplt)
    info = pplt.getNameColorMarker(df)
    info.loc[info.index.get_level_values('method') == 'HMM', 'marker'] = '--o'
    info.loc[info.index.get_level_values('method') == 'MarkovChain', 'marker'] = '--s'
    info.loc[info.index.get_level_values('method') == 'HMM', 'color'] = 'r'
    info.loc[info.index.get_level_values('method') == 'MarkovChain', 'color'] = 'darkblue'
    # info.loc[info.index.get_level_values('q') == 0.99, 'color'] = 'r'
    # info.loc[info.index.get_level_values('q') == 1, 'color'] = 'darkblue'
    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi)
    pplt.setStyle(lw=1)
    pplt.plotOnePower(df.xs(0.005, level='nu0'), info, axes[0], legendSubplot=0, ylabel='Hard')
    pplt.plotOnePower(df.xs(0.1, level='nu0'), info, axes[1], ylabel='Soft')
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7)
     for j, x in enumerate(axes.reshape(-1))]
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('powerCLR', dpi=dpi)
    plt.show()
def train_model():
    global df_rest_p1
    global df_rest_p2
    global df_rest_p3
    print "training initial..."
    person = request.form.get('person')
    df = pd.DataFrame()
    if person == 'Person1':
        df = pd.read_pickle("data_3/person2_weather")
        df, df_demo = splitTrainingset_for_demo(df, 0.8)
        df_rest_p1 = df_demo
    elif person == 'Person2':
        df = pd.read_pickle("data_3/person3_weather")
        df, df_demo = splitTrainingset_for_demo(df, 0.8)
        df_rest_p2 = df_demo
    elif person == 'Person3':
        df = pd.read_pickle("data_3/person4_weather")
        df, df_demo = splitTrainingset_for_demo(df, 0.8)
        df_rest_p3 = df_demo
    X, y = data_preparation(df)
    json_plot = train_1_sample_batches_predict_next_sample(X, y, person)
    json_data = json.dumps(json_plot)
    return json_data
def pickling():
    z = pd.read_pickle('z.pkl')
    y = pd.read_pickle('y.pkl')
    fi = pd.read_pickle('fi.pkl')
    em = pd.read_pickle('em.pkl')
    em_y = pd.read_pickle('em_y.pkl')
    return (z, y, fi, em, em_y)
def plotSurf():
    from scipy import interpolate
    a = pd.read_pickle(utl.outpath + 'real/real.maxLikelihoods.df')
    idx = (a.s.abs() * a.h.abs() * (a.alt - a.null)).sort_values().index[-1]
    R = pd.DataFrame(pd.read_pickle(utl.outpath + 'real/real.replicates.df').loc[idx]).T
    SH = dta.getSH()
    ARGS = [(R,) + sh for sh in SH]
    likelihoods = pd.concat(map(mkv.computeLikelihoodReal, ARGS), axis=1)
    likelihoods.columns.names = ['s', 'h']
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    # NOTE: 'b' is undefined here in the original; the likelihood table built above
    # ('likelihoods') is presumably what was meant.
    df = pd.concat([pd.Series(z[1].loc[z[0]].values, index=z[1].loc[z[0]].index, name=z[0])
                    for z in b.groupby(level=0)], axis=1)
    Z = df.values
    # Z[Z == Z.min()] = -1e3
    X = np.tile(df.index.values[:, None], Z.shape[1])
    Y = np.tile(df.columns.values[:, None], Z.shape[0]).T
    Z.min()
    Z.max()
    nn = 401
    xi = np.linspace(-1.0, 2.0, 10)
    yi = np.linspace(-0.5, 0.5, nn)
    f = interpolate.interp2d(X, Y, Z, kind='cubic')
    zi = f(xi, yi)
    [xi, yi] = np.meshgrid(xi, yi)
    # surf = ax.plot_surface(X, Y, Z, cmap=mpl.cm.autumn)
    surf = ax.plot_surface(xi, yi, zi, cmap=mpl.cm.autumn)
    fig.colorbar(surf, shrink=0.5, aspect=5)
    # surf(xi, yi, zi, 'LineStyle', 'none', 'FaceColor', 'interp')
    plt.show()
def helpfulModelingPipelineRFC():
    print "Loading pickles..."
    # comments_discussion_df = pd.read_pickle('comments_discussion.p')
    X = pd.read_pickle('X.p')
    y_actual = pd.read_pickle('y_actual.p')

    X_train, X_test, y_actual_train, y_actual_test = train_test_split(
        X, y_actual, test_size=0.15, random_state=0)
    print y_actual_train.head()

    # pca = PCA(n_components=1)
    # use only SelectKBest to select features
    selection = SelectKBest(f_classif, k=15)
    X_features = selection.fit(X_train.iloc[:, 0:len(X.columns) - 2], y_actual_train) \
        .transform(X_train.iloc[:, 0:len(X_train.columns) - 2])

    rfc = RandomForestClassifier(criterion='entropy')

    # Do grid search over k, n_components and C:
    pipeline = Pipeline([('feature_selection', selection), ('rfc', rfc)])
    param_grid = dict(feature_selection__k=[11, 13, 14, 15, 16],
                      rfc__n_estimators=[950, 1000, 1050],
                      rfc__max_depth=[13, 14, 15, 16],
                      rfc__min_samples_split=[4, 5, 6, 7],
                      rfc__min_samples_leaf=[1, 2, 3])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='precision',
                               cv=20, verbose=10, n_jobs=15)
    grid_search.fit(X_train.iloc[:, 0:len(X_train.columns) - 2], y_actual_train['is_helpful'].values)
    print(grid_search.best_estimator_)
    # print "All columns:" + str(X.columns)
    # print "Just the selected columns:" + str(X.columns[pipeline.named_steps['selection'].get_support()])
    pickle.dump(grid_search.best_estimator_, open("rfc_best_estimator.p", "wb"))
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(path=os.path.join(params['datastore'], params['train_datapath']))
    validation_data = pd.read_pickle(path=os.path.join(params['datastore'], params['validation_datapath']))

    svd_params = {p: params[p] for p in ['random_state', 'n_epochs', 'verbose', 'biased', 'n_factors',
                                         'init_mean', 'init_std_dev', 'lr_all', 'reg_all',
                                         'lr_bu', 'lr_bi', 'lr_pu', 'lr_qi',
                                         'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi']}
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(params['surprise_reader'])) \
        .build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params['rating_metrics']
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data,
                                                 usercol=params['usercol'], itemcol=params['itemcol'])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params['ranking_metrics']
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data,
                                                      usercol=params['usercol'],
                                                      itemcol=params['itemcol'],
                                                      recommend_seen=params['recommend_seen'])
        k = params['k']
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data, all_predictions,
                                                 col_prediction='prediction', k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get('NNI_OUTPUT_DIR')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params['primary_metric']] = temp_dict.pop('default')
        json.dump(temp_dict, fp)

    return svd
def __init__(self, psr, loadvec=True):
    self.log = logging.getLogger('Source')
    self.log.debug('Initializing source.')

    self.psr = psr
    self.npsrs = 1
    self.path = paths.vectors + 'srcVec_' + psr

    # If necessary, rebuild catalogue; otherwise, just load catalogue.
    self.log.debug('Looking for catalogue.')
    try:
        f = open(paths.textfromATNF, 'r')
        pd.read_pickle(paths.psrcat)
    except IOError:
        self.log.warning('No PSR catalogue found.')
        self.build_catalogue()
        f = open(paths.textfromATNF, 'r')
    finally:
        self.log.debug('Checking catalogue.')
        f_text = f.read()
        f.close()

    if self.psr not in f_text:
        self.log.debug('PSR not in catalogue.')
        self.build_catalogue()

    self.log.debug('Reading catalogue.')
    psrcat = pd.read_pickle(paths.psrcat)
    self.param = psrcat.ix[self.psr]

    if loadvec:
        self.loadVectors()
def helpfulModelingPipelineGBC():
    # load the pickles
    print "Loading pickle..."
    X = pd.read_pickle('X.p')
    y_actual = pd.read_pickle('y_actual.p')

    print "X head without the body and the comment_id:"
    print X.iloc[:, 0:len(X.columns) - 2].head()
    print "y_actual:"
    print y_actual['is_helpful'].values

    X_train, X_test, y_actual_train, y_actual_test = train_test_split(
        X, y_actual['is_helpful'].values, test_size=0.15, random_state=0)

    selection = SelectKBest(f_classif, k=15)
    X_features = selection.fit_transform(X_train.iloc[:, 0:len(X.columns) - 2], y_actual_train)
    gbc = GradientBoostingClassifier(n_estimators=200)
    print np.unique(X_train.iloc[:, 5:6])

    # Create a pipeline of feature selection and gradient boosting classifier
    pipeline = Pipeline([('feature_selection', selection), ('gbc', gbc)])
    param_grid = dict(feature_selection__k=[9, 10, 11, 12, 14],
                      gbc__n_estimators=[450, 500, 550],
                      gbc__max_depth=[33, 35, 40],
                      gbc__min_samples_split=[1, 2, 3],
                      gbc__min_samples_leaf=[2, 3, 4])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='recall', cv=15, verbose=10, n_jobs=15)
    grid_search.fit(X_train.iloc[:, 0:len(X_train.columns) - 2], y_actual_train)
    print(grid_search.best_estimator_)
    print "Just the selected columns:" + str(
        X.iloc[:, 0:len(X.columns) - 2].columns[pipeline.named_steps['feature_selection'].get_support()])
    pickle.dump(grid_search.best_estimator_, open("gbc_best_estimator.p", "wb"))
def train_model():
    """
    Train a scikit-learn model on top of the imagefeatures
    """
    print "Loading data into memory..."
    train = pd.read_pickle(file_path('train-features'))
    test = pd.read_pickle(file_path('test-features'))
def sample_ratings_large():
    df_titles = pd.read_pickle('df_titles_condensed.obj')
    df_ratings = pd.read_pickle('df_ratings_condensed.obj')
    df_titles.dropna(how='any', subset=['plot', 'storyline', 'genre', 'years'], inplace=True)
    valid_ids = set(df_titles['movie_id'])
    df_ratings = df_ratings[df_ratings['movie_id'].isin(valid_ids)]

    def condense(df_titles, df_ratings, title_ratings, user_ratings=20):
        valid_ids = set(df_titles['movie_id'])
        df_ratings = df_ratings[df_ratings['movie_id'].isin(valid_ids)]
        old_shape = (0, 0)
        titles_to_keep = 0
        while old_shape != df_ratings.shape:
            print(df_titles.shape)
            old_shape = df_ratings.shape
            agg = df_ratings.groupby('movie_id').count()
            titles_to_keep = set(agg[agg['user_id'] > title_ratings].index)
            agg = df_ratings.groupby('user_id').count()
            users_to_keep = set(agg[agg['movie_id'] > user_ratings].index)
            df_ratings = df_ratings[df_ratings['movie_id'].isin(titles_to_keep)]
            df_ratings = df_ratings[df_ratings['user_id'].isin(users_to_keep)]
            df_titles = df_titles[df_titles['movie_id'].isin(titles_to_keep)]
        print('%d/%d: found %d titles with %d ratings' % (user_ratings, title_ratings,
                                                          len(titles_to_keep), df_ratings.shape[0]))
        df_ratings.to_pickle('df_ratings_condensed_2.obj')
        df_titles.to_pickle('df_titles_condensed_2.obj')
        pdb.set_trace()
def read(param):
    """
    data is sorted first by Chrom and then POS in addGlobalPos.
    Important to have them sorted together
    """
    try:
        meta = pd.read_pickle(param['dspath'] + param['dsname'] + '.meta.df')
        snp = pd.read_pickle(param['dspath'] + param['dsname'] + '.snp.df')
    except:
        if param['Region'] == 'Peru' and param['dsname'] == 'all':
            meta = Data.readPeruAll()
        elif param['Region'] == 'Peru' and param['dsname'] == 'winzeler':
            meta = Data.readPeruFiltered()
        elif param['Region'] == 'Sudan':
            meta = Data.readSudan()
        else:
            print >> sys.stderr, 'Bad Parameter: ', param
            exit()
        meta = Data.removeNonPolymorphicandTriAllele(meta, param)
        meta = Data.correctCall(meta, param)
        meta = Data.computeRC(meta, param)
        meta.ix[:, 'hetero'] = meta[param['names']].apply(lambda x: ((x == '0/1') | (x == '1/0')).sum(), axis=1)
        meta = pd.concat([meta, meta[param['names']].apply(lambda x: x.value_counts(), axis=1).fillna(0)], axis=1)
        meta['0/1'] += meta['1/0']
        meta.drop(['1/0'], axis=1, inplace=True)
        calls = meta[param['names']]
        snp = pd.concat([pd.DataFrame(calls.applymap(lambda x: x.split('/')[0]).values, columns=calls.columns + 'maj'),
                         pd.DataFrame(calls.applymap(lambda x: x.split('/')[1]).values, columns=calls.columns + 'min')],
                        axis=1).astype(int).T.sort_index()
        snp.columns = calls.index.values
        # major is always zero in heterozygotes in the other getsnp function;
        # 1/0 is possible, for example line 7 mdio08 in the xlsx
        from popgen.Plasmodium.Run import runHW
        meta = runHW(param, meta)
        meta.to_pickle(param['dspath'] + param['dsname'] + '.meta.df')
        snp.to_pickle(param['dspath'] + param['dsname'] + '.snp.df')
    return snp, meta
def transform_data():
    """
    Passes the downloaded data through the indico imagefeatures API
    """
    train = pd.read_pickle(file_path('train'))
    test = pd.read_pickle(file_path('test'))

    # limit to the first 10000 training examples
    train = train[:10000]

    train.name, test.name = 'train', 'test'

    for df in (train, test):
        imagefeatures = []
        i = 0
        batch_size = 50
        n = len(df.data) / batch_size
        print "Fetching %s imagefeatures..." % (df.name)
        for df_batch in batch(df.data, batch_size):
            print "\t%d/%d" % (i, n)
            imagefeatures.extend(batch_image_features(df_batch))
            i += 1
        df['features'] = imagefeatures
        df.to_pickle(file_path("cifar10-%s-features.pkl" % df.name))
def my_form_post(answer=None):
    filename = '/home/seonhoon/Desktop/workspace/ImageQA/data/dict.pkl'
    with open(filename, 'rb') as fp:
        idx2word, word2idx, idx2answer, answer2idx = cPickle.load(fp)

    text = request.form['text']
    print text
    question = text.split()
    q_idx = []
    for i in range(len(question)):
        q_idx.append(word2idx[question[i]])
    q_idx = np.array(q_idx)
    print q_idx

    # running caffe and tensorflow seems not so easy simultaneously
    pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA_Web/cnn.pkl')
    x_img = np.array([pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA_Web/cnn.pkl')['cnn_feature'][0].tolist()])
    x, x_mask = prepare_data([q_idx], config.steps)
    y = test_sample(x, x_mask, x_img)
    print idx2answer[y[0]]

    params = {'answer': idx2answer[y[0]], 'text': text}
    return render_template('iqa.html', **params)
def runHMM(h, stepS=0.05, eps=1e-1, CD=None, E=None, save=True, verbose=1):
    if CD is None:
        CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:
        E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = getNullLikelihoods(CD, E)
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))
    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h))
    neg = likes_thn[likes_null <= likes_thn]
    zero = likes_null.loc[(likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index]
    pos = likes_thp.loc[(likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index]
    if verbose > 0:
        print 'N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(CD.shape[0], zero.size, zero.size / float(CD.shape[0]) * 100, pos.size, neg.size)
        sys.stdout.flush()
    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt'])
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps, stepS)
    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
def get_specific_meta():
    t = time.time()
    meta_df = pd.read_pickle('/home/max/Documents/project4/data/metadata_drop.pkl')
    t = print_time_elapsed(t, ' read meta')
    categories = ['Baby', 'Beauty', 'Books', 'Cell_Phones_and_Accessories',
                  'Clothing_Shoes_and_Jewelry', 'Home_and_Kitchen', 'Movies_and_TV',
                  'Automotive', 'CDs_and_Vinyl', 'Toys_and_Games', 'Video_Games']
    df_list = []
    for category in categories:
        print 'starting ' + category
        cut_df = pd.DataFrame()
        df = pd.read_pickle('/home/max/Documents/project4/data/' + category + '.pkl')
        category_asins = pd.DataFrame(pd.Series(np.unique(df['asin'])), columns=['asin'])
        t = print_time_elapsed(t, ' read ' + category)
        print category_asins
        print df
        cut_df = pd.merge(category_asins, meta_df, on='asin', how='inner')
        print cut_df
        t = print_time_elapsed(t, ' merge meta ' + category)
        cut_df = pd.merge(df, cut_df, on='asin', how='inner')
        print cut_df
        t = print_time_elapsed(t, ' merge ' + category)
        cut_df['category'] = category
        cut_df = cut_df.drop_duplicates('reviewText')
        df_list.append(cut_df)
    print ' STARTING CONCAT '
    big_df = pd.concat(df_list, ignore_index=True)
    t = print_time_elapsed(t, ' concat ')
    big_df = big_df.set_index('asin')
    big_df = big_df.sort_index()
    print 'starting write'
    big_df.to_pickle('/home/max/Documents/project4/data/all_merged.pkl')
    t = print_time_elapsed(t, ' write')
    return big_df
def createCADD():
    " less 1000G_phase3_inclAnno.tsv.gz | cut -f1,2 > coord.hg19.tsv"
    ' bedtools intersect -sorted -a Kyrgyz.hg19.tsv -wb -b ../CADD/1000G_phase3_inclAnno.tsv > CADD.hg19.tsv '
    cad = pd.read_csv(kutl.path + 'data/CADD.hg19.tsv', sep='\t', header=None).iloc[:, 3:] \
        .rename(columns={3: 'CHROM', 4: 'POS'}).sort_values(['CHROM', 'POS']).set_index('CHROM')
    coor = pd.read_pickle(kutl.path + 'data/map.df').dropna().apply(lambda x: x.astype(int)) \
        .set_index(19, append=True)[38].rename('POShg38')
    pd.read_pickle(kutl.path + 'data/map.df').isnull().sum()
    cad.iloc[:10000].groupby(level=0).apply(
        lambda x: pd.merge(coor.loc[str(x.name)].sort_index().reset_index(), x,
                           left_on=19, right_on='POS').iloc[:, 2:])
def plotWealthProcess():
    startDate, endDate = date(2005, 1, 1), date(2013, 12, 31)
    n_rvs = (5, 10)
    hist_periods = (20, 30, 40, 50, 60, 70, 80)
    n_scenario = 200
    alphas = ("0.5", "0.55", "0.6", "0.65", "0.7", "0.75", "0.8", "0.85", "0.9", "0.95")

    for n_rv in n_rvs:
        bh_wealthProcess = pd.read_pickle(os.path.join(ExpResultsDir, "buyhold_wealthprocess_n%s.pkl" % (n_rv)))
        for alpha in alphas:
            for hdx, hist_period in enumerate(hist_periods):
                paramDir = os.path.join(ExpResultsDir, "n%s_h%s_s%s_a%s" % (n_rv, hist_period, n_scenario, alpha))
                expDirs = glob.glob(os.path.join(paramDir, "fixedSymbolSPPortfolio_20050103-20131231_*"))
                for rdx, expDir in enumerate(expDirs):
                    t = time.time()
                    runTime = expDir[expDir.rfind('_') + 1:]
                    wealthPkl = os.path.join(expDir, 'wealthProcess.pkl')
                    depositPkl = os.path.join(expDir, 'depositProcess.pkl')
                    if not os.path.exists(wealthPkl) or not os.path.exists(depositPkl):
                        continue
                    wealth = pd.read_pickle(wealthPkl)
                    deposit = pd.read_pickle(depositPkl)
                    # combine
                    wealth['deposit'] = deposit
                    tWealth = wealth.sum(axis=1)
def main():
    from_str = args.cf.split('/')[-1].split('.')[0]
    df_file = args.pref + from_str + '_' + str(args.yr) + '_feat_matrix_unnormed.pkl'
    if args.split == None:
        df_raw = pd.read_pickle(df_file)
    else:
        # combine data matrix and score pickles
        split_files = glob.glob(args.pref + 'split_dm/*' + args.split + '*unnorm*.pkl')
        sort_files = natsort.natsorted(split_files)
        split_scores = glob.glob(args.pref + 'split_dm/*' + args.split + '*scores*.pkl')
        sort_scores = natsort.natsorted(split_scores)
        for ndx, pf in enumerate(sort_files):
            if ndx == 0:
                df_raw = pd.read_pickle(pf)
                scores = pd.read_pickle(sort_scores[ndx])
            else:
                df_raw = df_raw.append(pd.read_pickle(pf))
                scores = scores.append(pd.read_pickle(sort_scores[ndx]))
        # combine score pickles
        df_file_out = df_file.replace('_feat_matrix_unnormed', '_scores')
        scores.to_pickle(df_file_out)

    # cut columns of all zeros
    df_trim = df_raw[df_raw.columns[(df_raw != 0).any()]]
    # normalize on a per candidate basis
    df_trim_norm = df_trim.div(df_trim.sum(axis=1), axis=0)
    df_file_out = df_file.replace('unnormed', 'trim_normed')
    df_trim_norm.to_pickle(df_file_out)
def predict_live(batterattrfile, battermodelfile, batterdatafile,
                 pitcherattrfile, pitchermodelfile, pitcherdatafile,
                 predictionfile, na_treatment='zero'):
    # Apply model, save results
    batterattrs = read_attrs(batterattrfile)[1:]
    batter_model = pickle.load(open(battermodelfile, 'r'))
    batter_data = pd.read_pickle(batterdatafile)
    pitcherattrs = read_attrs(pitcherattrfile)[1:]
    pitcher_model = pickle.load(open(pitchermodelfile, 'r'))
    pitcher_data = pd.read_pickle(pitcherdatafile)

    if na_treatment == 'zero':
        usable_batter_data = batter_data[batterattrs].fillna(0)
        usable_pitcher_data = pitcher_data[pitcherattrs].fillna(0)
    elif na_treatment == 'drop':
        usable_batter_data = batter_data[batterattrs].dropna()
        usable_pitcher_data = pitcher_data[pitcherattrs].dropna()

    batter_data['prediction'] = pd.Series(batter_model.predict(usable_batter_data),
                                          index=usable_batter_data.index)
    pitcher_data['prediction'] = pd.Series(pitcher_model.predict(usable_pitcher_data),
                                           index=usable_pitcher_data.index)

    keep_cols = ['fullname', 'player_id', 'Position', 'Team', 'Salary', 'prediction']
    batter_output = batter_data[keep_cols]
    pitcher_output = pitcher_data[keep_cols]
    pd.concat([batter_output, pitcher_output]).to_pickle(predictionfile)
def cargarPickle(filename):
    """
    Loads touchs and sounds stored in pickle format. Wraps other helpers; check for overlap.
    """
    import pandas as pd
    import os
    from IPython.display import display
    import sys
    if not sys.version_info[:2] == (3, 4):
        print('You are an idiot, but a far-sighted one')
        print('This code is meant to run on Python 3.4')
    if os.path.isfile('./Guardados/' + filename + '.touch'):
        touchs = pd.read_pickle('./Guardados/' + filename + '.touch')
    else:
        display('Error, the requested touchs were not found')
        return
    if os.path.isfile('./Guardados/' + filename + '.sounds'):
        # the original read the '.touch' file here; '.sounds' is clearly the intended file
        sounds = pd.read_pickle('./Guardados/' + filename + '.sounds')
    else:
        display('Error, the requested sounds were not found')
        return
    return touchs, sounds
def get_pkl_files(root_dir, pkl_dir, poem_file, vec_file, vectorizer_file):
    import pickle
    import pandas as pd
    df_poems = pd.read_pickle(root_dir + pkl_dir + '/' + poem_file)
    df_vecs = pd.read_pickle(root_dir + pkl_dir + '/' + vec_file)
    vectorizer = pickle.load(open(root_dir + pkl_dir + '/' + vectorizer_file, "rb"))
    return df_poems, df_vecs, vectorizer
def fetch_data(district=None, from_pickle=False, pickle_filename=None,
               unit_col='student_id', time_col='grade_level'):
    if from_pickle and pickle_filename is not None:
        print("Reading pickle file.")
        data = pd.read_pickle(pickle_filename + '.pkl')
        if os.path.isfile(pickle_filename + '_cats' + '.pkl'):
            feature_categories = pd.read_pickle(pickle_filename + '_cats' + '.pkl')
        else:
            feature_categories = None
    else:
        # Retrieve time-invariant features, time-variant features, and outcome labels.
        cohorts, features_constant, features_by_time, feature_categories, labels = extract_data(district)
        features_by_time = features_by_time.drop(['cohort', 'academic_year'], 1)

        # Extract features.
        features = extract_features(features_constant,   # time-invariant features
                                    features_by_time,    # time-variant features
                                    unit_col=unit_col,   # instance identifier column
                                    time_col=time_col,   # time unit column
                                    )

        # Extract outcome labels.
        labels = labels[['student_id', 'outcome_label']]
        labels = labels.dropna()

        # Extract instance-level data. Each instance has an identifier, one or more features, and a label.
        data = extract_instances(features, labels, unit_col='student_id')

        if pickle_filename is not None:
            data.to_pickle(pickle_filename + '.pkl')
            if feature_categories is not None:
                feature_categories.to_pickle(pickle_filename + '_cats.pkl')

    return data, feature_categories
def prep_for_modeling():
    all = pd.read_pickle("./this_is_the_set_i_built_the_models_on_.pkl")

    with open("./exog_rf_.txt", "r") as f:
        temp = f.read()
    exog = temp.splitlines()

    all = bucketize(all, exog)

    with open("./exog_rf__.txt", "r") as f:
        temp = f.read()
    exog = temp.splitlines()

    train = all[all["train"] == 1]
    test = all[all["test"] == 1]
    validate = all[all["validate"] == 1]

    endog = get_variables("./expenses13_.txt")
    office = endog[0]
    outpatient = endog[1]
    er = endog[2]
    inpatient = endog[3]

    w = get_variables("weights13_.txt")
    w = w[0]

    insurance = pd.read_pickle("../data/insurance_current_.pkl")

    return (all, train, test, validate, endog, office, outpatient, inpatient, er, w, exog, insurance)
def plot(y, MAP=True, fontsize=30, figsize=(18, 10), dpi=80):
    plt.figure(figsize=figsize, dpi=dpi)
    mpl.rc('font', **{'family': 'serif', 'serif': ['Times'], 'size': fontsize})
    m = y.mean(axis=1)
    print m
    title = ["MRR", "MAP"][MAP]
    labels = ['GEO', 'Jaccard', 'R', 'RI', 'Pref1', 'Pref5', 'RIP1', 'RIP5']
    x = range(len(labels))
    error = y.std(axis=1) / 2

    pref = [pd.read_pickle(path + 'prefFB{}.pkl'.format(i)) for i in [1, 5]]
    print pref
    prefm = map(lambda x: x[('AP', 'MRR')[not MAP]]['mean'], pref)
    prefs = map(lambda x: x[('AP', 'MRR')[not MAP]]['std'] / 2, pref)
    print prefm
    m = np.append(m, prefm)
    error = np.append(error, prefs)

    rip = [pd.read_pickle(path + 'RIP{}.pkl'.format(i)) for i in [1, 5]]
    prefm = map(lambda x: x[('AP', 'MRR')[not MAP]]['mean'], rip)
    prefs = map(lambda x: x[('AP', 'MRR')[not MAP]]['std'] / 2, rip)
    m = np.append(m, prefm)
    error = np.append(error, prefs)

    plt.errorbar(x, m, yerr=error, fmt='ok', linewidth=2, markersize=15)
    plt.xlim([-1, len(x)])
    plt.ylim([-0.01, max(m) + max(error) + 0.05])
    plt.grid()
    plt.xticks(x, labels)
    plt.title(title)
    plt.savefig(path + title + '.png')
    plt.show()
def concatDataFrames():
    score_df = pd.read_pickle('../../dataset/score_df_tst.pickle')
    tf_idf_df = pd.read_pickle('../../dataset/score_df_tfidf_tst.pickle')
    lsa_df = pd.read_pickle('../../dataset/score_df_lsa_cvect_tst.pickle')

    # Read additional features from the result of feature_engineering
    # and append to score_df before saving it.
    # Read from file
    preprocessed_path = '../../dataset/features_t.csv'
    features_df = None
    should_add_features = False
    if os.path.isfile(preprocessed_path):
        print("Found Preprocessed DataFrame... Begin appending features to score matrix")
        features_df = pd.read_csv(preprocessed_path, index_col=0)
        feature_cols = list(features_df.columns.values)
        features_np_arr = np.array(features_df)
        should_add_features = True
    else:
        print("Not Found Preprocessed DataFrame")
        return None

    if should_add_features:
        features_df = pd.DataFrame(features_np_arr, index=score_df.index, columns=feature_cols)
        result = pd.concat([score_df, tf_idf_df, lsa_df, features_df], axis=1, ignore_index=True)
        print result.shape
        result.to_pickle('../../dataset/score_df_final_tst.pickle')
checkpoint(model, modelpath)

print("End. Best Iteration {}: HR = {:.4f}, NDCG = {:.4f}. ".format(
    best_iter, best_hr, best_ndcg))
if save_model:
    print("The best MLP model is saved to {}".format(modelpath))

if save_model:
    if not os.path.isfile(resultsdfpath):
        results_df = pd.DataFrame(columns=[
            "modelname", "best_hr", "best_ndcg", "best_iter", "train_time"
        ])
        experiment_df = pd.DataFrame(
            [[modelfname, best_hr, best_ndcg, best_iter, train_time]],
            columns=[
                "modelname", "best_hr", "best_ndcg", "best_iter", "train_time"
            ])
        results_df = results_df.append(experiment_df, ignore_index=True)
        results_df.to_pickle(resultsdfpath)
    else:
        results_df = pd.read_pickle(resultsdfpath)
        experiment_df = pd.DataFrame(
            [[modelfname, best_hr, best_ndcg, best_iter, train_time]],
            columns=[
                "modelname", "best_hr", "best_ndcg", "best_iter", "train_time"
            ])
        results_df = results_df.append(experiment_df, ignore_index=True)
        results_df.to_pickle(resultsdfpath)
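# Note on the bookkeeping above: DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0. A minimal sketch of the same append-or-create logic written with pd.concat,
# assuming the same resultsdfpath and result columns; the helper name log_experiment is
# hypothetical and introduced here only for illustration.
import os
import pandas as pd

def log_experiment(resultsdfpath, modelfname, best_hr, best_ndcg, best_iter, train_time):
    # One-row frame describing the current experiment.
    experiment_df = pd.DataFrame(
        [[modelfname, best_hr, best_ndcg, best_iter, train_time]],
        columns=["modelname", "best_hr", "best_ndcg", "best_iter", "train_time"])
    if os.path.isfile(resultsdfpath):
        # Load prior results and concatenate the new row (replaces DataFrame.append).
        results_df = pd.concat([pd.read_pickle(resultsdfpath), experiment_df], ignore_index=True)
    else:
        results_df = experiment_df
    results_df.to_pickle(resultsdfpath)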
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import os

df = pd.read_pickle(
    os.path.join('/Users/amandashay/Documents/corepy', 'data_frame.pickle'))

# Smaller object for easier vis
small_df = df.iloc[49980:50019, :].copy()

# Basic Excel
small_df.to_excel("basic.xlsx")
small_df.to_excel("no_index.xlsx", index=False)
small_df.to_excel("columns.xlsx", columns=["artist", "title", "year"])

# Multiple worksheets
writer = pd.ExcelWriter('multiple_sheets.xlsx', engine='xlsxwriter')
small_df.to_excel(writer, sheet_name="Preview", index=False)
df.to_excel(writer, sheet_name="Complete", index=False)
writer.save()

# Conditional formatting
artist_counts = df['artist'].value_counts()
artist_counts.head()
writer = pd.ExcelWriter('colors.xlsx', engine='xlsxwriter')
artist_counts.to_excel(writer, sheet_name="Artist Counts")
sheet = writer.sheets['Artist Counts']
cells_range = 'B2:B{}'.format(len(artist_counts.index))
sheet.conditional_format(
from __future__ import division
import bilby
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pylab import hist, diag
import scipy.integrate as integrate
from scipy.integrate import simps
from scipy.special import gamma, factorial

# A few simple setup steps
label = 'linear_regression_unknown_noise'
outdir = 'outdir_2_component_2nd_fixed_2'
bilby.utils.check_directory_exists_and_if_not_mkdir(outdir)

fdfs = pd.read_pickle("./Freq_small_df.pkl")
Edata = fdfs["snr"]

# Gauss components
def gauss1(x, f, mu, sigma):
    C1 = 2 / ((2**(1 + (1 / 2))) * sigma * gamma(1 / 2))
    return f * C1 * np.exp(-0.5 * (np.abs((x - mu) / sigma)**2))

def gauss2(x, f, mu, sigma, alpha):
    C2 = alpha / ((2**(1 + (1 / alpha))) * sigma * gamma(1 / alpha))
    return (1 - f) * C2 * np.exp(-0.5 * (np.abs((x - mu) / sigma)**alpha))

def bimodal(x, f, mu1, sigma1, mu2, sigma2, alpha2):
        'chain_path': map(lambda x: os.path.join(trainset_chain_dir, x), trainset_chain_dir_temp)
    })
    testset = pd.DataFrame({
        'image_path': map(lambda x: os.path.join(testset_dir, x), testset_dir_temp),
        'chain_path': map(lambda x: os.path.join(testset_chain_dir, x), testset_chain_dir_temp)
    })

    trainset.to_pickle(trainset_path)
    testset.to_pickle(testset_path)
else:
    trainset = pd.read_pickle(trainset_path)
    testset = pd.read_pickle(testset_path)

# testset.index = range(len(testset))
# testset = testset.ix[np.random.permutation(len(testset))]

is_train = tf.placeholder(tf.bool)
learning_rate = tf.placeholder(tf.float32, [])
ll = tf.placeholder(tf.int32, [batch_size])
net_size_tf = tf.placeholder(tf.int32, [net_size])
images_tf = tf.placeholder(tf.float32, [batch_size, net_size, net_size, 3], name="images")
images_global = tf.placeholder(tf.float32, [batch_size, net_size, net_size, 3], name="images_global")
# reconstruction_global = tf.placeholder(tf.float32, [batch_size, 128, 128, 3], name="rec_global")

fake_length = 5
def process_run(config_file, label_function=None, kwargs={}): """Process one dataset from a config file using the prescribed label function""" global_scale = kwargs.get("global_scale") label_functions_pkg = kwargs.get("label_functions_pkg") data_filenames = kwargs.get("data_filenames") plot_raw = kwargs.get("plot_raw", False) cycle_key = kwargs.get("cycle_key", None) # Get data folder config = iter_utils.load_yaml(config_file) # Get the global scale global_scale = float(config.get("setup", {}).get("global_scale", 1.0)) if global_scale is not None: global_scale = global_scale global_scale_inv = 1.0 / global_scale # Set up the label functions if label_function is None: process = config.get("setup", {}) label_funs_to_get = process.get("label_functions", "default") else: label_funs_to_get = label_function default_fun = getattr(label_functions_pkg, "default") default_args = inspect.getfullargspec(default_fun)[0] if isinstance(label_funs_to_get, str): label_funs_to_get = [label_funs_to_get] if isinstance(label_funs_to_get, list): methods_to_call = dict() for curr_name in label_funs_to_get: curr_fun = getattr(label_functions_pkg, curr_name, None) if curr_fun is not None: args = inspect.getfullargspec(curr_fun)[0] methods_to_call[curr_name] = {"function": curr_fun, "args": args} if methods_to_call: label_function = methods_to_call else: label_function = { "default": {"function": default_fun, "args": default_args} } else: label_function = {"default": {"function": default_fun, "args": default_args}} # Get the names of the args specified in the label function label_fun_inputs = [] for label_fun_key in label_function: label_fun_inputs.extend(label_function[label_fun_key]["args"]) label_fun_inputs = list(set(label_fun_inputs)) # Get file locations folder = iter_utils.get_group_folder(config) print(folder) success_filename = os.path.join(folder, "summary.yaml") # Unzip the sweep sweep = config["sweep"] sweep_vars = [] sweep_labels = [] sweep_values = [] sweep_diffs = [] sweep_lookup = [] for param in sweep: sweep_vars.append(iter_utils.parse_variable_name(param["variable"])) sweep_values.append([]) if param.get("max", None) is not None: if param["num_steps"] > 1: sweep_diffs.append( (param["max"] - param["min"]) / (param["num_steps"] - 1) ) else: sweep_diffs.append(0.0) folder_param = param.get("folder", None) if folder_param is not None: folder_setup = os.path.join(folder_param, "sweep_values.yaml") if os.path.isfile(folder_setup): curr_lookup = iter_utils.load_yaml(folder_setup) f = {} f["names"] = [ os.path.join(folder_param, row["name"]) for row in curr_lookup["files"] ] f["values"] = [row["values"] for row in curr_lookup["files"]] curr_lookup["files"] = f sweep_lookup.append(curr_lookup) sweep_labels.append(curr_lookup["variables"]) else: sweep_lookup.append(None) else: sweep_lookup.append(None) sweep_labels.append( iter_utils.parse_variable_name(param.get("label", None)) ) print(sweep_vars) # Get the list of all folders run_folders = iter_utils.get_folders(folder) # Read in each data file and parse it label_vals = {} for key in label_function: label_vals[key] = [] num_finger_segs = [] for curr_folder in run_folders: print(curr_folder) param_filename = os.path.join(curr_folder, "params.yaml") params = iter_utils.load_yaml(param_filename) for idx, var in enumerate(sweep_vars): val = iter_utils.get_from_dict(params, var) if sweep_lookup[idx] is not None: try: num_idx = sweep_lookup[idx]["files"]["names"].index(val) val_use = sweep_lookup[idx]["files"]["values"][num_idx] except ValueError: val_use = 
val else: val_use = val sweep_values[idx].append(val_use) # Get object position data if needed if "objectpose" in label_fun_inputs: fields = [ "timeStamp", "objectId", "posX", "posY", "posZ", "oriX", "oriY", "oriZ", "oriW", ] pose_file = os.path.join(curr_folder, data_filenames["objectpose"]) reader = iter_utils.read_parse_data(pose_file) df = reader.make_dataframe(fields) for pos in ["posX", "posY", "posZ"]: df[pos] = global_scale_inv * df[pos] # Get euler angles from the quaternions euler = [] for quaternion in zip(df["oriX"], df["oriY"], df["oriZ"], df["oriW"]): # print(quaternion) euler.append(p.getEulerFromQuaternion(quaternion)) euler = np.array(euler) euler = np.unwrap(euler, axis=0) euler = np.rad2deg(euler) df["eulerX"] = euler[:, 0] df["eulerY"] = euler[:, 1] df["eulerZ"] = -euler[:, 2] df_rel = df - df.iloc[0].values.squeeze() if plot_raw and cycle_key is not None: act_file = os.path.join(curr_folder, data_filenames["actuation"]) iter_utils.graph_data( df_rel, filename=pose_file, cyc_filename=act_file, cyclic_key=cycle_key, ) # iter_utils.graph_cyclic(df, act_file, cycle_key) else: df = None # Get contact data if needed if "contact" in label_fun_inputs: filename_contact = os.path.join(curr_folder, data_filenames["contact"]) if os.path.exists(filename_contact): fields = [ "timeStamp", "stepCount", "bodyUniqueIdA", "bodyUniqueIdB", "linkIndexA", "linkIndexB", ] reader = iter_utils.read_parse_data(filename_contact) df_contact = reader.make_dataframe(fields) else: df_contact = None else: df_contact = None # Get actuation data if needed if "actuation" in label_fun_inputs: filename_actuation = os.path.join(curr_folder, data_filenames["actuation"]) if os.path.exists(filename_actuation): df_actuation = pd.read_pickle(filename_actuation) for col in df_actuation.columns.values: if "actuation" in col: df_actuation[col] = pow(global_scale_inv, 2) * df_actuation[col] else: df_actuation = None else: df_actuation = None # Get the number of finger segments calc_file = os.path.join(curr_folder, data_filenames["calculated"]) calc_params = iter_utils.load_yaml(calc_file) num_finger_segs.append(calc_params.get("num_finger_segs", [])) # package the correct data to give to the label function label_fun_send_list = { "objectpose": df, "contact": df_contact, "actuation": df_actuation, } # Get the labels from the label functions for label_fun_key in label_function: label_fun_send = dict() for key in label_function[label_fun_key]["args"]: label_fun_send[key] = label_fun_send_list[key] curr_val = label_function[label_fun_key]["function"](**label_fun_send) label_vals[label_fun_key].append(curr_val) if "save_raw_data" in label_function.keys(): out = {} if df is not None: out["objectpose"] = df.to_dict(orient="list") if df_contact is not None: out["contact"] = df_contact.to_dict(orient="list") if df_actuation is not None: out["actuation"] = df_actuation.to_dict(orient="list") out_file = os.path.join(curr_folder, "raw_data.pkl") with open(out_file, "wb") as f: pickle.dump(out, f) results = dict() results["labels"] = label_vals results["vars"] = sweep_vars results["varlabels"] = sweep_labels results["sweep"] = sweep_values results["diffs"] = sweep_diffs results["num_finger_segs"] = num_finger_segs iter_utils.save_yaml(results, success_filename) data = flatten_data(results) filename, ext = os.path.splitext(success_filename) iter_utils.save_yaml(data, filename + "_flattened" + ext) return results
                    help='Find the n most dissimilar items')
parser.add_argument('-c', '--conf_info', type=str, default=None,
                    help='tsv file containing ccs and energy info for conformers. \
                    Note that the index of the .tsv file must exactly match the index of the matrix')
args = parser.parse_args()

n = args.ndis
mtrx = args.mtrx

# If SDS is not already a directory, make it
directory = 'SDS'
if not exists(directory):
    os.makedirs(directory)

df = pd.read_pickle(mtrx)
SDSdf = SDS(df, n=n)
narray = np.array([x for x in range(1, n + 1)])

# If comparing conformers, calculate Boltzmann weighted CCS.
if args.conf_info != None:
    csvdf = pd.read_csv(args.conf_info)
    writedf = conf_to_ccs(SDSdf['matrix index'].values, csvdf)
else:
    writedf = SDSdf

writedf['n Dissimilar'] = narray
writedf.to_csv(f'SDS/SDS_{n}_dissimilar.csv', index=False)
print((time() - start) / 60, 'min')
if args.no_sweeps:
    fn = fn[:-7] + '_nosweep.pickle'

if not os.path.isfile(fn) or args.regenerate:
    patients = ['p1', 'p2', 'p5', 'p6', 'p8', 'p9', 'p11']
    cov_min = 100
    data = collect_data(patients, cov_min=cov_min, no_sweeps=args.no_sweeps)
    try:
        data.to_pickle(fn)
        print('Data saved to file:', os.path.abspath(fn))
    except IOError:
        print('Could not save data to file:', os.path.abspath(fn))
else:
    data = pd.read_pickle(fn)

# Make time and entropy bins
t_bins = np.array([0, 100, 200, 500, 1000, 1500, 2000, 3000], int)
t_binc = 0.5 * (t_bins[:-1] + t_bins[1:])
add_binned_column(data, t_bins, 'time')
data['time_binc'] = t_binc[data['time_bin']]

# No-entropy sites are many, so the bin 0 comes up twice
perc = np.linspace(0, 100, 8)
S_bins = np.percentile(data['S'], perc)[1:]
S_binc = np.percentile(data['S'], 0.5 * (perc[:-1] + perc[1:]))[1:]  # this makes bin center medians
n_alleles = np.array(data.loc[:, ['af', 'S_bin']].groupby('S_bin').count()['af'])
    cvs = cross_val_score(rfr_here, crossval_X, crossval_y, cv=cv_groups, n_jobs=n_jobs,
                          scoring='mean_absolute_error',
                          fit_params={'sample_weight': crossval_weights})
    msg("Cross validation took %f seconds with %i threads, %i records, %i estimators and %i CV groups" %
        ((time.time() - begin_time), n_jobs, len(crossval_X), n_estimators, cv_groups))
    msg("Results: %f, %s" % (np.mean(cvs), str(cvs)))
    return np.mean(cvs)

msg("Hi, reading moves.")
moves_df = read_pickle(sys.argv[1])
moves_file = open(sys.argv[1] + '.info', 'rb')
moves_info = pickle.load(moves_file)
categorical_features = moves_info['categorical_features']

msg("Computing weights")
game_weights = (1. / (moves_df.groupby('gamenum')['halfply'].agg({'max': np.max}).clip(1, 1000)))['max']
moves_df['weight'] = moves_df['gamenum'].map(game_weights)
msg("Done")

# moves_df['abs_moverscore'] = moves_df['moverscore'].abs()

features_to_exclude = [
rel_report_dir = os.path.join(report_str + '_TR_%s' % TR)
os.chdir(report_base_dir)
if os.path.isdir(rel_report_dir):
    shutil.rmtree(rel_report_dir)
os.mkdir(rel_report_dir)
os.chdir(rel_report_dir)
os.mkdir('reports')

for subject_id in subjects_list:
    print(subject_id)
    df_ss_file = os.path.join(ds_dir, subject_id, 'rsfMRI_preprocessing/QC/df',
                              'TR_%s' % TR, 'qc_values.pkl')
    # fixme
    if os.path.exists(df_ss_file):
        df_ss = pd.read_pickle(df_ss_file)
    else:
        header = [
            'subject_id', 'similarity_epi_struct', 'similarity_struct_MNI',
            'mean_FD_Power', 'n_spikes', 'median_tsnr'
        ]
        data = np.hstack((subject_id, np.repeat(np.nan, len(header) - 1)))
        df_ss = pd.DataFrame([data], columns=header)
    df_ss = df_ss.set_index(df_ss.subject_id)

    # link to report pdf:
    rel_report_dir = os.path.join(report_str + '_TR_%s' % TR)
    subject_reports_dir = os.path.join(rel_report_dir, 'reports')
    report_file = os.path.join(subject_reports_dir, subject_id + '.pdf')
    df_ss['report_file'] = report_file
        values = round.next()
        if x == 0:
            image = values[0]
            for i in values[1:]:
                labels.append(i)
        else:
            image = np.add(image, values[0])
        x += 1
    yield (image, labels)

# get directory of input images and create array of images and store images in the directory to the array
train_dir = "C:/pooled/Train"
# get labels pickle and convert to dataframe then sort by the filename to go along with the images
train_labels_file = "C:/Users/panka/OneDrive/Desktop/Aditya/image data 2018-19/Training_Input_Resized.pkl"
train_labels = pd.read_pickle(train_labels_file)
train_datagen = ImageDataGenerator(rescale=1. / 255)
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_labels, directory=train_dir, target_size=(108, 192), x_col='Filename',
    y_col=['Right Ankle x', 'Right Knee x', 'Right Hip x', 'Left Hip x', 'Left Knee x', 'Left Ankle x', 'Pelvis x', 'Thorax x', 'Upper Neck x', 'Head Top x', 'Right Wrist x', 'Right Elbow x', 'Right Shoulder x', 'Left Shoulder x', 'Left Elbow x', 'Left Wrist x', 'Right Ankle y', 'Right Knee y', 'Right Hip y', 'Left Hip y', 'Left Knee y', 'Left Ankle y', 'Pelvis y', 'Thorax y', 'Upper Neck y', 'Head Top y', 'Right Wrist y', 'Right Elbow y', 'Right Shoulder y', 'Left Shoulder y', 'Left Elbow y', 'Left Wrist y'],
    class_mode='other', batch_size=16)

# get directory of input images and create array of images and store images in the directory to the array
test_dir = "C:/pooled/Test"
# get labels pickle and convert to dataframe then sort by the filename to go along with the images
test_labels_file = "C:/Users/panka/OneDrive/Desktop/Aditya/image data 2018-19/Testing_Input_Resized.pkl"
test_labels = pd.read_pickle(test_labels_file)
test_datagen = ImageDataGenerator(rescale=1. / 255)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_labels, directory=test_dir, target_size=(108, 192), x_col='Filename',
    y_col=['Right Ankle x', 'Right Knee x', 'Right Hip x', 'Left Hip x', 'Left Knee x', 'Left Ankle x', 'Pelvis x', 'Thorax x', 'Upper Neck x', 'Head Top x', 'Right Wrist x', 'Right Elbow x', 'Right Shoulder x', 'Left Shoulder x', 'Left Elbow x', 'Left Wrist x', 'Right Ankle y', 'Right Knee y', 'Right Hip y', 'Left Hip y', 'Left Knee y', 'Left Ankle y', 'Pelvis y', 'Thorax y', 'Upper Neck y', 'Head Top y', 'Right Wrist y', 'Right Elbow y', 'Right Shoulder y', 'Left Shoulder y', 'Left Elbow y', 'Left Wrist y'],
    class_mode='other', batch_size=16)

# create model
model = Sequential()
def word2vec_inception():
    return concat_ser_dic(pd.read_pickle("./word2vec"), pd.read_pickle("./inception"))
df0 = df_pressures(xds_ibtracs)
df0[6000:6010]

# path to your daily mean SST and MLD data
path_sst = r'/media/administrador/SAMSUNG/seasonal_forecast/data/SST/'
path_mld = r'/media/administrador/SAMSUNG/seasonal_forecast/data/CFS/ocnmld/'
path_p = r'/home/administrador/Documentos/seasonal/seasonal_forecast/new/'

**For the calibration period, the points with pressure, SST and MLD data in the target area are kept.**

df = df_p_sst_mld(df0, path_sst, path_mld)
df_cali = df.drop(df.index[5184:])  # years of the calibration period

# load data
path_p = r'/home/administrador/Documentos/seasonal/seasonal_forecast/new/'
df = pd.read_pickle(path_p + 'df_coordinates_pmin_sst_mld_2019.pkl')
df.tail()

## 3.2 Predictor grid and data processing

**The historical datasets are interpolated onto a 1/2º grid resolution, defining this way the grid for the predictor in the target area.**

fig_predictor_grid = plot_predictor_grid()

**MLD, SST and pressure data plots:**

plot_sst_mlp_pmin_cali(df)
import pandas as pd
from ecg_qc import ecg_qc
import math
from tqdm import tqdm

time_window_ml = 4
fs = 1000

df_ecg = pd.read_pickle('dataset_streamlit/df_ecg_103001_selection.pkl')
df_ecg.head()

ecg_data = df_ecg['ecg_signal'][10000:]

ecg_qc_ml = ecg_qc(
    normalized=True,
    model='/home/aura-alexis/github/ecg_qc_viz/env2/lib64/python3.6/site-packages/ecg_qc-1.0b4-py3.6.egg/ecg_qc/ml/models/xgb_norm_{}s.joblib'.format(time_window_ml),
    data_encoder='/home/aura-alexis/github/ecg_qc_viz/env2/lib64/python3.6/site-packages/ecg_qc-1.0b4-py3.6.egg/ecg_qc/ml/data_encoder/data_encoder_norm_{}s.joblib'.format(time_window_ml))

df_results = df_ecg
df_results['ml'] = ''

for ecg_signal_index in tqdm(
        range(math.floor(ecg_data.shape[0] / (fs * time_window_ml)) + 1)):
    start = ecg_signal_index * fs * time_window_ml
    end = start + fs * time_window_ml
    ml_prediction = ecg_qc_ml.get_signal_quality(ecg_data[start:end].values)
    df_results['ml'].iloc[start:end] = ml_prediction
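# Caveat on the loop above: df_results['ml'].iloc[start:end] = ... is chained indexing,
# which pandas may apply to a temporary copy (SettingWithCopyWarning). A minimal sketch
# of the same per-window write done through a single positional .iloc assignment,
# assuming the same df_results, ecg_data, fs and time_window_ml as above:
ml_col = df_results.columns.get_loc('ml')  # positional index of the 'ml' column
for ecg_signal_index in tqdm(
        range(math.floor(ecg_data.shape[0] / (fs * time_window_ml)) + 1)):
    start = ecg_signal_index * fs * time_window_ml
    end = start + fs * time_window_ml
    ml_prediction = ecg_qc_ml.get_signal_quality(ecg_data[start:end].values)
    # Row slice and column selected in one .iloc call, so the write hits df_results itself.
    df_results.iloc[start:end, ml_col] = ml_prediction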
def doc2vec_word2vec_inception():
    tmp = concat_ser_dic(pd.read_pickle("./doc2vec"), pd.read_pickle("./word2vec"))
    return concat_ser_dic(tmp, pd.read_pickle("./inception"))
def test_pickle_method(self):
    filename = os.path.join(self.tempdir, "df.pkl")
    self.df.to_pickle(filename)
    unpickled = pd.read_pickle(filename)
    assert_frame_equal(self.df, unpickled)
    assert self.df.crs == unpickled.crs
def doc2vec_word2vec():
    return concat_ser_dic(pd.read_pickle("./doc2vec"), pd.read_pickle("./word2vec"))
import pandas as pd

model = pd.read_pickle('IRIS_Model.bin')

sl = float(input('Enter Sepal_length(4.3 - 8.0) : '))
sw = float(input('Enter Sepal_width(2.0 - 4.4) : '))
pl = float(input('Enter Petal_length(1.0 - 7.0) : '))
pw = float(input('Enter Petal_width(0.1 - 2.5) : '))

result = model.predict([[pl, pw, sl * pw, sl * pl, sw * pl, sw * pw, pl * pw]])

if result == 1:
    result = 'Setosa'
elif result == 0:
    result = 'Virginica'
elif result == 2:
    result = 'Versicolor'

print('According to your information this flower belongs to the {} species'.format(result))
# print(result)
# Author Shael Minuk

import pickle as pkl
import pandas as pd

sim_data = pd.read_pickle("sim_results.pkl")

sim_data["sub_thick"] = 0
sim_data["sub_perm"] = 0

# map the substrate to its thickness and permittivity, so it is numerical and not categorical
sim_data.loc[(sim_data[9] == "Rogers RO3003"), "sub_thick"] = 1.52e-3
sim_data.loc[(sim_data[9] == "Rogers RO3010"), "sub_thick"] = 1.28e-3
sim_data.loc[(sim_data[9] == "Rogers RO3003"), "sub_perm"] = 3
sim_data.loc[(sim_data[9] == "Rogers RO3010"), "sub_perm"] = 10

sim_data = sim_data.drop([0, 1, 6, 9], axis=1)

# one-hot encode the categorical top/mid/bottom column: now boolean 5_bot, 5_mid, 5_top etc. instead of one column
sim_data = pd.get_dummies(sim_data)

# store processed pickled data in model folder (dataframe)
sim_data.to_pickle("processed_data.pkl")
    cityFilter = [
        'livorno',
    ]  # 'nice']
    cityList = list(filter(lambda a: a not in cityFilter, cityList))
    cityList.append('barcellona')
    print(n)
    print(len(cityList))
    print(len(cityList))
    return citiyDic, cityList


inPath = '../data/'
df_librettos = pd.read_pickle(inPath + 'librettos_1.pkl')

filter_pot_city = [
    'casale', 'vittoria', 'desio', 'nola', 'bali', 'mira', 'sora', 'sora',
    'genzano', 'faro'
]

european_dic, european_cities = cityDic()
italian_dic, italian_cities = cityDicItaly()
european_dic = {**european_dic, **italian_dic}

city_names = df_librettos.pot_city_name.tolist()
long = df_librettos.longitude.tolist()
            v = unit_transform(arg)
            row = [country_a, country_b, item_no, date_list[i], v]
            allrow.append(row)
    return allrow


def remake(df, item_no, value_header):
    '''
    Clean up the table_source df and return a DataFrame.

    df : pd.read_html(table_source)
    item_no : str, product item code, e.g. "020711", "020712", "020714", "020742"
    value_header : str, trade field, e.g. "ex_qty", "im_qty", "ex_val", "im_val"
    '''
    columns = ['country_a', 'country_b', 'item_no', 'date', value_header]
    records = []
    for row_no in range(2, len(df)):  # data starts from the third row
        allrow = parser_row_val(df, row_no, item_no)
        for row in allrow:
            records.append(row)
    df2 = pd.DataFrame.from_records(records, columns=columns)
    return df2


if __name__ == "__main__":
    df = pd.read_pickle('./pickle/test.pickle')
    df2 = remake(df, "020712", "im_qty")
    print(df2)
from math import *
from sklearn.mixture import GaussianMixture
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from pandas import DataFrame
from gensim.models import KeyedVectors

filename = sys.argv[1]
dims = int(sys.argv[2])

# Word Vectors
Glove = KeyedVectors.load(filename)

start = time.time()

all = pd.read_pickle('all.pkl')

# Computing tf-idf values.
traindata = []
for i in range(0, len(all["text"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(all["text"][i], True)))

tfv = TfidfVectorizer(strip_accents='unicode', dtype=np.float32)
tfidfmatrix_traindata = tfv.fit_transform(traindata)
featurenames = tfv.get_feature_names()
idf = tfv._tfidf.idf_

# Creating a dictionary with word mapped to its idf value
print "Creating word-idf dictionary for Training set..."
word_idf_dict = {}
                          mask=masks[0],
                          drift_model=None,  # Already done by fmriprep
                          smoothing_fwhm=5.0,
                          hrf_model='spm + derivative',
                          n_jobs=10,
                          subject_label='{}.{}'.format(ds, subject))
    model.fit(images, behavior, confounds)
    models.append(model)

mask = fsl.Info.standard_image('MNI152_T1_2mm_brain_mask.nii.gz')

confounds = pd.read_pickle(op.join(derivatives, 'all_subjectwise_parameters.pkl'))
confounds = confounds[['ddm difficulty_effect', 'ddm z_cue_regressor']]
confounds = confounds.groupby('dataset').transform(lambda x: (x - x.mean()) / x.std())
confounds['subject_label'] = confounds.apply(lambda row: '{}.{}'.format(row.name[0], row.name[1]), 1)
confounds['ds'] = confounds.index.get_level_values('dataset').map({'ds-01': 0, 'ds-02': 1})
confounds = confounds.reset_index(drop=True)

model2 = SecondLevelModel(mask)
model2.fit(models, confounds=confounds)

glm_dir = op.join(derivatives, 'both', 'modelfitting', 'glm_4', 'shift-{}'.format(shift))
if not op.exists(glm_dir):
from settings import FB_ALL, BASELINE_BEFORE, BASELINE_AFTER
import pandas as pd
import seaborn as sns
from pingouin import rm_corr, mixed_anova, pairwise_ttests, rm_anova, plot_paired, friedman, anova, ttest
import pylab as plt
import numpy as np
from mne.stats import fdr_correction

sns.set_context("paper")
sns.set_style("dark")

threshold = 2.125
stats_file = 'baseline_block_stats_1channels_1bands_median_20ths.pkl'
stats_df_all = pd.read_pickle('data/{}'.format(stats_file))
# stats_df = stats_df.loc[stats_df.subj_id != 28]
stats_df_all = stats_df_all.loc[stats_df_all['block_number'].isin([BASELINE_AFTER, BASELINE_BEFORE])]
unique_blocks = list(stats_df_all['block_number'].unique())
stats_df_all = stats_df_all.loc[stats_df_all['threshold_factor'] == threshold]
stats_df_all['baseline'] = stats_df_all['block_number'].apply(lambda x: 'After' if x > 10 else 'Before')

fb_types = ['FB0', 'FB250', 'FB500', 'FBMock']
stats_df_all = stats_df_all.loc[stats_df_all['fb_type'].isin(fb_types)]

metric_type = 'n_spindles'
res = mixed_anova(stats_df_all.query('metric_type=="{}"'.format(metric_type)),
                  dv='metric', within='baseline', subject='subj_id',
def update_stock_prizes(self, ListOfTickers=None): """ Update stock prizes given in ListOfCompanies using yahoo finance If there is data to update the old file is backuped to .../backup/stockTicker.p so the backup is good for one business day """ if ListOfTickers is None: ListOfTickers = self.ListOfCompanies['Yahoo Ticker'] print "Start updating stock prizes" print "--------------------------------------\n" self.UpdateTimeEnd = datetime.datetime.today().date() print "Today is ", self.UpdateTimeEnd, "\n" notUpdated = [] for stocklabel in ListOfTickers: if os.path.isfile(self.PathData + 'raw/stocks/' + stocklabel + '.p'): StockValue = pd.read_pickle(self.PathData + 'raw/stocks/' + stocklabel + '.p') self.UpdateTimeStart = StockValue.tail( 5)['Date'].tolist()[0].date() #if stock has been updated at the same date already if self.UpdateTimeStart == self.UpdateTimeEnd: self.logging( "Stock " + stocklabel + ": UpdateTimeStart is equal to UpdateTimeEnd ") continue try: stock_prize = pdr.get_data_yahoo(stocklabel, self.UpdateTimeStart, self.UpdateTimeEnd) stock_prize.dropna(inplace=True) stock_prize.drop(index=stock_prize.loc[ stock_prize['Volume'] == 0.0].index.tolist(), inplace=True) stock_prize.reset_index(inplace=True) #print stock_prize stock_prize = stock_prize.loc[ stock_prize['Date'] >= self.UpdateTimeStart] if len(stock_prize) == 0: self.logging("Stock " + stocklabel + ": no new data available") continue StockValue = pd.concat([ StockValue.loc[ StockValue['Date'] < self.UpdateTimeStart], stock_prize ], ignore_index=True) shutil.copy( self.PathData + 'raw/stocks/' + stocklabel + '.p', self.PathData + 'raw/stocks/backup/' + stocklabel + '.p') #print "number of rows", len(StockValue), " for label", stocklabel StockValue.reset_index(inplace=True, drop=True) StockValue.to_pickle(self.PathData + 'raw/stocks/' + stocklabel + '.p') print "Stock ", stocklabel, " updated" self.logging("Stock " + stocklabel + ": successfully updated") except RemoteDataError: self.logging("Stock " + stocklabel + ": No information for ticker found") print "No information for ticker ", stocklabel notUpdated.append(stocklabel) continue except SSLError: self.logging("Stock " + stocklabel + ":SSLError") notUpdated.append(stocklabel) continue except ConnectionError: self.logging("Stock " + stocklabel + ": ConnectionError") notUpdated.append(stocklabel) continue except IndexError: self.logging("Stock " + stocklabel + ": IndexError") notUpdated.append(stocklabel) continue else: #if file is not available yet get data starting from 01/01/2000 self.UpdateTimeStart = datetime.datetime(2000, 1, 1).date() try: stock_prize = pdr.get_data_yahoo(stocklabel, self.UpdateTimeStart, self.UpdateTimeEnd) stock_prize.drop(index=stock_prize.loc[ stock_prize['Volume'] == 0.0].index.tolist(), inplace=True) stock_prize.dropna(inplace=True) stock_prize = stock_prize.reset_index() #print stock_prize stock_prize.to_pickle(self.PathData + 'raw/stocks/' + stocklabel + '.p') print "Stock ", stocklabel, " updated" self.logging("Stock " + stocklabel + ": successfully updated") except RemoteDataError: self.logging("Stock " + stocklabel + ": No information for ticker found") print "No information for ticker ", stocklabel continue except SSLError: self.logging("Stock " + stocklabel + ":SSLError") notUpdated.append(stocklabel) continue print "\nFinished updating stock prizes\n\n" if len(notUpdated) > 0: print "Not updated stocks", notUpdated self.logging("Not updated stocks " + str(notUpdated)) return notUpdated else: return None
def get_movie_feature():
    df = pd.read_pickle(os.path.join(config.DIR_DATA, 'movie_feature_pub.pkl'))
    return df
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline

# In[2]:

class dataset:
    pass

sample_data = pd.read_csv("D:\KULIAH\Semester 8\Dataset\Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv")
sample_data.to_pickle('D:\KULIAH\Semester 8\Dataset\Thursday-15-02-2018_TrafficForML_CICFlowMeter.pkl')

# In[3]:

df = pd.read_pickle('D:\KULIAH\Semester 8\Dataset\Thursday-15-02-2018_TrafficForML_CICFlowMeter.pkl')
df = df[['URG Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'Protocol',
         'Pkt Size Avg', 'Flow Pkts/s', 'FIN Flag Cnt', 'ECE Flag Cnt', 'ACK Flag Cnt',
         'Dst Port', 'Label']]
df["Flow Pkts/s"] = pd.to_numeric(df["Flow Pkts/s"], errors='coerce')
df.dropna(inplace=True)
df.info(verbose=True)

# In[5]:

dataset.train = df.groupby('Label') \
    .apply(pd.DataFrame.sample, frac=0.8) \
    .reset_index(level='Label', drop=True)
dataset.test = df.drop(dataset.train.index)
dataset.label = dataset.train.Label.copy()

# In[6]:

dataset.train
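# The split in cell [5] above keeps the Label proportions by sampling 80% within each class.
# A sketch of an equivalent stratified split with scikit-learn's train_test_split, assuming
# the same df and 'Label' column (the random_state value is an arbitrary choice here):
from sklearn.model_selection import train_test_split

# Stratified 80/20 split that preserves the Label distribution, similar in spirit
# to the groupby/sample approach above.
train_df, test_df = train_test_split(df, train_size=0.8, stratify=df['Label'], random_state=42)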
def get_ratings():
    df = pd.read_pickle(os.path.join(config.DIR_DATA, 'ratings_pub.pkl'))
    return df
def get_question3_ref():
    df_ref_movie_feature = pd.read_pickle(os.path.join(config.DIR_DATA, 'ref_movie_feature.pkl'))
    return df_ref_movie_feature
        ) / df[abbv + ' SA Value'][0] * 100.0

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df)

    print(main_df.head())

    pickle_out = open('fiddy_states3.pickle', 'wb')
    pickle.dump(main_df, pickle_out)
    pickle_out.close()


def HPI_Benchmark():
    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
    df['NSA Value'] = (df['NSA Value'] - df['NSA Value'][0]) / df['NSA Value'][0] * 100.0
    df['SA Value'] = (df['SA Value'] - df['SA Value'][0]) / df['SA Value'][0] * 100.0
    return df


m30 = mortgage_30yr()
HPI_data = pd.read_pickle('fiddy_states3.pickle')
HPI_bench = HPI_Benchmark()
state_HPI_M30 = HPI_data.join(m30)
print(state_HPI_M30.corr()['M30'].describe())
def get_likes():
    df = pd.read_pickle(os.path.join(config.DIR_DATA, 'likes_pub.pkl'))
    return df