import os
import time

import numpy as np
import pandas as pd


def begin_session(path_to_csv, baseline, mean_fxn, var_fxn):
    if os.path.isfile(path_to_csv):
        history_df = pd.read_csv(path_to_csv)
        nr = history_df.shape[0]
    else:
        nr = 0
    mu = mean_fxn(nr)
    sigma = var_fxn(nr, baseline)
    # Sample the session length (in minutes) from a normal distribution,
    # never letting it drop below one minute.
    mins = baseline + max(-baseline + 1, np.random.normal(mu, sigma))
    # Session
    os.system("say Begin Practice")
    print(mins)
    time.sleep(60 * mins)
    os.system("say End Practice")
    # Record stages
    stage = input("Which stage of meditation did you enter during this practice at its acme?")
    date = input("What is the date?")
    new_row = pd.DataFrame({"Date": [date], "Stage": [stage], "Duration": [mins]})
    if nr == 0:
        history_df = new_row
    else:
        history_df = pd.concat([history_df, new_row], ignore_index=True)
    # pandas has no pd.write_csv; DataFrame.to_csv is the writer.
    history_df.to_csv(path_to_csv, index=False)
    return
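# A minimal usage sketch (not from the original): the history path, baseline,
# and the mean/variance callables below are illustrative assumptions matching
# the mean_fxn(nr) / var_fxn(nr, baseline) call signatures above.
begin_session(
    "meditation_history.csv",                     # hypothetical history file
    baseline=10,                                  # hypothetical baseline minutes
    mean_fxn=lambda nr: 0.5 * nr,                 # assumed: mean grows with session count
    var_fxn=lambda nr, baseline: max(1.0, baseline / (nr + 1)),  # assumed: spread shrinks over time
)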
def get_target_annotations(pset, annot_dir):
    """
    Annotate the 'TARGET' in the 'drug' slot of a PSet object using a mapping
    from the UniProt identifier mapping tool API.

    :param pset: PSet object whose gene metadata is used for the mapping
    :param annot_dir: directory containing the DrugBank annotation CSVs
    :return: None; writes the drug-to-gene mapping to disk if it does not already exist
    """
    # Read in drug target annotations and gene annotations
    drug_targets = pd.read_csv(
        os.path.join(annot_dir, 'drugbank_drug_targets_all.csv'))
    rnaseq_df = pset.get("molecularProfiles").get(
        "Kallisto_0.46.1.rnaseq").get("elementMetadata")

    # Map genes to DrugBank drug ids
    genes_to_drugs = pd.merge(
        drug_targets.loc[:, ['Name', 'Gene Name', 'Drug IDs']],
        rnaseq_df.loc[:, ['gene_name', 'gene_id']],
        left_on='Gene Name',
        right_on='gene_name')

    # Expand the semicolon-separated drug id lists into one row per drug
    genes_to_drugs['Drug IDs'] = genes_to_drugs['Drug IDs'].str.split('; ')
    genes_to_drugs = genes_to_drugs.explode('Drug IDs')

    # Write to disk if necessary
    file_path = os.path.join(annot_dir, 'drugbank_drug_to_gene_mappings.csv')
    if not os.path.isfile(file_path):
        genes_to_drugs.to_csv(file_path, index=False)
def wide_to_long_county():
    df = pd.read_csv("~/Documents/GitHub/lqycovid/data-scripts/_1p3a/_working/testing_county_raw.csv")
    df.set_index(['GEOID', 'criteria'], inplace=True)
    df = df.stack(level=0).reset_index(level=1, drop=False).reset_index()
    df['level_1'] = df.level_1.astype(str)
    df.rename(columns={'level_1': 'date', 0: 'testing'}, inplace=True)
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df['date'] = df.date.astype(str)
    # pandas has no pd.write_csv; use DataFrame.to_csv.
    df.to_csv("testing_county_raw.csv", index=False)

    dp = pd.read_csv("~/Documents/GitHub/lqycovid/data-scripts/_1p3a/_working/county_positivity_raw.csv")
    dp.set_index(['GEOID', 'criteria'], inplace=True)
    dp = dp.stack(level=0).reset_index(level=1, drop=False).reset_index()
    dp['level_1'] = dp.level_1.astype(str)
    dp.rename(columns={'level_1': 'date', 0: 'positivity'}, inplace=True)
    dp['date'] = pd.to_datetime(dp['date'], format='%Y-%m-%d')
    dp['date'] = dp.date.astype(str)
    dp.to_csv("county_positivity_raw.csv", index=False)
class MidTerm:
    def __init__(self):
        # Form training set
        ts = pd.read_csv('training_set.csv')  # self.form_training_set()
        # Write the training set locally, then upload it to the AWS S3 bucket
        ts.to_csv('training_set.csv', index=False)
        self.upload('training_set.csv', sys.argv[1], sys.argv[2])
        # Data cleaning
        ts = self.clean(ts)
def ReadSQLServertoFileStore(file, schema, tableName, engine):
    # pd.read_sql has no schema argument; read_sql_table accepts one.
    df = pd.read_sql_table(tableName, con=engine, schema=schema)
    # pandas has no pd.write_csv; write the frame we just read.
    df.to_csv(file, index=False)
    return df
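# Illustrative call (not from the original): the SQLAlchemy connection string,
# schema, table name, and output path below are all hypothetical.
from sqlalchemy import create_engine

example_engine = create_engine("mssql+pyodbc://user:password@my_dsn")  # assumed ODBC DSN
example_df = ReadSQLServertoFileStore("export.csv", "dbo", "my_table", example_engine)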
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
import os

import textmining
import pandas as pd
from nltk.stem import WordNetLemmatizer

xDIR = 'C:/Users/V574361/Documents/bigdata/files'

# def tdm():
# Create some very short sample documents
# Initialize class to create term-document matrix
tdm = textmining.TermDocumentMatrix()
# Add the documents
for f in os.listdir(xDIR):
    # print(f)
    tdm.add_doc(open(os.path.join(xDIR, f), encoding="utf8").read())
# for row in tdm.rows(cutoff=1):
#     print(row)
# print(tdm)
# return tdmm
# x = tdm()
tdm.write_csv(filename="C:/Users/V574361/Documents/bigdata/tdm.test.csv")

cereal_df = pd.read_csv("C:/Users/V574361/Documents/bigdata/tdm.test.csv")
tdm = cereal_df.T
# pandas has no pd.write_csv; DataFrame.to_csv writes the transposed matrix.
tdm.to_csv("C:/Users/V574361/Documents/bigdata/tdm.test1.csv", sep=',', encoding='utf-8')
# coding: utf-8
# author: L.P
import pandas as pd

file = pd.read_table("test.txt", names=["ID", "string"])
# The original loop reassigned the whole column on every iteration;
# a single assignment has the same effect.
file["string"] = 1
# pandas has no pd.write_csv; write the DataFrame itself.
file.to_csv("submit.csv", sep=",", index=False)
plantings = np.load('Plantings_NEW.npy')
ids_ordered = np.load('Ecotype_ids_NEW_ORDERED.npy')
ids_unique = np.unique(ids_ordered)
# Just keep those columns whose ecotypes we have in the dataset
logical = ibs.columns.isin(ids_unique.flatten())
ibs = ibs.loc[logical, logical]
k_matrix = pd.DataFrame(data=None, index=ids_ordered, columns=ids_ordered)
for i in ibs.columns:
    for j in ibs.index:
        a = ibs.loc[j, i]  # get the data value
        k_matrix.loc[j, i] = a
# pandas has no pd.write_csv; use DataFrame.to_csv.
k_matrix.to_csv('k2030_emmaxibsmatrix.csv')
k_np = np.array(k_matrix)
np.save('k2030_emmaxibsmatrix.npy', k_np)

# Conforming the EMMAX aBN (Balding-Nichols) matrix
import os
os.chdir(
    "/srv/uom-data1-q.unimelb.edu.au/6300-afournier/home/student.unimelb.edu.au/andhikap/Clim_GWAS/EMMAX"
)
import pandas as pd

ibs = pd.read_csv('k2029_emmax.aBN.csv', sep='\t', header=None)
tfam = pd.read_csv('k2029_emmax.tfam', sep=' ', header=None)
ecotype_ids = tfam.loc[:, 0]
ibs.columns = ecotype_ids
ibs.index = ecotype_ids
def filewrite(data, file):
    # pandas has no pd.write_csv; DataFrame.to_csv does the writing,
    # so the frame to write must be passed in alongside the path.
    data.to_csv(file, index=False)
    return data
def write_csv(df, csv_file):
    # pd.write_csv does not exist; call to_csv on the DataFrame instead.
    return df.to_csv(csv_file, index=False)
if 'regeocode' in response.text:
    # address_detail0 = re.search('"addressComponent":(.*?)', response.text).group(1)
    address_detail = re.search('"addressComponent":(.*),(.*),(.*),(.*)', response.text).group(1)
    address_dict['city'] = json.loads(address_detail)['city']
    address_dict["province"] = json.loads(address_detail)["province"]
    address_dict["district"] = json.loads(address_detail)["district"]
    address_dict["township"] = json.loads(address_detail)["township"]
else:
    pass
return address_dict

data['city'] = data.loncationdetail.apply(get_data).apply(lambda x: x['city'])
data['province'] = data.loncationdetail.apply(get_data).apply(lambda x: x['province'])
data['district'] = data.loncationdetail.apply(get_data).apply(lambda x: x['district'])
data['township'] = data.loncationdetail.apply(get_data).apply(lambda x: x['township'])
# pandas has no pd.write_csv; the original call also gave no path,
# so the filename below is only a placeholder.
data.to_csv("data_with_address.csv", index=False)
data.head()

# Show which columns contain null values and how many missing values each has.
# data[['city','province','district','township']].isnull().sum() reports no nulls,
# so we should instead count empty lists. data['city'] returns a Series,
# data[['city']] returns a DataFrame; apply on a DataFrame returns one summary
# value per column (or row), while apply on a Series returns a result per element.
# (data[['city']].apply(len) == 0).sum()   # Out[256]: 0
# (data['city'].apply(len) == 0).sum()     # Out[257]: 294
np.isnan(data['city']).sum()  # counts NaNs, not the length of each element
for i in ['city', 'province', 'district', 'township']:  # columns containing empty lists
    print(str(i), ':', (data[i].apply(lambda x: len(x) == 0)).sum())
# Row indices where the city column is an empty list; the gaps come from
# municipalities directly administered by the central government.
pd.Series((data[data.province.str.contains('市')].index) == data[data['city'].apply(lambda x: len(x) == 0)].index).value_counts()
# Fill in the values where city is an empty list
else:
    0.0
prices_shrt['return'] + 1
prices_shrt.groupby(by='ticker')['return'].prod()  # too long

# EXPLORE OPTION 2: LOOP OVER EVERY TICKER AND USE THE SHIFT(1) METHOD
prices_shrt['return'] = 1.0

%%time
for i in range(0, len(prices_shrt)):
    if prices_shrt['ticker'][i - 1] == prices_shrt['ticker'][i]:
        prices_shrt.loc[i, 'return'] = prices_shrt["adj_close"][i] / prices_shrt["adj_close"][i - 1]
    else:
        0.0

# pandas has no pd.write_csv; use DataFrame.to_csv.
prices_shrt.to_csv("returns.csv", index=False)
fif.groupby(by='ticker')['return'].prod()

# http://stackoverflow.com/questions/40273251/pandas-groupby-with-pct-change
fif['return'] = fif.groupby(['ticker', 'year']).pct_change()
prices_shrt['return'] = prices_shrt.groupby(['ticker', 'year']).pct_change()

fuit = df.groupby(['ticker', 'year', 'month'], as_index=False).count()
fuit['attributes'] = 'price'
fuit = pd.merge(fuit, avgP, on=['ticker', 'year', 'month'], how='left')
fuit = fuit.drop(['value'], axis=1)
fuit.columns = ['ticker', 'year', 'month', 'attributes', 'value']
df = pd.concat([df, fuit])
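# A minimal vectorized sketch of the shift(1) idea mentioned above (an
# assumption, not code from the original): divide each adjusted close by the
# previous row's close within the same ticker, defaulting the first row of
# each ticker to 1.0 just as the loop does.
prices_shrt['return'] = (
    prices_shrt['adj_close'] / prices_shrt.groupby('ticker')['adj_close'].shift(1)
).fillna(1.0)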