def get_Gallup_country_lookups(verbose=True): """ Kosovo is the only GWP country not matched to a 3-letter ISO code. Let's ignore it. """ dfr = pd.read_table(__local_input_path__+'GallupWorldPoll-region-country.tsv').rename(columns={'country':'rcountry'}) dfr['lccountry'] = dfr.rcountry.str.lower() dfr = dfr.set_index('lccountry') dfw = pd.read_table(__local_input_path__+'GallupWorldPoll-WP5-defs-2016.tsv').rename(columns={'country':'wcountry'}) dfw['lccountry'] = dfw.wcountry.str.lower() dfw = dfw.set_index('lccountry') wp5s = pd.read_table(__local_input_path__ +'countrycode_main.tsv', skiprows=3).set_index('country_GWP3_wp5') wp5s = wp5s[['countryCode_GWP3_wp5', 'countryCode_ISO3','country_bestShortName','country_bestName','twoletter_AlexShultz_svg']] df= wp5s.join(dfr).join(dfw).rename(columns = {'countryCode_ISO3':'ISO',}) df.index.name = 'country' assert 'South Africa'.lower() in dfr.rcountry assert 'South Africa'.lower() in df.index # Now several checks: # Did regions get their ISO? problems = { ' Published WHR country lacks an ISO: ': df[pd.notnull(df.rcountry) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','rcountry']], ' Published WHR country lacks a WP5: ': df[pd.notnull(df.rcountry) & pd.isnull(df.WP5)], ' Published WHR country lacks a map code: ': df[pd.notnull(df.rcountry) & pd.isnull(df.twoletter_AlexShultz_svg)], ' Old Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.countryCode_GWP3_wp5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']], ' 2016 Gallup micro country lacks an ISO in my master lookup: ': df[pd.notnull(df.WP5) & pd.isnull(df.ISO)][['ISO','countryCode_GWP3_wp5','WP5','wcountry']], } if verbose: for tt,dd in problems.items(): if not dd.empty: print('\n\n -- country_tools WARNING: '+tt) print dd return df.reset_index()
def main(): parser = argparse.ArgumentParser(description="Extract fasta file.") parser.add_argument('-table1',nargs=1,type=str,help="First table.") parser.add_argument('-table2',nargs=1,type=str,help="Second table.") parser.add_argument('-table3',nargs=1,type=str,help="Third table.") args = parser.parse_args() #load tables table1 = pandas.read_table(args.table1[0]) table1.index = table1['Unnamed: 0'] table2 = pandas.read_table(args.table2[0]) table2.index = table2['Unnamed: 0'] table3 = pandas.read_table(args.table3[0]) table3.index = table3['Unnamed: 0'] print '\n' + args.table1[0] + '\n' print 'Number p-value <= 0.05: '+str(len(table1)) print 'Number FDR <= 0.05: '+str(sum(table1.FDR<=0.05))+'\n' print '\n' + args.table2[0] + '\n' print 'Number p-value <= 0.05: '+str(len(table2)) print 'Number FDR <= 0.05: '+str(sum(table2.FDR<=0.05))+'\n' print '\n' + args.table3[0] + '\n' print 'Number p-value <= 0.05: '+str(len(table3)) print 'Number FDR <= 0.05: '+str(sum(table3.FDR<=0.05))+'\n' set1 = sets.Set(table1.index) set2 = sets.Set(table2.index) set3 = sets.Set(table3.index) print 'Overlapping statistics'+'\n' print 'Intersection (p-value<=0.05)' print args.table1[0] + ' and ' + args.table2[0] + ': \n' + str(len(set1.intersection(set2))) print args.table1[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3))) print args.table3[0] + ' and ' + args.table2[0] + ': \n' + str(len(set3.intersection(set2))) print args.table1[0] + ' and ' + args.table2[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3.intersection(set2)))) table1sub = table1[table1.FDR<=0.05] table2sub = table2[table2.FDR<=0.05] table3sub = table3[table3.FDR<=0.05] set1 = sets.Set(table1sub.index) set2 = sets.Set(table2sub.index) set3 = sets.Set(table3sub.index) print '\n\nIntersection (FDR<=0.05)' print args.table1[0] + ' and ' + args.table2[0] + ': \n' + str(len(set1.intersection(set2))) print args.table1[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3))) print args.table3[0] + ' and ' + args.table2[0] + ': \n' + str(len(set3.intersection(set2))) print args.table1[0] + ' and ' + args.table2[0] + ' and ' + args.table3[0] + ': \n' + str(len(set1.intersection(set3.intersection(set2)))) #pdb.set_trace() sys.exit()
def main(): # define input/output options = get_options() # read summary stats file stats_file = options.stats stats = pd.read_table(stats_file, sep="\t") # read region file regions_file = options.regions regions = pd.read_table(regions_file, header=None, names=['index_snp','locus']) regions['chr'] = regions['locus'].str.extract('chr(.*):').astype(int) regions['start'] = regions['locus'].str.extract(':(.*)-').astype(int) regions['end'] = regions['locus'].str.extract('-(.*)').astype(int) # iterate through regions subset_list = [] for index, row in regions.iterrows(): subset = stats[np.logical_and(stats['CHROM'] == row['chr'], np.logical_and(stats['POS'] >= row['start'], stats['POS'] <= row['end']))] subset.is_copy = False subset['index_snp'] = row['index_snp'] subset_list.append(subset) # concatenate list of dataframes stats_subset = pd.concat(subset_list) # write exclusion file out_file = options.out_file stats_subset.to_csv(out_file, sep=' ', index=False)
def simple_expected_result(): melano = u"""Chromosome Bin chrX/ChIP_1_melanocyte.bed.gz chrX/ChIP_2_melanocyte.bed.gz chrX/Input_1_melanocyte.bed.gz chrX/Input_2_melanocyte.bed.gz Enriched_melanocyte chr1 200 0.0 0.0 0.0 0.0 0.0 chr1 400 0.0 0.0 0.0 0.0 0.0 chr1 600 0.0 0.0 0.0 0.0 1.0 chr1 800 0.0 2.0 0.0 0.0 1.0 chr1 1000 0.0 0.0 0.0 0.0 1.0 chr1 1200 13.0 128.0 2.0 2.0 1.0 chr1 1400 0.0 0.0 0.0 0.0 0.0 chr1 1600 0.0 0.0 0.0 0.0 0.0""" fibro = u"""Chromosome Bin chrX/ChIP_1_fibroblast.bed.gz chrX/ChIP_2_fibroblast.bed.gz chrX/Input_1_fibroblast.bed.gz chrX/Input_2_fibroblast.bed.gz Enriched_fibroblast chr1 200 0.0 0.0 0.0 0.0 1 chr1 400 0.0 0.0 0.0 0.0 1 chr1 600 0.0 0.0 0.0 0.0 1 chr1 800 0.0 2.0 0.0 0.0 1 chr1 1000 0.0 0.0 0.0 0.0 1 chr1 1200 13.0 128.0 2.0 2.0 1 chr1 1400 0.0 0.0 0.0 0.0 1 chr1 1600 0.0 0.0 0.0 0.0 1""" od = OrderedDict() od["melano"] = pd.read_table(StringIO(melano), sep="\s+", index_col=[0, 1]) od["fibro"] = pd.read_table(StringIO(fibro), sep="\s+", index_col=[0, 1]) return od
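# A minimal sketch of how the expected-result fixture above might be compared
# against computed output in a test. compute_matrixes() is a hypothetical
# stand-in for whatever produces the actual per-sample dataframes.
import pandas as pd

def test_matrixes_match_expected():
    expected = simple_expected_result()
    actual = compute_matrixes()  # hypothetical producer of an OrderedDict of DataFrames
    assert list(actual) == list(expected)
    for name in expected:
        pd.testing.assert_frame_equal(actual[name], expected[name], check_dtype=False)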
def train(sparkContext): Utils.logMessage("\nClassification model started") pd.read_table(pv.processedFile, sep=',',encoding='utf-8').to_csv(pv.processedFile, header=False, index=False,encoding='utf-8') truncatedAccounts = sparkContext.textFile(pv.processedFile).take(pv.truncateLineCount - 1) rawData = sparkContext.parallelize(truncatedAccounts).map(countByFeatures).map(lambda item: LabeledPoint(item[0], Vectors.dense(item[2:]))) trainWithParam(sparkContext, rawData, 0.7, 'entropy', 4, 16)
def __init__(self,args): if args.window_type not in ['BP','SNP']: raise ValueError('Window type not supported') bed_1 = Bed(args.bfile) # af1 = self.get_allele_frequency(bed_1,args) # print(len(af1), "SNPs in file 1") snps_1 = (af1>args.maf)&(af1<1-args.maf) # print(np.sum(snps_1), "SNPs in file 1 after MAF filter") if (args.from_bp is not None) and (args.to_bp is not None): k = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp) snps_1 = snps_1&k snps_to_use = bed_1.sid[snps_1] if args.extract is not None: keep = np.array([l.strip() for l in open(args.extract,'r')]) snps_to_use = np.intersect1d(snps_to_use,keep) print(len(snps_to_use),"SNPs remaining after extraction") bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use)) # pos = bed_1.pos[bed_1_index] # bim_1=pd.read_table(bed_1.filename+'.bim',header=None, names=['chm','id','pos_mb','pos_bp','a1','a2']) af = af1[bed_1_index] # if args.afile is not None: a1 = pd.read_table(args.afile,header=None,sep='\s*', names=['id1','id2','theta']) else: a1 = None self.af = af self.M = len(bed_1_index) # self.windows = self.get_windows(pos,args) # self.chr = pos[:,0] self.pos = pos[:,2] self.id = bed_1.sid[bed_1_index] self.A1 = bim_1['a1'].loc[bed_1_index] self.A2 = bim_1['a2'].loc[bed_1_index] self.scores = self.compute(bed_1,bed_1_index,af,a1,args) #
def runPyCombat(fl): """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """ print "Running Combat...", expr_input_dir = fl.ExpFile() pheno_dir = formatPhenoFile(fl) moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + export.findFilename(expr_input_dir) try: export.copyFile(expr_input_dir, moved_exp_dir) print "Moved original expression file to:" print "\t" + moved_exp_dir ### now overwrite the origin excluding the commented rows export.cleanFile(expr_input_dir, removeExtra="#") ### remove comments from the original file except Exception: None pheno = pa.read_table(pheno_dir, index_col=0) dat = pa.read_table(expr_input_dir, index_col=0) mod = patsy.dmatrix("group", pheno, return_type="dataframe") t = time.time() # print dat, pheno.batch, mod;sys.exit() ebat = combat(dat, pheno.batch, mod, 0) print "...Combat completed in %.2f seconds" % (time.time() - t) print "Original expression file over-written with batch effect removal results..." ebat.to_csv(expr_input_dir, sep="\t")
def runSharesPSRCToBKRZones(): #list of two lists files_shares = [files_manu_shares, file_wtcu_shares] header_rows = 3 #number of rows at the begining of a file with header information headers = {} #dictionary to save header information for files_group in files_shares: for file in files_group: print("working on file: " + file) file_path = os.path.join(wd, file) #read header - use "#" as seperator as it is less likely to present in the file headers[file] = pd.read_table(file_path, delimiter = "#", header = None, nrows = header_rows) # skip first few rows, as they contain general information - also ignore rows starting with 'c' (comment lines) shares_psrc = pd.read_table(file_path, delimiter = " ", names = ["o","d",file], comment = "c", skiprows = header_rows) if file == files_group[0]: #if first file in the group, set to the file shares truck_shares_psrc = shares_psrc else: #add a new column for a new file truck_shares_psrc = pd.merge(truck_shares_psrc, shares_psrc, on = ["o","d"]) # merge psrc to bkr correspondence with percent tazGroups = pd.merge(truck_shares_psrc, tazShares, left_on = "o", right_on = "psrc_zone_id") tazGroups[file] = tazGroups[file] * tazGroups["percent"] # group by unique pair of bkr zone and group tazGroups_grouped = tazGroups.groupby(["bkr_zone_id"]) # calculate sum of percent by unique pair tazGroups_sum = tazGroups_grouped[files_group].sum() tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1) for file in files_group: tazGroups_sum[file] *= 1/tazGroups_sum['sum'] tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1) tazGroups_sum = tazGroups_sum.round(4) #round values to 4 decimal #temp = tazGroups_sum.ix[tazGroups_sum["sum"]>1.0] #debug: to find out rows that have sum value more than 1 tazGroups_sum = tazGroups_sum[files_group].reset_index() # makes object a data frame by setting the current index to a column tazGroups_sum["c"] = "all:" for file in files_group: tazGroups_bkr = tazGroups_sum[["bkr_zone_id", "c", file]] tazGroups_bkr = tazGroups_bkr.sort_values(by = ['bkr_zone_id'], ascending=[True]) # write - first header and then append the updated data outfile = file.split(".")[0] outfile = os.path.join(wd, outfile + "_bkr.in") #first write header headers[file].to_csv(outfile, sep = " ", header = False, index = False, quoting=csv.QUOTE_NONE, escapechar = " ") #had to add space as escapechar otherwise throws an error - not sure if that would cause any issue in the mdoel #write data with open(outfile, 'a') as wfile: tazGroups_bkr.to_csv(wfile, sep = " " , header = False, index = False)
def biscorr(maxitems=10, fig=None, ax=None):
    """Plot the biserial correlations of the lowest-correlating items."""
    biscorr49 = pandas.read_table('/home/ewout/Dropbox/RIRT/dsc49-biscorr.csv', header=None, names=['Q', 'biscorr'])
    biscorr89 = pandas.read_table('/home/ewout/Dropbox/RIRT/dsc89-biscorr.csv', header=None, names=['Q', 'biscorr'])
    # strip the leading 'V' from the item labels
    stripV = lambda s: s.lstrip('V')
    biscorr49['Q'] = biscorr49['Q'].apply(stripV)
    biscorr89['Q'] = biscorr89['Q'].apply(stripV)
    biscorr49 = biscorr49.sort_values(by='biscorr')
    biscorr89 = biscorr89.sort_values(by='biscorr')
    biscorr49 = biscorr49[0:maxitems]
    biscorr89 = biscorr89[0:maxitems]
    if not fig:
        fig = plt.figure()
    if not ax:
        ax = fig.add_subplot(111)
    ax.set_title(u"Correlação biserial")
    fig, ax = orderedfig(biscorr89['Q'], biscorr89['biscorr'], biscorr49['Q'], biscorr49['biscorr'], maxitems, fig, ax)
    ax.set_xlabel(u"")
    ax.set_ylim(0, 0.2)
    ax.set_xticks([])
    ax.text(0.5, -0.1, u"Item", clip_on=False, transform=ax.transAxes, ha='center')
    return fig, ax
def Cleaning(): nanoClean = pd.read_table("nanoflex clean.txt") nanoClean.columns = ['V', 'I'] macroClean = pd.read_table("clean.txt") macroClean.columns = ['V', 'I'] return nanoClean, macroClean
def ChronoAmp(): CA_Nano = pd.read_table("chronoampnano.txt") CA_Nano.columns = ['t', 'I'] CA_Macro = pd.read_table("chronoamp macro.txt") CA_Macro.columns = ['t', 'I'] return CA_Nano, CA_Macro
def main(args): logging.info("Reading sample info") sample_info = pd.read_table(args.sample_info, header=None, index_col=0, names=['avg_read_len']) logging.info("Reading gene lengths") gene_lengths = pd.read_table(args.gene_lengths, header=None, index_col=0, names=['gene_id','gene_length']) df = pd.DataFrame() for fn, sample_name in zip(args.coverage_files, args.sample_names): logging.info("Calculating TPM for "+ sample_name) ## Read counts per gene for sample rg = pd.read_table(fn, index_col=0, header=None, names=['gene_id', 'count']) ## Intersect with genes in the gene length file rg = rg.loc[list(set(gene_lengths.index).intersection(set(rg.index)))] gene_lengths = gene_lengths.loc[list(rg.index)] ## Average read length for sample rl = sample_info.ix[sample_name,'avg_read_len'] ## Calculate T for sample T = rl * rg['count'].divide(gene_lengths['gene_length']).sum() ## Calculate TPM for sample tpm = ((1e6*rl)/float(T))*(rg['count'].divide(gene_lengths['gene_length'])) ## Create dataframe TPM = pd.DataFrame(tpm,columns=[sample_name]) ## Concatenate to results df = pd.concat([df,TPM],axis=1) ## Write to file df.to_csv(sys.stdout, sep='\t') logging.info("Done")
def Comparison(): nanoComp = pd.read_table("nanoflex_comparison.txt") nanoComp.columns = ['V', 'I'] macroComp = pd.read_table("macro_comparison.txt") macroComp.columns = ['V', 'I'] return nanoComp, macroComp
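# A hedged example of plotting the voltammograms returned by the loaders above;
# the column meanings ('V' vs 'I') follow the functions as written, and
# matplotlib is assumed to be available.
import matplotlib.pyplot as plt

def plot_comparison():
    nanoComp, macroComp = Comparison()
    fig, ax = plt.subplots()
    ax.plot(nanoComp['V'], nanoComp['I'], label='nanoflex')
    ax.plot(macroComp['V'], macroComp['I'], label='macro')
    ax.set_xlabel('V')
    ax.set_ylabel('I')
    ax.legend()
    return fig, ax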
def main(args):
    # Import data
    logger.info("Importing Data")
    dat = pd.read_table(args.fname, comment="#")
    dat.set_index(args.uniqID, inplace=True)

    # Prepare Figure
    ## Title
    if args.title:
        title = args.title
    else:
        title = "{0} vs {1} vs {2}".format(args.x, args.y, args.z)

    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111, projection="3d")
    fig.suptitle(title)

    if args.xlab:
        xlab = args.xlab
    else:
        xlab = args.x

    if args.ylab:
        ylab = args.ylab
    else:
        ylab = args.y

    if args.zlab:
        zlab = args.zlab
    else:
        zlab = args.z

    # Make plots
    if args.dname and args.group:
        # If group information is given, color by group.
        design = pd.read_table(args.dname)
        design.set_index("sampleID", inplace=True)
        merged = dat.join(design, how="left")
        grp = merged.groupby(args.group)
        cmap = getColors(grp.indices.keys())

        for i, val in grp:
            c = cmap[i]
            xs = val[args.x]
            ys = val[args.y]
            zs = val[args.z]
            ax.scatter(xs, ys, zs, c=c, s=100, label=i)
        buildLegend(ax, cmap)
    else:
        # Else just plot.
        xs = dat[args.x]
        ys = dat[args.y]
        zs = dat[args.z]
        ax.scatter(xs, ys, zs, s=100)

    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    ax.set_zlabel(zlab)

    galaxySavefig(fig, args.fig)
def clean_import_scsnv(): """read each chr 1-22 and X/Y from the dbscSNV download into a dict of dataframes for further processing""" chrom_dict = {} cols = [0,1,2,3,16,17] col_names = ['CHROM', 'POS', 'REF', 'ALT', 'ada_score', 'rf_score'] for i in range(23): if i > 0: chrom_dict[str(i)] = pd.read_table('dbscSNV1.1.chr'+str(i), sep = '\t', na_values = '.', usecols=cols, names=col_names, header=0) chrom_dict.setdefault('X', pd.read_table('dbscSNV1.1.chrX', sep = '\t', na_values = '.', usecols=cols, names=col_names, header=0)) chrom_dict.setdefault('Y', pd.read_table('dbscSNV1.1.chrY', sep = '\t', na_values = '.', usecols=cols, names=col_names, header=0)) #for i in range(23): # if i > 0: # chrom_dict[str(i)] = pd.read_table('dbscSNV1.1.chr'+str(i), sep = '\t', # na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000) # #chrom_dict.setdefault('X', pd.read_table('dbscSNV1.1.chrX', sep = '\t', # na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000)) # #chrom_dict.setdefault('Y', pd.read_table('dbscSNV1.1.chrY', sep = '\t', # na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000)) # return chrom_dict
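# If a single table is more convenient downstream, the per-chromosome frames
# returned by clean_import_scsnv() can be concatenated; this sketch assumes the
# column layout is identical across chromosomes, which holds by construction.
import pandas as pd

chrom_dict = clean_import_scsnv()
scsnv_all = pd.concat(chrom_dict.values(), ignore_index=True)
print(scsnv_all.shape)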
def _parse_table(f, hdr, dao_type): if dao_type is not None and dao_type.read_cols is not None: # limit number of read cols df = pd.read_table(f, header=None, sep='\s+', usecols=range(dao_type.read_cols)) else: df = pd.read_table(f, header=None, sep='\s+') #df.insert(0, 'id', df.index.to_series()) if dao_type is None: dao_type = _guess_filetype(hdr, df) if dao_type == DAO.AP_FILE: # two row per star format correction odd = df.iloc[0::2] odd.columns = DAO.AP_FILE_ODD.columns[:odd.columns.size] even = df.iloc[1::2] even.columns = DAO.AP_FILE_EVEN.columns[:even.columns.size] even.index = odd.index df = odd.join(even, rsuffix='foo') else: df.columns = dao_type.columns[:df.columns.size] df.id = df.id.astype(int) df.index = df.id # find NaN for col in df.columns: coltype = _get_col_type(dao_type.extension, col) if coltype.NaN: df[col].replace(coltype.NaN, pd.np.nan, inplace=True) ret = StarList(df) ret.DAO_type = dao_type return ret
def get_mutation_data(gene_list, cancer_subtypes): """retrieve case-level data on mutations for given list of genes and cell lines """ base_url = 'http://www.cbioportal.org/webservice.do' genes = ' '.join(gene_list) subtypes = ' '.join(['%s_tcga_mutations' % c.lower() for c in cancer_subtypes]) parameters = {'cmd': 'getMutationData', 'gene_list': genes, 'genetic_profile_id': subtypes} r = requests.get(base_url, params=parameters) urlData = r.content error_message = 'Error: Problem when identifying'\ 'a cancer study for the request.\n' if urlData == error_message: df = pd.read_table(io.StringIO(urlData.decode('utf-8'))) else: df = pd.read_table(io.StringIO(urlData.decode('utf-8')), header=1) df = df[['gene_symbol', 'case_id', 'mutation_type', 'genetic_profile_id']] df = df[~((df.gene_symbol == 'Mutations') | ( df.gene_symbol == 'gene_symbol'))] df = df.dropna() return df
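# A hedged usage sketch for get_mutation_data(); the gene and subtype lists are
# illustrative only, and the cBioPortal web service must be reachable for the
# request to succeed.
genes = ['TP53', 'KRAS']
subtypes = ['brca', 'luad']
mutations = get_mutation_data(genes, subtypes)
print(mutations.groupby(['gene_symbol', 'genetic_profile_id']).size())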
def get_hist_and_rec(station_number): #create "artificial" wildcard path for historical data. For every station imaginable. histpath_temp = '/home/pythonproject/Weather/ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/daily/kl/historical/produkt_klima_Tageswerte_*' histpath_temp += str(station_number).zfill(5)+'.txt' #create "artificial" wildcard path for recent data. For the station we're looking at right now. recpath_temp = '/home/pythonproject/Weather/ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/daily/kl/recent/produkt_klima_Tageswerte_*' recpath_temp += str(station_number).zfill(5)+'.txt' #check if that path actually exists. Globglob checks if the histpath file actually exists. if len(glob.glob(histpath_temp)) != 0: #if file exists, save the path as a string to "histpath" variable. histpath = glob.glob(histpath_temp)[0] hist_ = pd.read_table(histpath, sep=";", low_memory=False) #is_hist = True else: #is_hist=False hist_ = [] #check if recent data exists if len(glob.glob(recpath_temp)) != 0: recpath = glob.glob(recpath_temp)[0] rec_ = pd.read_table(recpath, sep=";", low_memory=False) #is_rec = True else: #is_rec = False rec_ = [] return (hist_,rec_)
def main(): parser = argparse.ArgumentParser(description="Extract fasta file.") parser.add_argument('-DE',nargs=1,type=str,help="Table containing DE results.") parser.add_argument('-trinity',nargs=1,type=str,help="Trinity results.") parser.add_argument('-out',nargs=1,type=str,help="Out file.") args = parser.parse_args() #load tables DEResults = pandas.read_table(args.DE[0]) trinityResults = pandas.read_table(args.trinity[0]) #parse data temp = list(trinityResults['trans_derived']) temp = map(lambda x: x.split(':')[0],temp) trinityResults.index = temp filtered_trinityResults = trinityResults.ix[DEResults.index] topBlastHit = list(filtered_trinityResults['TopBlastHit']) uniprotID=[] for i in topBlastHit: if i is not '.': uniprotID.append(i.split('|')[1]) uniprotID = pandas.DataFrame(uniprotID) uniprotID.to_csv(args.out[0],sep='\n',header=False,index=False) sys.exit(0)
def test_pairwise(): train_pool = Pool(ZEN_TRAIN_FILE, column_description=ZEN_CD_FILE, pairs=ZEN_TRAIN_PAIRS_FILE) test_pool = Pool(ZEN_TEST_FILE, column_description=ZEN_CD_FILE, pairs=ZEN_TEST_PAIRS_FILE) model = CatBoost(params={'loss_function': 'PairLogit', 'random_seed': 0, 'iterations': 2, 'thread_count': 8}) model.fit(train_pool) pred1 = model.predict(test_pool) df = read_table(ZEN_TRAIN_FILE, delimiter='\t', header=None, dtype={12: str}) train_target = df.loc[:, 1] cat_features = range(13) train_data = df.drop([0, 1, 15], axis=1).astype(str) train_pairs = read_table(ZEN_TRAIN_PAIRS_FILE, delimiter='\t', header=None) df = read_table(ZEN_TEST_FILE, delimiter='\t', header=None, dtype={12: str}) test_data = df.drop([0, 1, 15], axis=1).astype(str) model.fit(train_data, train_target, cat_features, pairs=train_pairs) pred2 = model.predict(test_data) pairs_weight = np.ones(train_pairs.shape[0]) model.fit(train_data, train_target, cat_features, pairs=train_pairs, pairs_weight=pairs_weight) pred3 = model.predict(test_data) assert _check_data(pred1, pred2) assert _check_data(pred1, pred3)
def read_clinical_data(path, cancer):
    cancer = cancer.lower()
    na_vals = ['[Completed]', '[Not Available]', '[Not Applicable]', 'null']
    pat = pd.read_table(path + 'clinical_patient_{}.txt'.format(cancer),
                        index_col=0, skiprows=[0, 2], na_values=na_vals)
    f = pat.dropna(axis=1, how='all')
    for fu in os.listdir(path):
        if 'clinical_follow_up' not in fu:
            continue
        followup = pd.read_table(path + fu, index_col=0, skiprows=[0, 2],
                                 na_values=na_vals)
        f = pd.concat([f, followup])
    f.columns = f.columns.map(lambda s: s.replace('_', '').lower())
    time_vars = ['daystolastfollowup', 'daystolastknownalive', 'daystodeath']
    time_cols = list(f.columns.intersection(time_vars))
    # f['vitalstatus'] = f['vitalstatus'].map(lambda s: s in
    #                                         ['DECEASED', 'Dead', 'deceased'],
    #                                         na_action='skip')
    f['vitalstatus'] = f['daystodeath'].isnull()
    f = f.sort_values(by=['vitalstatus'] + time_cols, ascending=True)
    f = f.groupby(lambda s: s[:12], axis=0).last()
    return f
def main(args): '''Everything is defined here''' outdir = args.outdir run = os.path.abspath(outdir).split('/')[-1].split('virmet_output_')[1] try: os.chdir(outdir) except FileNotFoundError: sys.exit('Where is the output dir? Check the path.') sample_dirs = glob.glob('*_S*') all_reads = pd.DataFrame() all_orgs = pd.DataFrame() for sd in sample_dirs: # parse and save stat files stat_file = os.path.join(sd, 'stats.tsv') df = pd.read_table(stat_file, sep='\t', header=None, names=['category', 'reads']) df['sample'] = sd df['run'] = run all_reads = all_reads.append(df) # parse and save orgs_list files orgs_file = os.path.join(sd, 'orgs_list.tsv') df = pd.read_table(orgs_file, sep='\t', header=0) df['sample'] = sd df['run'] = run all_orgs = all_orgs.append(df) all_orgs.to_csv('orgs_species_found.tsv', sep='\t', index=False) all_reads.to_csv('run_reads_summary.tsv', sep='\t', index=False)
def get_tickers (): global _tickers if _tickers is not None: return _tickers url_NSDQ = "http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nasdaq&render=download" url_NYSE = "http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nyse&render=download" nsdq = pd.read_table(url_NSDQ,sep=",") nyse = pd.read_table(url_NYSE,sep=",") tickers = pd.concat([nsdq,nyse]) def dollar_to_int (dollar_string): try: parsed = int(float(dollar_string[1:-1])*1000) if dollar_string[-1] == 'B': parsed *= 1000 return parsed except: return np.NaN tickers = tickers.drop_duplicates("Name") tickers = tickers[["Symbol","MarketCap","Sector","industry"]] tickers.MarketCap = tickers.MarketCap.apply(dollar_to_int) tickers = tickers[np.isfinite(tickers.MarketCap)] _tickers = tickers.reset_index()[["Symbol","MarketCap","Sector","industry"]] return _tickers
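# Illustrative use of get_tickers(); the column names come from the function
# above and the market-cap threshold is arbitrary (dollar_to_int stores values
# in thousands of dollars). The NASDAQ screener URLs must still serve CSV for
# the download to work.
tickers = get_tickers()
large_caps = tickers[tickers.MarketCap >= 10000000]  # roughly >= $10B
print(large_caps.groupby('Sector').size().sort_values(ascending=False))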
def main(args): clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0) taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "bla", "bla1", "bla2"]) all_approved = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0) checkm_taxonomy = pd.read_table(args.checkm_taxonomy_file, index_col=0) all_approved_set = set(all_approved.index.values) unapproved_rrna = defaultdict(int) approved_rrna = {} levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] for rrna_contig in taxonomy_df.index.values: if rrna_contig in clustering.index: cluster_id = clustering.loc[rrna_contig]['cluster_id'] if cluster_id in all_approved_set: checkm_val = checkm_taxonomy.loc[cluster_id]['Taxonomy'].split(';') metaxa_val = taxonomy_df.loc[rrna_contig]['taxonomy'].split(';') metaxa_val = fix_strange_metaxa_vals(metaxa_val) matched_level = None for i, level in enumerate(levels): checkm_level_val, metaxa_level_val = None, None if len(checkm_val) > i and len(metaxa_val) > i: checkm_level_val = checkm_val[i][3:] metaxa_level_val = metaxa_val[i] if level == 'species': metaxa_level_val = metaxa_val[i].replace(' ', '_') if checkm_level_val == metaxa_level_val: matched_level = i else: break else: matched_level = i-1 break if cluster_id not in approved_rrna: approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0} if matched_level >= 3: approved_rrna[cluster_id]['matching'] += 1 else: approved_rrna[cluster_id]['not matching'] += 1 #print(most_detailed_level_checkm, most_detailed_level_metaxa) #print(most_detailed_matched_level) #print(taxonomy_df.loc[rrna_contig]['taxonomy'], checkm_taxonomy.loc[cluster_id]['Taxonomy']) else: unapproved_rrna[cluster_id] += 1 for cluster_id in all_approved_set: if cluster_id not in approved_rrna: approved_rrna[cluster_id] = {'matching': 0, 'not matching': 0} approved_stats_df = pd.DataFrame.from_dict(approved_rrna, orient='index') unapproved_stats_df = pd.DataFrame.from_dict(unapproved_rrna, orient='index') unapproved_stats_df.columns = ['nr_rrna'] print(approved_stats_df) print(unapproved_stats_df)
def threat_collect(): # Includes Malc0de, emerging threats and Zeus Tracker as examples url_malc0de = 'http://malc0de.com/bl/IP_Blacklist.txt' url_et = 'http://rules.emergingthreats.net/blockrules/compromised-ips.txt' url_zeus = 'https://zeustracker.abuse.ch/blocklist.php?download=ipblocklist' url_zeus_domains = 'https://zeustracker.abuse.ch/blocklist.php?download=domainblocklist' # Convert to DataFrames df_malc0de = pd.read_table(url_malc0de, index_col=None, skiprows=4, header=None, names=['actor']) df_et = pd.read_table(url_et, index_col=None, skiprows=0, header=None, names=['actor']) df_zeus = pd.read_table(url_zeus, index_col=None, skiprows=6, header=None, names=['actor']) df_zeus_domains = pd.read_table(url_zeus_domains, index_col=None, skiprows=6, header=None, names=['actor']) # Alternatively, put a bunch of threat intel CSVs in the "intel" directory # # Read all threat intel from intel folder # intel_path ='intel' # all = glob.glob(intel_path + "/*.csv") # ti_combine = pd.DataFrame() # ti_list_ = [] # for file_ in all: # new_frame = pd.read_csv(file_,index_col=None, header=0, names=['actor']) # ti_list_.append(new_frame) # ti_combine = pd.concat(ti_list_) # Combine dataframes ti_combine = pd.concat([df_malc0de, df_et, df_zeus, df_zeus_domains], axis=0) return ti_combine
def augmentOPUSfile(inputfile, mergefile, outputfile):
    '''augment an OPUS file with additional annotations, e.g. adding a column
    with segmented Sampa from Lexique to the French data, or segmented
    '''
    iff = pandas.read_table(inputfile, encoding='utf-8').dropna()
    mff = pandas.read_table(mergefile, encoding='utf-8').dropna()
    iff_m = iff.merge(mff, left_on="word", right_on="word")
    # write back tab-separated so the output keeps the same format as the input
    iff_m.to_csv(outputfile, sep='\t', index=False, encoding='utf-8')
def mergeSingleExpressionTables(infile, outfile): ''' Merge refcoding and lncRNA count tables from a single condition if there are separate input reference gtfs. ''' file1 = infile[0] file2 = infile[1] tmpfile = P.getTempFilename(shared=True) df1 = pd.read_table(file1, sep="\t", index_col=0, header=0, compression="gzip") df2 = pd.read_table(file2, sep="\t", index_col=0, header=0, compression="gzip") out_frame = df1.append(df2) out_frame.to_csv(tmpfile, sep="\t") statement = '''cat %(tmpfile)s | gzip > %(outfile)s; rm -rf %(tmpfile)s''' P.run()
def count_ddd_trios(families_path, trios_path, diagnosed_path): """ count the male and female probands in the complete DDD trios Args: families_path: path to DDD family relationships file, in ped format, containing proband IDs and sex information trios_path: path to table of probands in complete trios. diagnosed_path: path to table of probands with diagnoses Returns: tuple of male and female proband counts. """ # load proband information, then select the proband who have exome sequence # available for both parents. families = pandas.read_table(families_path, sep="\t") trios = pandas.read_table(trios_path, sep="\t") proband_ids = trios["proband_stable_id"] probands = families[families["individual_id"].isin(proband_ids)] # get the number of trios studied in our data for each sex sex = probands["sex"].value_counts() male = sex[["M"]] female = sex[["F"]] if diagnosed_path is not None: # remove probands in DDD, unless we are not using the DDD probands. diagnosed = pandas.read_table(diagnosed_path, sep="\t") diagnosed = diagnosed[~diagnosed[["person_id", "sex"]].duplicated()] male -= sum(diagnosed["sex"].isin(["Male", "male", "M", "m"])) female -= sum(diagnosed["sex"].isin(["Female", "female", "F", "f"])) return (male, female)
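# Illustrative call of count_ddd_trios(); the paths are placeholders for the
# DDD cohort files described in the docstring, and diagnosed_path may be None
# to keep all probands. The returned counts are single-element Series.
families_path = "ddd_family_relationships.ped"   # hypothetical path
trios_path = "ddd_complete_trios.txt"            # hypothetical path
diagnosed_path = "ddd_diagnosed_probands.txt"    # hypothetical path
male_n, female_n = count_ddd_trios(families_path, trios_path, diagnosed_path)
print("male probands:", male_n.iloc[0], "female probands:", female_n.iloc[0])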
def merge_for_appended(app_rep_path): #Kind of hack out the STEM and OUTPUT_DIR STEM = os.path.basename(app_rep_path).split('_naive_report_Appended')[0] OUTPUT_DIR = app_rep_path.split('/reports/')[0] naive_path = os.path.join(OUTPUT_DIR,'reports',STEM+'_naive_report.txt') glm_path = os.path.join(OUTPUT_DIR,'reports','glmReports',STEM+'_FUSION_W_ANOM_AND_INDEL_JUNCPOUT') appended_dir = os.path.join(OUTPUT_DIR,'reports','AppendedReports') if not os.path.exists(appended_dir): os.mkdir(appended_dir) appended_path = os.path.join(appended_dir,STEM+'_naive_report_Appended.txt') naive = pd.read_table(naive_path,sep='\t') glm = pd.read_table(glm_path,sep='\t') appended = pd.read_table(appended_path,sep='\t') #Rename the first naive column to match the first glm column naive.rename(columns={'@Junction':'junction'}, inplace=True) #Merge the two on their only shared column merged = pd.merge(naive,glm) #os.rename(app_rep_path,app_rep_path+'.old') out_path = STEM+'.txt.appended' merged.to_csv(out_path,sep='\t',index=False) return out_path
def clean_import_scsnv(): """read each chr 1-22 and X/Y from the dbscSNV download into a dict of dataframes for further processing""" chrom_dict = {} cols = [0, 1, 2, 3, 8, 16, 17] col_names = ["chr", "hg19_pos", "ref", "alt", "RefSeq_region", "ada_score", "rf_score"] for i in range(23): if i > 0: chrom_dict[str(i)] = pd.read_table( "dbscSNV1.1.chr" + str(i), sep="\t", na_values=".", usecols=cols, names=col_names, header=0 ) chrom_dict.setdefault( "X", pd.read_table("dbscSNV1.1.chrX", sep="\t", na_values=".", usecols=cols, names=col_names, header=0) ) chrom_dict.setdefault( "Y", pd.read_table("dbscSNV1.1.chrY", sep="\t", na_values=".", usecols=cols, names=col_names, header=0) ) # for i in range(23): # if i > 0: # chrom_dict[str(i)] = pd.read_table('dbscSNV1.1.chr'+str(i), sep = '\t', # na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000) # # chrom_dict.setdefault('X', pd.read_table('dbscSNV1.1.chrX', sep = '\t', # na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000)) # # chrom_dict.setdefault('Y', pd.read_table('dbscSNV1.1.chrY', sep = '\t', # na_values = '.', usecols=cols, names=col_names, header=0, nrows=10000)) return chrom_dict
import pandas as pd import numpy as np y_combined = pd.read_table('CancerTypes_y.txt', sep='\t', header=None) x_combined = pd.read_csv('Combined_processed.csv', header=0) from sklearn.model_selection import train_test_split, GridSearchCV from keras.models import Sequential from keras.layers import Dense, Dropout from keras.wrappers.scikit_learn import KerasClassifier import matplotlib.pyplot as plt #Split data into training and test sets x_combined_train, x_combined_test, y_combined_train, y_combined_test = train_test_split( x_combined, y_combined.values.flatten(), test_size=0.25, random_state=0) model = Sequential() model.add(Dense(512, activation='relu')) model.add(Dense(128, activation='relu')) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(x_combined_train.values, y_combined_train, epochs=20, batch_size=128) print(model.evaluate(x_combined_test, y_combined_test)) print(model.metrics_names)
def rating_stantistics(inputpath, outputpath): cm_data_raw = pd.read_table(inputpath, sep=',', encoding='utf-8') #对user action data 进行统计处理。 newCust = pd.DataFrame(columns=[ "userid", 'totalrate', 'totalnumber', 'averate', 'numberof1', 'numberof2', 'numberof3', 'numberof367', 'numberof433', 'numberof4', 'numberof5', 'lowrate', 'highrate' ]) idlist = [] for i in range(0, len(cm_data_raw)): record = cm_data_raw.iloc[i] uid = record['userid'] if uid not in idlist: idlist.append(uid) udata = cm_data_raw[cm_data_raw['userid'] == uid] udata = sort(udata, ["orderid"], ascending=False) #记录总分数 totalrate = 0 totalnumber = 0 averate = 0 numberof5 = 0 numberof1 = 0 numberof2 = 0 numberof3 = 0 numberof367 = 0 numberof433 = 0 numberof4 = 0 #3分及其以下为low rate,求其数量 lowrate = 0 #4分及其以上为高分 highrate = 0 #对一个用户评分信息进行统计 for j in range(0, len(udata)): oudata = udata.iloc[j] totalrate = totalrate + oudata['rating'] totalnumber = totalnumber + 1 if j == 0: nr = oudata['rating'] #统计用户评分 if oudata['rating'] == 1: numberof1 = numberof1 + 1 lowrate = lowrate + 1 if oudata['rating'] == 2: numberof2 = numberof2 + 1 lowrate = lowrate + 1 if oudata['rating'] == 3: numberof3 = numberof3 + 1 lowrate = lowrate + 1 if oudata['rating'] == 3.67: numberof367 = numberof367 + 1 if oudata['rating'] == 4.33: numberof367 = numberof433 + 1 if oudata['rating'] == 4: numberof4 = numberof4 + 1 if oudata['rating'] == 5: numberof5 = numberof5 + 1 averate = totalrate / totalnumber lowrate = numberof1 + numberof2 + numberof3 highrate = numberof4 + numberof5 + numberof433 finalud = { "userid": uid, 'totalrate': totalrate, 'totalnumber': totalnumber, 'averate': averate, 'numberof1': numberof1, 'numberof2': numberof2, 'numberof3': numberof3, 'numberof367': numberof367, 'numberof433': numberof433, 'numberof4': numberof4, 'numberof5': numberof5, 'lowrate': lowrate, 'highrate': highrate } newCust = newCust.append(finalud, ignore_index=True) newCust.to_csv(outputpath) return
print(detail['dishes_name'].describe())
###############################################################################
#######################        Task implementation        ####################
###############################################################################
# Code 4-38
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine('mysql+pymysql://root:[email protected]:\
3306/testdb?charset=utf8')
detail = pd.read_sql_table('meal_order_detail1', con = engine)
order = pd.read_table('../data/meal_order_info.csv', sep = ',', encoding = 'gbk')
user = pd.read_excel('../data/users.xlsx')
print('Dimensions (ndim) of the order detail table:', detail.ndim)
print('Dimensions (ndim) of the order info table:', order.ndim)
print('Dimensions (ndim) of the customer info table:', user.ndim)

print('Shape of the order detail table:', detail.shape)
print('Shape of the order info table:', order.shape)
print('Shape of the customer info table:', user.shape)

print('Number of elements in the order detail table:', detail.size)
print('Number of elements in the order info table:', order.size)
print('Number of elements in the customer info table:', user.size)

# Code 4-39
def read_table(*args, **kwargs): return pd.read_table(*args, **kwargs)
# %% # ** MODIFY ** # Set the file name and path to where you have stored the data filename = 'streamflow_week4.txt' filepath = os.path.join('data', filename) print(os.getcwd()) print(filepath) # %% # DON'T change this part -- this creates the lists you # should use for the rest of the assignment # no need to worry about how this is being done now we will cover # this in later sections. #Read the data into a pandas dataframe data=pd.read_table(filepath, sep = '\t', skiprows=30, names=['agency_cd', 'site_no', 'datetime', 'flow', 'code'] ) # Expand the dates to year month day data[["year", "month", "day"]] =data["datetime"].str.split("-", expand=True) data['year'] = data['year'].astype(int) data['month'] = data['month'].astype(int) data['day'] = data['day'].astype(int) # Make a numpy array of this data flow_data = data[['year', 'month','day', 'flow']].to_numpy() # Getting rid of the pandas dataframe since we wont be using it this week del(data) # Jill Question Answering Code #%%
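# %%
# A small sketch of working with the flow_data array built above: columns are
# year, month, day, flow, so monthly statistics can be pulled with boolean
# masks. The example year/month values are arbitrary.
import numpy as np

month_mask = (flow_data[:, 0] == 2019) & (flow_data[:, 1] == 9)
if month_mask.any():
    print("Sept 2019 mean flow:", np.mean(flow_data[month_mask, 3]))
    print("Sept 2019 min flow:", np.min(flow_data[month_mask, 3]))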
import matplotlib import matplotlib.pyplot as plt from matplotlib.colors import BoundaryNorm from matplotlib.ticker import MaxNLocator import numpy as np import pandas as pd from scipy.spatial import KDTree from scipy.stats import gaussian_kde from scipy.interpolate import Rbf #x=np.loadtxt('', usecols=(0)) #y=np.loadtxt('', usecols=(1)) #z=np.loadtxt('', usecols=(2)) df_points = pd.read_table("wdata.dat", sep="\s+", usecols=[0, 1, 3], header=None) df_points.columns = ['vx', 'vy', 'Nex'] levels = MaxNLocator(nbins=15).tick_values(df_points.Nex.min(), df_points.Nex.max()) # pick the desired colormap, sensible levels, and define a normalization # instance which takes data values and translates those into levels. cmap = plt.get_cmap('RdBu') #cmap = plt.get_cmap('seismic') normal = BoundaryNorm(levels, ncolors=cmap.N, clip=True) GSIZE = 1000 X, Y = np.mgrid[df_points.vx.min():df_points.vx.max():GSIZE * 1j, df_points.vy.min():df_points.vy.max():GSIZE * 1j]
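# The script above ends before the interpolation step; a plausible continuation,
# sketched under the assumption that the aim is to interpolate Nex onto the grid
# with the Rbf already imported and to draw it with the colormap/norm defined above.
rbf = Rbf(df_points.vx, df_points.vy, df_points.Nex, function='linear')
Z_grid = rbf(X, Y)

fig, ax = plt.subplots()
mesh = ax.pcolormesh(X, Y, Z_grid, cmap=cmap, norm=normal, shading='auto')
fig.colorbar(mesh, ax=ax, label='Nex')
ax.set_xlabel('vx')
ax.set_ylabel('vy')
plt.show()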
df_input = pd.read_excel(input_file) #Storing UniProt as an easier to type variable 'u' u = UniProt() #Using a built-in UniProt method to create a data frame containg everything in UniProt df_uniprot = u.get_df("organism:9606+and+reviewed:yes") #Rename the common column to match the inputted column df_uniprot.rename(columns={'Gene names (primary )':'Gene Symbol', 'Entry':'UniProt Symbol', 'Proteomes':'Chromosome Number', 'Length':'Protein Length'}, inplace = True) #Selecting columns I think are interesting df1_uniprot = df_uniprot[['UniProt Symbol', 'Gene names', 'Gene Symbol', 'Protein names', 'Chromosome Number', 'Sequence', 'Protein Length', 'Function [CC]', 'Gene ontology (GO)', 'Gene ontology (biological process)', 'Gene ontology (molecular function)', 'Gene ontology (cellular component)', 'Protein families']] #Converting the NCBI gene list to a pandas data frame df_ncbi = pd.read_table('NCBI_GeneID_File.txt') #Merge the data frames on the UniProt df_merged = df_input.merge(df1_uniprot, how='outer', on = 'Gene Symbol') df_merged2 = df_merged.merge(df_ncbi, how = 'outer', on = 'Gene Symbol') #Writing the data frame to an Excel file out_file = pd.ExcelWriter('CRISPR_Uniprot_NCBI_DataFrame3.xlsx', engine = 'xlsxwriter') df_merged2.to_excel(out_file) out_file.close()
def actual_data(): with open("data/actual_data.dat") as data: read_data = pd.read_table(data) return read_data
def invalid_data(): with open("data/invalid_data.dat") as data: read_data = pd.read_table(data) return read_data
parser.add_argument('--model_file', '-mf', type=str, default='stupidvae.pkl', help='Save model filename') parser.add_argument('--init_stdev', '-sd', type=float, default=0.01, help='Weight init stdev') args = parser.parse_args() expn_pth = '/n/data_02/Basset/data/expn/roadmap/57epigenomes.RPKM.pc' print("Reading gene expression data from:\n{}".format(expn_pth)) # Gene expression dataset expn = pd.read_table(expn_pth, header=0) col_names = expn.columns.values[1:] expn = expn.drop(col_names[-1], axis=1) # 19795*57 right now # TODO: is this all right? expn.columns = col_names pinned_lookup = torch.nn.Embedding.from_pretrained(torch.FloatTensor( expn.as_matrix().T[1:]), freeze=True) # [1:] is new! pinned_lookup.cuda() torch.manual_seed(3435) imgs = torch.poisson(pinned_lookup.weight) # discretize data # imgs = pinned_lookup.weight.round() # imgs = pinned_lookup.weight dat = torch.utils.data.TensorDataset(imgs, torch.zeros( 56, 1)) # placeholder arg required pytorch <0.4.0...
'peso': 'float64', 'duration': 'O', 'carteira_a_mercado': 'O', 'numero_operacoes': 'float64', 'quant_negociada_titulos': 'float64', 'valor_negociado': 'float64', 'pmr': 'O', 'convexidade': 'float64', 'yield': 'float64', 'redemption_yield': 'float64' } nomes_validos = list(valid_dtypes.keys()) # lista de user agents uas = pd.read_table('input/user-agents.txt',names=['ua'],skiprows=4,squeeze=True) # lista de feriados anbima fer = pd.read_excel('input/feriados_nacionais.xls',skipfooter=9, usecols=['Data'], parse_dates=['Data'], squeeze=True) bday = pd.offsets.CDay(holidays=fer) def get_indices_anbima(dt, wait=True): """ dt: str '%d/%m/%Y' ou dt obj """ if wait: if isinstance(wait,bool): wait = random.randint(1,3) sleep(wait) headers = {"User-Agent": np.random.choice(uas)}
def control_data(): with open("data/control_data.dat") as data: read_data = pd.read_table(data) return read_data
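# The three loaders above read fixture files from the data/ directory; a hedged
# sketch of a smoke test exercising them, assuming the valid .dat files parse
# into non-empty tables.
def test_fixture_files_load():
    assert not actual_data().empty
    assert not control_data().empty
    assert invalid_data() is not None  # contents may be malformed, so only check it loads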
def load(cls, path, prefix, network=None): r""" Load data from the \'dat\' files located in specified folder. Parameters ---------- path : string The full path to the folder containing the set of \'dat\' files. prefix : string The file name prefix on each file. The data files are stored as \<prefix\>_node1.dat. network : OpenPNM Network Object If given then the data will be loaded on it and returned. If not given, a Network will be created and returned. Returns ------- An OpenPNM Project containing a GenericNetwork holding all the data """ net = {} # --------------------------------------------------------------------- # Parse the link1 file path = Path(path) filename = Path(path.resolve(), prefix+'_link1.dat') with open(filename, mode='r') as f: link1 = read_table(filepath_or_buffer=f, header=None, skiprows=1, sep=' ', skipinitialspace=True, index_col=0) link1.columns = ['throat.pore1', 'throat.pore2', 'throat.radius', 'throat.shape_factor', 'throat.total_length'] # Add link1 props to net net['throat.conns'] = sp.vstack((link1['throat.pore1']-1, link1['throat.pore2']-1)).T net['throat.conns'] = sp.sort(net['throat.conns'], axis=1) net['throat.radius'] = sp.array(link1['throat.radius']) net['throat.shape_factor'] = sp.array(link1['throat.shape_factor']) net['throat.total_length'] = sp.array(link1['throat.total_length']) # --------------------------------------------------------------------- filename = Path(path.resolve(), prefix+'_link2.dat') with open(filename, mode='r') as f: link2 = read_table(filepath_or_buffer=f, header=None, sep=' ', skipinitialspace=True, index_col=0) link2.columns = ['throat.pore1', 'throat.pore2', 'throat.pore1_length', 'throat.pore2_length', 'throat.length', 'throat.volume', 'throat.clay_volume'] # Add link2 props to net cl_t = sp.array(link2['throat.length']) net['throat.length'] = cl_t net['throat.conduit_lengths.throat'] = cl_t net['throat.volume'] = sp.array(link2['throat.volume']) cl_p1 = sp.array(link2['throat.pore1_length']) net['throat.conduit_lengths.pore1'] = cl_p1 cl_p2 = sp.array(link2['throat.pore2_length']) net['throat.conduit_lengths.pore2'] = cl_p2 net['throat.clay_volume'] = sp.array(link2['throat.clay_volume']) # --------------------------------------------------------------------- # Parse the node1 file filename = Path(path.resolve(), prefix+'_node1.dat') with open(filename, mode='r') as f: row_0 = f.readline().split() num_lines = int(row_0[0]) array = sp.ndarray([num_lines, 6]) for i in range(num_lines): row = f.readline()\ .replace('\t', ' ').replace('\n', ' ').split() array[i, :] = row[0:6] node1 = DataFrame(array[:, [1, 2, 3, 4]]) node1.columns = ['pore.x_coord', 'pore.y_coord', 'pore.z_coord', 'pore.coordination_number'] # Add node1 props to net net['pore.coords'] = sp.vstack((node1['pore.x_coord'], node1['pore.y_coord'], node1['pore.z_coord'])).T # --------------------------------------------------------------------- # Parse the node1 file filename = Path(path.resolve(), prefix+'_node2.dat') with open(filename, mode='r') as f: node2 = read_table(filepath_or_buffer=f, header=None, sep=' ', skipinitialspace=True, index_col=0) node2.columns = ['pore.volume', 'pore.radius', 'pore.shape_factor', 'pore.clay_volume'] # Add node2 props to net net['pore.volume'] = sp.array(node2['pore.volume']) net['pore.radius'] = sp.array(node2['pore.radius']) net['pore.shape_factor'] = sp.array(node2['pore.shape_factor']) net['pore.clay_volume'] = sp.array(node2['pore.clay_volume']) net['throat.area'] = ((net['throat.radius']**2) / 
(4.0*net['throat.shape_factor'])) net['pore.area'] = ((net['pore.radius']**2) / (4.0*net['pore.shape_factor'])) if network is None: network = GenericNetwork() network = cls._update_network(network=network, net=net) # Use OpenPNM Tools to clean up network # Trim throats connected to 'inlet' or 'outlet' reservoirs trim1 = sp.where(sp.any(net['throat.conns'] == -1, axis=1))[0] # Apply 'outlet' label to these pores outlets = network['throat.conns'][trim1, 1] network['pore.outlets'] = False network['pore.outlets'][outlets] = True trim2 = sp.where(sp.any(net['throat.conns'] == -2, axis=1))[0] # Apply 'inlet' label to these pores inlets = network['throat.conns'][trim2, 1] network['pore.inlets'] = False network['pore.inlets'][inlets] = True # Now trim the throats to_trim = sp.hstack([trim1, trim2]) trim(network=network, throats=to_trim) return network.project
import matplotlib.pyplot as plt import pandas as pd import seaborn as sns Z = pd.read_table("../hotspot/hotspot_pairs_z.txt.gz", index_col=0) hs_results = pd.read_table("../hotspot/hotspot.txt", index_col=0) # Cluster things! # to_drop = ['mt-Rnr1', 'mt-Rnr2'] # Z = Z.drop(to_drop, axis=1).drop(to_drop, axis=0) sns.clustermap(Z, vmin=-2, vmax=2, metric='correlation', yticklabels=True, method='average') plt.show() # Now cluster and divide from scipy.cluster.hierarchy import linkage, fcluster, dendrogram def sort_clusters(cl): map_fun = { old_i: new_i + 1 for new_i, old_i in enumerate(cl.value_counts().index) } cl = cl.map(map_fun)
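# The snippet is cut off after sort_clusters; a hedged sketch of the likely next
# step: cut an average-linkage hierarchy over Z into modules with fcluster.
# sort_clusters (once its definition is completed to return the mapped series)
# can then renumber the modules by size. The number of clusters (t=4) is arbitrary.
Z_link = linkage(Z.values, method='average', metric='correlation')
modules = pd.Series(fcluster(Z_link, t=4, criterion='maxclust'),
                    index=Z.index, name='module')
print(modules.value_counts())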
# BUILD STATION DATABASE stas = ",".join([sta._code for sta in inventory[0].stations]) stdb_out = BNG_out+'sta_list' !{path2envbin+'query_fdsn_stdb.py'} -N {network} -C {compstr} -S {stas} {stdb_out} # %% codecell # Perform BNG analysis stdb_pkl = stdb_out+'.pkl' !{path2envbin+'bng_calc_auto'} --times=-5.,15. --window=60. --bp=0.04,0.1 --min-mag={minmagnitude} --min-dist={mindist} --save-location {BNG_out} {stdb_pkl} # %% codecell # Plot BNG output and save !{path2envbin+'bng_average'} --load-location {BNG_out} --plot --save {stdb_pkl} # !{path2envbin+'bng_average'} --load-location {BNG_out} {stdb_pkl} # %% codecell # Combine all measurements into single file pathlist = sorted(Path(BNG_out).glob('*/orientation_bng.txt')) file = open(BNG_out+'/orientations_BNG.txt', 'w') file.write("%8s %10s %10s %5s\n" % ('sta', 'phi', 'err', 'num')) for path in pathlist: data = pd.read_table(path, delim_whitespace=True) sta = data.sta[0] phi = data.phi[0] err = data.err[0] num = data.num[0] file.write("%8s %10f %10f %5d\n" % (sta, phi, err, num)) file.close()
# In[1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# ### For each gene, output the exon regions as "start-end,start-end,…"; illustrated here with NIPBL.

# In[2]:

UCSC = pd.read_table("refFlat.hg38.txt", skiprows=1,
                     names=('geneName', 'name', 'chrom', 'strand', 'txStart', 'txEnd',
                            'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds'))
UCSC
# If the file is read as-is, the first column ends up named "#geneName".
# So the header row is skipped and the column names are assigned manually via names.

# In[3]:

# Take the rows whose "geneName" value equals NIPBL as a DataFrame.
# Without .copy(), an attribute error occurs later.
NIPBL_UCSC_df = UCSC.query("geneName == 'NIPBL'").copy()
NIPBL_UCSC_df
# Two records with different cdsEnd values are shown.
def run_model(src, dst, clinopyroxene=False): with app.app_context(): data = sample_ree(normalized=True, mode='cpx' if clinopyroxene else 'whole_rock') colors = sample_colors() model = DepletionModel(src) if clinopyroxene: depleted = model.fit_HREE(data, table='clinopyroxene_0 trace') else: depleted = model.fit_HREE(data) enrichment, multiplier = model.enrichment(data, depleted) # Create primitive-mantle normalized dataset Sun_PM = get_melts_data('literature/Sun_McDonough_PM.melts') PM_trace = Sun_PM.trace.ix[:, 0] # Add NMORB NMORB = get_melts_data('literature/NMORB_trace.melts') NMORB_trace = ree_only(NMORB.trace.transpose() / PM_trace) # Alkali basalt alkali = read_table('literature/Farmer_1995-Alkali-basalt.txt', comment="#", index_col=0) alkali /= PM_trace alkali_trace = ree_only(alkali) vals = [element(i) for i in data.columns] d = ree_only(depleted) grid = dict(height_ratios=(4.5, 1), hspace=0.1, right=0.99, left=0.16) fig, (ax1, ax2) = subplots(2, 1, figsize=(3.5, 6), gridspec_kw=grid) def create_main_axis(ax): for i, row in d.iterrows(): c = colors.ix[row.name][0] # Plot real data series = data.ix[row.name] u = series.map(lambda x: x.n) s = series.map(lambda x: x.s) ax.fill_between(vals, u - s, u + s, facecolor=c, edgecolor='none', alpha=0.2) def plot(name, x, y, **kwargs): if i == 'CK-3': kwargs['label'] = name else: kwargs['label'] = "" p = ax.plot(x, y, color=c, **kwargs) if clinopyroxene: s = 'clinopyroxene' else: s = 'whole-rock' plot('Measured ' + s, vals, u) # Plot calculated best fit plot("Modeled depleted", d.columns, row, linestyle='--', linewidth=1) v = enrichment.ix[row.name] if i == 'CK-2': # Don't include CK-2 because it isn't depleted, so results are spurious. continue plot("Enriching melt", d.columns, v, linestyle=':', linewidth=1) # Plot NMORB ax.fill_between(NMORB_trace.columns, NMORB_trace.ix[0, :], NMORB_trace.ix[0, :] - 0.5, color='#bbbbbb', linewidth=1.5, zorder=-5, label="") ax.fill_between(alkali_trace.columns, alkali_trace.min(), alkali_trace.max(), facecolor='#dddddd', edgecolor='none', zorder=-10, label="") ax.set_ylim(.01, 100) ax.set_xlim(element('La') - 0.1, element('Lu')) ax.yaxis.set_ticklabels( ["{:g}".format(v) for v in ax.yaxis.get_ticklocs()]) ax.set_ylabel("Rare-earth element abundance / Primitive Mantle") ax.xaxis.set_ticks(vals) ax.xaxis.set_ticklabels(data.columns) ax.set_yscale('log') ax.text(element('Ce') - 0.5, 40, "Alkali basalt", rotation=-28, color='#888888') ax.text(element('La'), 5, "NMORB", rotation=15, color='#888888') legend = ax.legend(loc="upper right") fr = legend.get_frame() fr.set_lw(0.5) create_main_axis(ax1) update_axes(ax1) fig.subplots_adjust(top=0.99, right=0.99) ree_scatter(ax2, model, data, colors) ax2.set_ylim([0, 1.2]) ax2.set_xlabel(r'HREE depletion degrees (%)') ax2.set_ylabel("Enriching fluid\nassimilated (%)") ax2.yaxis.set_label_coords(-0.1, 0.22) update_axes(ax2) axis_labels(ax1, ax2, pad=.16, fontsize=14) fig.savefig(dst, bbox_inches='tight')
import pandas as pd import os, sys if len(sys.argv) != 2: print('Error: No task specified') print('e.g. separate-files 2_back_vs_0_back') sys.exit(1) if not os.path.exists('subjects'): os.makedirs('subjects') for run in {'1', '2'}: subjectFile = 'taskBOLD_{0}_run_{1}-rh.csv'.format(sys.argv[1], run) print('Reading {0}'.format(subjectFile)) df = pd.read_table(subjectFile, header=0, sep=',', index_col=0) nSubjects = len(df.index) print('{0} subjects'.format(nSubjects)) f = open('subjects-{0}.txt'.format(run), 'w') for index in range(0, nSubjects): subjectID = 'NDAR_' + df.iloc[index, 0][5:16] outFile = 'subjects/' + subjectID + '-' + run + '.dscalar.nii' cmd = 'wb_command -cifti-merge {0} -cifti {1}_run{2}_sm5.dscalar.nii -column {3}'.format( outFile, sys.argv[1], run, index + 1) print(cmd) os.system(cmd)
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def encodeFeatures(df):
    # label-encode all categorical/object columns in place
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
    le = LabelEncoder()
    for feature in columnsToEncode:
        try:
            df[feature] = le.fit_transform(df[feature])
        except:
            print('Error encoding ' + feature)
    return df


df = pd.read_table(
    "german.data",
    header=None,
    sep=' ',
    names=[
        'chkngAcctStatus', 'durationMonths', 'creditHistory', 'loanPurpose',
        'creditAmount', 'savingsTotal', 'crrntEmplmtSince', 'instllmtPct',
        'persnlStatus', 'othrDebtorGuaranters', 'crrntResidenceSince',
        'propertyType', 'age', 'otherInstllmtType', 'housingType',
        'existingCredits', 'jobStatus', 'numDependents', 'registeredPhone',
        'foriegnWorker', 'goodBad'
    ])
df['goodBad'] = df["goodBad"] - 1
dfPred = encodeFeatures(df)
predictors = dfPred.drop('goodBad', axis=1)
targets = df['goodBad']
np.random.seed(123)
# Compute and print the standard deviation of the mean ratings given by male
# and female users in the MovieLens 100k dataset.
import pandas as pd

unames = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('ml-100k/u.user', sep='|', header=None, names=unames, engine='python')
rnames = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-100k/u.data', sep='\t', header=None, names=rnames, engine='python')
data = pd.merge(users, ratings)

mean_ratings = data.pivot_table('rating', index=['user_id', 'gender'], aggfunc='mean')
std = mean_ratings.groupby('gender').std()
print(std)

# mean_ratings = data.pivot_table('rating', index='user_id', columns='gender', aggfunc='mean')
# female_ratings = mean_ratings['F']
# female_ratings_std = female_ratings.std()
#
# male_ratings = mean_ratings['M']
# male_ratings_std = male_ratings.std()
#
# print('Gender')
# print('M %.2f' % male_ratings_std)
def read_table(self, *args, **kwargs): kwargs = self.update_kwargs(kwargs) return read_table(*args, **kwargs)
import pandas as pd data_frame = pd.read_table("popular-names.txt", header=None) print(data_frame.sort_values(2, ascending=False))
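# A hedged follow-on: writing the sorted rows back out as tab-separated text in
# the same headerless layout as the input; the output filename is illustrative.
sorted_frame = data_frame.sort_values(2, ascending=False)
sorted_frame.to_csv("popular-names-sorted.txt", sep="\t", header=False, index=False)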
# Import data_processing from numpy_regressor.data_processing import DataProcessing # Import Pandas import pandas as pd # Import Bokeh import bokeh from bokeh.plotting import figure, show from bokeh.palettes import d3 # Getting the data from uci data repo airfoil_df = pd.read_table( filepath_or_buffer= "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat", names=[ "Frequency", "Angle of attack", "Chord length", "Free-stream velocity", "Suction side displacement", "Scaled sound pressure" ]) # Creating class objects airfoil_regressor = Regression() airfoil_data_process = DataProcessing() # Splitting up train and test set airfoil_df_train, airfoil_df_test = airfoil_data_process.train_test_split( airfoil_df) # Calling tbe regression function to get the prediction prediction = airfoil_regressor.my_regression(airfoil_df_train, airfoil_df_test.iloc[:, 0:-1], 1)
import scipy.sparse as sp import numpy as np import itertools as it from sklearn.feature_extraction.text import TfidfTransformer from sklearn.metrics.pairwise import cosine_similarity from statistics import mean from time import time, ctime from math import ceil import warnings warnings.filterwarnings("ignore") data_path = "~/Desktop/" add_path = "C:\\Users\\Kyle\\OneDrive\\Documents\\GMU Classes\\CS 584\\HW4_Jackson_Truong\\data\\" #reads in all the data as matrices, we only use test, train, and movie_tag, though test_array = pd.read_table(add_path+"test.dat", skip_blank_lines=False, \ delim_whitespace=True).as_matrix() train_array = pd.read_table(add_path+"train.dat", skip_blank_lines=False, \ delim_whitespace=True).as_matrix() genre_array = pd.read_table(add_path+"movie_genres.dat", skip_blank_lines=False, \ delim_whitespace=True).as_matrix() movie_tag_array = pd.read_table(add_path+"movie_tags.dat", \ skip_blank_lines=False).as_matrix() actor_array = pd.read_table(add_path+"movie_actors.dat", \ skip_blank_lines=False).as_matrix() actor_array = np.delete(actor_array, 2, 1) director_array = pd.read_table(add_path+"movie_directors.dat", \ skip_blank_lines=False).as_matrix()[:,0:2] #%% #this section forms dicts to reindex the various IDs, to reduce dimensionality
def validateFacilityData(facility, src): ocfg = FACMETADATA[facility] oopts = ocfg.get('options', {}) enc, sep = oopts.get('enc', DEF_ENCODING), oopts.get('sep', DEF_SEP) #if not osp.exists(src): # raise FileNotFoundError('input file %s not found - nothing to check' % src) try: df = pd.read_csv(src, encoding = enc, sep = sep) #except FileNotFoundError: # we tested that already... # raise FileNotFoundError('input file %s not found - nothing to check' % src) except: try: df = pd.read_table(src, encoding = enc, sep = sep, compression = 'infer') except: raise IOError("Impossible to load source data - format not recognised") oindex = ocfg.get('index',{}).copy() nindex = [col.get('name') for col in oindex.values()] try: columns = set(list(df.columns)).difference(set(nindex)) assert columns == set() except AssertionError: raise IOError("Unknown column present in the dataframe: '%s'" % list(columns)) else: try: columns = set(list(nindex)).difference(set(df.columns)) assert columns == set() except AssertionError: logging.warning("\n! Missing columns in source file: '%s' !" % list(columns)) nindex = {col.get('name'): col for col in oindex.values()} for col in df.columns: # check missing values try: assert df[col].isnull().any() is np.bool_(False) except AssertionError: try: assert df[col].isnull().all() is np.bool_(False) except AssertionError: logging.warning("\n! Column '%s' empty - missing values only !" % col) continue else: # logging.warning("\n! No missing values in column '%s' !" % col) pass # check type dtype = nindex[col].get('type') if dtype == 'str': pass # elif dtype is not None: try: assert df[col].dtype==object or df[col].dtype in Type.pytname2npt(dtype) # and dtype != object except AssertionError: logging.warning("\n! Unexpected type '%s' for column '%s' !" % (df[col].dtype,col)) # check values/format dfmt = values = nindex[col].get('values') if values is not None: # check values range if dtype == "datetime": # check date format try: pd.to_datetime(df[col], format=dfmt, errors='coerce').notnull().all() is True except AssertionError: logging.warning("\n! Unexpected date format for column '%s' !" % col) else: try: values = [values,] if not isinstance(values, Sequence) else values assert df[col].dropna().isin(values).all() except AssertionError: raise IOError("Wrong input values in column '%s'" % col) # check id uniquiness try: # note the use of INDEX here, not nindex, though the names end up being # the same assert df[oindex.get('id',{})['name']].dropna().is_unique is True except AssertionError: raise IOError("Duplicated identifier IDs") # check geographical coordinates for lL in ['lat','lon']: col = oindex.get(lL,{})['name'] if col in df.columns: try: assert (df[col] .dropna() .between(MINMAX_LL[lL][0],MINMAX_LL[lL][1]) .all()) is np.bool_(True) except AssertionError: raise IOError("Wrong input values for %s geographical coordinate '%s'" % lL)
"""
Created on Mon Aug 19 22:07:40 2019

@author: USER
"""

# Heart disease diagnosis
# Using an ANN classifier

# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data preprocessing
dataset = pd.read_table('processed.cleveland.data', sep=',', header=None)
X = dataset.iloc[:, :-1]
y_class = dataset.iloc[:, -1]
y = [item > 0 for item in y_class]

# Replacing missing values ('?') with the most frequent value
X[11].value_counts()  # inspect the distribution before imputing
X[11] = X[11].map({'?': 0, '1.0': 1.0, '2.0': 2.0, '3.0': 3.0, '0.0': 0.0})
X[12] = X[12].map({'6.0': 6.0, '3.0': 3.0, '7.0': 7.0, '?': 3.0})
X = X.values

# Handling categorical variables
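The snippet breaks off at the categorical-variable step. A minimal sketch of one common way to finish it with pandas one-hot encoding; the chosen column positions (2 = chest pain type, 6 = resting ECG, 10 = slope, 12 = thal) are assumptions about the Cleveland layout, not taken from the original code:

```python
# Hypothetical continuation: one-hot encode the categorical columns so the ANN
# treats them as indicator features rather than ordinal numbers.
X_df = pd.DataFrame(X)
categorical_cols = [2, 6, 10, 12]  # assumed categorical columns of the Cleveland data
X_encoded = pd.get_dummies(X_df, columns=categorical_cols, drop_first=True)
X = X_encoded.values
```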
import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]

# Reshape to one row per (variant, sample), keeping the per-sample AD/DP fields.
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename(columns={"level_1": "sample"})
sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()
sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")
plt.savefig(snakemake.output.freqs)

plt.figure()
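For orientation, a toy illustration of the two-level input table this script expects (sample names in the first header row, per-sample fields such as AD and DP in the second) and of the stack/unstack reshaping above. The sample names, positions, and values are invented:

```python
# Hypothetical toy input: two samples (S1, S2), each with AD/DP under a two-level header.
toy = pd.DataFrame(
    {("VARIANT", "POS"): [101, 202],
     ("S1", "AD"): [3, 0], ("S1", "DP"): [10, 0],
     ("S2", "AD"): [7, 5], ("S2", "DP"): [20, 9]}
)
toy.columns = pd.MultiIndex.from_tuples(toy.columns)

long = toy.loc[:, ["S1", "S2"]].stack([0, 1]).unstack().reset_index(1, drop=False)
long = long.rename(columns={"level_1": "sample"})
print(long)  # one row per (variant, sample) with AD and DP side by side
```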
import pandas as pd

cluster_disease = dict()
df = pd.read_csv('../module/NG_network_module_Q0.3200458.csv')
list1 = df.values.tolist()
for each in list1:
    disease, name, category, cluster = each
    if cluster not in cluster_disease:
        cluster_disease[cluster] = set()
    cluster_disease[cluster].add(disease)
    if disease == 'M07':
        joint_muscular_neur_spine_cluster = cluster
    if disease == 'K80':
        hepatobiliary_cluster = cluster

multimorbidity_pathway = dict()
df = pd.read_table('../overlap/multimorbidity_pathway.txt')
list1 = df.values.tolist()
for each in list1:
    multimorbidity_pathway[(each[0], each[1])] = set(each[4].split(';'))

print('# ------------------ Joint-Muscular-Neurological-Spine -------------- #')
list1 = list()
for each in multimorbidity_pathway:
    if ('E66' in each) and (len(
            set(each) & cluster_disease[joint_muscular_neur_spine_cluster]) != 0):
        list1 += list(multimorbidity_pathway[each])
for each in list1:
    print(each)
def network(request):
    error_message = ""
    jump_div = ""

    # Option 1: List of Ensembl IDs
    if "option1" in request.POST:
        input_query = []
        for element in request.POST['input'].split('\n'):
            element = element.strip()
            if element:
                input_query.append(element)
        input_query = list(set(input_query))

        # max input IDs
        if 2000 > len(input_query) > 1:
            if input_query[0][0:4] == 'ENSG' or input_query[0][0:4] == 'ENST' \
                    or input_query[0][0:4] == 'ENSP':
                job_num = str(random.randrange(500))
                with open(f'{jobs_path}/{job_num}.txt', "wb") as fp:  # Pickling
                    pickle.dump(input_query, fp)
                return redirect(Multi_proteins, job=job_num)

    # Option 2: Upload file
    if "option2" in request.POST and 'gene-count-file' in request.FILES:
        error_message_suffix = ""
        try:
            # --- Check input file for correct format
            # Try to decode as UTF-8, sanitize and parse as a table
            try:
                file_string = escape(
                    request.FILES['gene-count-file'].read().decode('UTF-8'))
                file_buffer = StringIO(file_string)
                # Parse as pandas dataframe
                transcript_count_df = pd.read_table(file_buffer)
            except UnicodeDecodeError:
                error_message_suffix = "could not be parsed as a text file"
                raise RuntimeError
            except ParserError:
                error_message_suffix = "could not be parsed as a table file (CSV or TSV)"
                raise RuntimeError

            # Check input shape
            if transcript_count_df.shape[0] < 2 or transcript_count_df.shape[1] < 2:
                error_message_suffix = "could not be parsed as a table or has fewer than two rows and columns"
                raise RuntimeError

            # Kevin: Zakaria please insert the magic down below :)
            # Zaka: And this is where the magic happens :p

            # Check if the first column corresponds to transcript Ensembl IDs
            if not (str(transcript_count_df.iloc[0, 0]).startswith('ENST')
                    or str(transcript_count_df.iloc[1, 0]).startswith('ENST')):
                error_message_suffix = "must have Ensembl transcript IDs in the first column starting with \"ENST\""
                raise RuntimeError

            # --- Try parsing counts for the different options (search for FPKM, tpm or counts)
            # max_isoforms: the max number of isoforms to consider
            max_isoforms = int(request.POST['transcript-count-max'])
            column_names = transcript_count_df.columns

            # Cufflinks file (or a similar thing)
            if "FPKM" in column_names:
                transcript_count_df = transcript_count_df.sort_values(
                    by=['FPKM'], ascending=False)
                cut_rows = transcript_count_df.iloc[:, 0].unique()[:max_isoforms]
                print('Input matches cufflinks output')
            # Kallisto output counts in tpm
            elif "tpm" in column_names:
                transcript_count_df = transcript_count_df.sort_values(
                    by=['tpm'], ascending=False)
                cut_rows = transcript_count_df.iloc[:, 0].unique()[:max_isoforms]
                print('Input matches kallisto output')
            # Generic count matrix
            elif "counts" in column_names:
                transcript_count_df = transcript_count_df.sort_values(
                    by=['counts'], ascending=False)
                cut_rows = transcript_count_df.iloc[:, 0].unique()[:max_isoforms]
                print('Input with counts column')
            # Could not find the counts column
            else:
                error_message_suffix = ("does not contain a column with the counts. "
                                        "The column must be named either \"FPKM\", \"tpm\" or \"counts\"")
                raise RuntimeError

            # and let DIGGER do the magic ;)
            job_num = str(random.randrange(500))
            with open(f'{jobs_path}/{job_num}.txt', "wb") as fp:
                pickle.dump(cut_rows, fp)  # Pickling

            print(f"Starting network analysis with {len(cut_rows)} rows")
            return redirect(Multi_proteins, job=job_num)

        except RuntimeError:
            print("Could not parse the uploaded file correctly")
            error_message = f"The uploaded file \"{request.FILES['gene-count-file']}\" {error_message_suffix}."
            jump_div = 'option2'

    return render(request, 'setup/network.html', context={
        'error_message': error_message,
        'jump_div': jump_div
    })
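For reference, a hedged sketch of a minimal upload that option 2 above would accept: a table whose first column holds ENST identifiers and which carries a count column named "tpm". The column name "transcript_id", the file name, and the example IDs and values are illustrative only:

```python
# Hypothetical example of an accepted upload for the file-based option:
# first column = Ensembl transcript IDs, plus a "tpm" count column.
import pandas as pd

example = pd.DataFrame({
    "transcript_id": ["ENST00000335137", "ENST00000488147", "ENST00000450305"],
    "tpm": [12.4, 3.1, 0.0],
})
example.to_csv("example_counts.tsv", sep="\t", index=False)
```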
import argparse
import pandas as pd
import sys

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", type=str, required=True, help="Input CCF file")
parser.add_argument("-b", "--barcodes", type=str, required=True, help="List of barcodes (one per line)")
parser.add_argument("-o", "--output", type=str, required=True, help="Output CCF file")
args = parser.parse_args()

if __name__ == "__main__":
    # Read the header-less CCF file and keep only rows whose barcode column
    # (column 5, 0-based) appears in the barcode list.
    ccf = pd.read_table(args.input, header=None)
    with open(args.barcodes, 'r') as f:
        barcodes = set([line.strip() for line in f])
    filtered = ccf[ccf[5].isin(barcodes)]
    filtered.to_csv(args.output, sep='\t', header=False, index=False)
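A hedged usage sketch for the filter script above; the script name, file names, and barcode strings are placeholders, and the barcode column position is whatever column 5 (0-based) holds in the actual CCF files:

```python
# Hypothetical barcode list (one barcode per line), e.g. barcodes.txt:
#   AAACCTGAGAAACCAT-1
#   AAACCTGAGAAACCGC-1
#
# Hypothetical invocation, assuming the script is saved as filter_ccf.py:
#   python filter_ccf.py -i input.ccf -b barcodes.txt -o filtered.ccf
```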