def load_dataframe(self, file_resources):
    # NONCODE source table: transcript ID -> external name/ID
    source_df = dd.read_table(file_resources["NONCODEv5_source"], header=None)
    source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"]

    transcript2gene_df = dd.read_table(
        file_resources["NONCODEv5_Transcript2Gene"], header=None)
    transcript2gene_df.columns = ["NONCODE Transcript ID", "NONCODE Gene ID"]

    self.noncode_func_df = dd.read_table(
        file_resources["NONCODEv5_human.func"], header=None)
    self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"]
    # Dask's set_index does not support inplace=True; reassign instead
    self.noncode_func_df = self.noncode_func_df.set_index("NONCODE Gene ID")

    # Convert to NONCODE transcript ID for the functional annotation data.
    # The gene->transcript lookup is materialized with .compute() so it can
    # be turned into a plain dict for Index.map()
    gene2transcript = transcript2gene_df.compute()
    self.noncode_func_df["NONCODE Transcript ID"] = self.noncode_func_df.index.map(
        pd.Series(gene2transcript["NONCODE Transcript ID"].values,
                  index=gene2transcript["NONCODE Gene ID"]).to_dict())

    # Convert NONCODE transcript ID to gene names
    source_gene_names_df = source_df[source_df["name type"] == "NAME"].compute()
    self.noncode_func_df["Gene Name"] = self.noncode_func_df[
        "NONCODE Transcript ID"].map(
            pd.Series(source_gene_names_df["Gene ID"].values,
                      index=source_gene_names_df["NONCODE Transcript ID"]).to_dict())
def write_probes(probes_summary_filename, probes_dir, n_workers):
    cluster = LocalCluster(n_workers=n_workers,
                           threads_per_worker=1,
                           memory_limit="16GB")
    client = Client(cluster)

    # NOTE: input_fasta_filename and temp_dir were undefined in the original
    # snippet; they are assumed here to refer to the two path arguments.
    input_fasta_filename = probes_summary_filename
    temp_dir = probes_dir

    # Parse the FASTA once instead of three times
    records = list(SeqIO.parse(input_fasta_filename, 'fasta'))
    num_record = len(records)
    pacbio_sequence_list = pd.DataFrame(index=np.arange(num_record),
                                        columns=['SEQID', 'SEQUENCE'])
    pacbio_sequence_list['SEQID'] = [s.id for s in records]
    pacbio_sequence_list['SEQUENCE'] = [s.seq for s in records]

    return_code_list = []
    for i in range(num_record):
        return_code = write_blast_pacbio_sequence(
            pacbio_sequence_list.iloc[i, :], temp_dir)
        return_code_list.append(return_code)
    # Sum the (presumably delayed) return codes; .compute() triggers the runs
    return_code_total = dask.delayed(sum)(return_code_list)
    result = return_code_total.compute()

    probes_blast_results = dd.read_table('{}/*.blast.out'.format(temp_dir),
                                         delim_whitespace=True,
                                         header=None,
                                         dtype={13: str})
    probes_blast_results_filename = os.path.basename(input_fasta_filename)
    probes_blast_results.to_csv('{}/{}.blast.out'.format(
        temp_dir, probes_blast_results_filename),
        index=None,
        header=None,
        sep=' ')
    client.close()
    cluster.close()
    return 0
def load_dataframe(self, file_resources, npartitions=None):
    """
    Args:
        file_resources:
        npartitions:
    """
    go_terms = pd.read_table(
        file_resources["rnacentral_rfam_annotations.tsv"],
        low_memory=True,
        header=None,
        names=["RNAcentral id", "GO terms", "Rfams"])
    # Strip the species suffix from the RNAcentral id
    go_terms["RNAcentral id"] = go_terms["RNAcentral id"].str.split(
        "_", expand=True, n=2)[0]

    gene_ids = []
    for file in file_resources:
        if "database_mappings" in file:
            if npartitions:
                id_mapping = dd.read_table(file_resources[file],
                                           header=None,
                                           names=[
                                               "RNAcentral id", "database",
                                               "external id", "species",
                                               "RNA type", "gene symbol"
                                           ])
            else:
                id_mapping = pd.read_table(file_resources[file],
                                           low_memory=True,
                                           header=None,
                                           names=[
                                               "RNAcentral id", "database",
                                               "external id", "species",
                                               "RNA type", "gene symbol"
                                           ])
            gene_ids.append(id_mapping)

    if npartitions:
        gene_ids = dd.concat(gene_ids, join="inner")
    else:
        gene_ids = pd.concat(gene_ids, join="inner")

    gene_ids["species"] = gene_ids["species"].astype("O")
    if self.species is not None:
        gene_ids = gene_ids[gene_ids["species"] == self.species]

    # Aggregate GO terms and Rfam families per RNAcentral id, then map them
    # back onto the id table
    lnc_go_terms = go_terms[go_terms["RNAcentral id"].isin(
        gene_ids["RNAcentral id"])].groupby("RNAcentral id")["GO terms"].apply(
            lambda x: "|".join(x.unique()))
    lnc_rfams = go_terms[go_terms["RNAcentral id"].isin(
        gene_ids["RNAcentral id"])].groupby("RNAcentral id")["Rfams"].apply(
            lambda x: "|".join(x.unique()))

    gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(lnc_go_terms)
    gene_ids["Rfams"] = gene_ids["RNAcentral id"].map(lnc_rfams)

    gene_ids = gene_ids[gene_ids["GO terms"].notnull()
                        | gene_ids["Rfams"].notnull()]
    return gene_ids
def parse_gtf_dask(filepath_or_buffer, npartitions=None, compression=None, features=None):
    """
    Args:
        filepath_or_buffer (str or buffer object):
        npartitions (int): Number of partitions for the dask dataframe. Default None.
        compression (str): Compression type to be passed into dask.dataframe.read_table(). Default None.
        features (set or None): Drop entries which aren't one of these features
    """
    if features is not None:
        features = set(features)

    def parse_frame(s):
        if s == ".":
            return 0
        else:
            return int(s)

    # GTF columns:
    # 1) seqname: str ("1", "X", "chrX", etc...)
    # 2) source : str
    #    Different versions of GTF use second column as of:
    #    (a) gene biotype
    #    (b) transcript biotype
    #    (c) the annotation source
    #    See: https://www.biostars.org/p/120306/#120321
    # 3) feature : str ("gene", "transcript", &c)
    # 4) start : int
    # 5) end : int
    # 6) score : float or "."
    # 7) strand : "+", "-", or "."
    # 8) frame : 0, 1, 2 or "."
    # 9) attribute : key-value pairs separated by semicolons
    # (see more complete description in docstring at top of file)

    # Uses Dask
    logging.debug("dask.dataframe.read_table, file={}, compression={}".format(
        filepath_or_buffer, compression))
    dataframe = dd.read_table(
        filepath_or_buffer,
        sep="\t",
        compression=compression,
        blocksize=None,
        comment="#",
        names=REQUIRED_COLUMNS,
        skipinitialspace=True,
        skip_blank_lines=True,
        error_bad_lines=True,
        warn_bad_lines=True,
        # chunksize=chunksize,
        engine="c",
        dtype={
            "start": np.int64,
            "end": np.int64,
            "score": np.float32,
            "seqname": str,
        },
        na_values=".",
        converters={"frame": parse_frame})
    return dataframe
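# A minimal usage sketch for parse_gtf_dask, assuming REQUIRED_COLUMNS holds
# the standard nine GTF column names elsewhere in this module and that
# "annotation.gtf.gz" is a hypothetical input path. The read is lazy; nothing
# is loaded until .compute().
ddf = parse_gtf_dask("annotation.gtf.gz", compression="gzip")
genes = ddf[ddf["feature"] == "gene"]
print(genes.groupby("seqname").size().compute())  # genes per chromosome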
def handle_file(filepath: str):
    df = dd.read_table(filepath, sep='\x01')
    # The statistics below were computed by code elided from the original
    # snippet; these are hedged reconstructions. The column names
    # ('tweet_id', 'engaging_user_id', 'engaged_with_user_id') are assumptions
    # about the schema, not confirmed by the source.
    num_lines = len(df)
    usual_length = len(df.columns)  # assumed: typical number of \x01-separated fields
    tweet_id_set = set(df['tweet_id'].unique().compute())
    engaging_user_id_set = set(df['engaging_user_id'].unique().compute())
    engaged_with_user_id_set = set(df['engaged_with_user_id'].unique().compute())
    print(f'Usual length: {usual_length}')
    print(f'Num lines: {num_lines}')
    print(f'Num tweet ids: {len(tweet_id_set)}')
    print(f'Num engaging users: {len(engaging_user_id_set)}')
    print(f'Num engaged users: {len(engaged_with_user_id_set)}')
def load_dataframe(self, file_resources, npartitions=None):
    """
    Args:
        file_resources:
        npartitions:
    """
    if npartitions:
        df = dd.read_table(file_resources["proteinatlas.tsv"])
    else:
        df = pd.read_table(file_resources["proteinatlas.tsv"])
    return df
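# A minimal usage sketch, assuming `self` belongs to a dataset class
# (hypothetical name ProteinAtlas) and a hypothetical local path. Note that
# npartitions is only used as a flag here: truthy selects the lazy dask
# reader, otherwise pandas reads eagerly.
atlas = ProteinAtlas()  # hypothetical constructor
df = atlas.load_dataframe({"proteinatlas.tsv": "data/proteinatlas.tsv"},
                          npartitions=4)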
def read_data_with_cond(data_file_str, reduce_memory=False, cond_and_str=None,
                        output_path_pre=None, sep='\t'):
    import re
    print('----------------begin------------------')
    # Try dask first; fall back to pandas if the dask read fails
    try:
        if sep == '\t':
            data = dd.read_table(data_file_str, low_memory=False,
                                 dtype={'uid': 'object'}).compute()
        if sep == ',':
            data = dd.read_csv(data_file_str, low_memory=False,
                               dtype={'uid': 'object'}).compute()
    except Exception:
        if sep == '\t':
            data = pd.read_table(data_file_str, low_memory=False,
                                 dtype={'uid': 'object'})
        if sep == ',':
            data = pd.read_csv(data_file_str, low_memory=False,
                               dtype={'uid': 'object'})
    print('--initial')
    print(data.info())
    if reduce_memory:
        print('--reduce_memory')
        data = reduce_data_memory(data)
        print(data.info())
    if cond_and_str:
        print('--cond')
        cnt = 1
        # cond_and_str is a comma-separated list of filters, each one of
        # "col>=n", "col==n", or "col.isnull"
        for cond in cond_and_str.split(','):
            pattern = re.compile(r'^.*>=.*$')
            if pattern.match(cond):
                f, n = cond.split('>=')[0], int(cond.split('>=')[1])
                data = data[data[f] >= n]
                print('shape of data after cond', cnt, ':', data.shape)
            pattern = re.compile(r'^.*==.*$')
            if pattern.match(cond):
                f, n = cond.split('==')[0], int(cond.split('==')[1])
                data = data[data[f] == n]
                print('shape of data after cond', cnt, ':', data.shape)
            pattern = re.compile(r'^.*isnull.*$')
            if pattern.match(cond):
                f = cond.split('.')[0]
                data = data[data[f].isnull()]
                print('shape of data after cond', cnt, ':', data.shape)
            cnt += 1
        print(data.info())
    print('------------conclusion---------------')
    print('shape of dataset:', data.shape)
    print('-------------outputs------------------')
    if output_path_pre:
        columns = pd.DataFrame(data.dtypes)
        columns = columns.reset_index()
        columns.columns = ['feature_name', 'dtypes']
        columns.to_csv(output_path_pre + 'columns.csv', index=False, header=True)
        print('column names and dtypes have been downloaded to ',
              output_path_pre + 'columns.csv')
    return data
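# A usage sketch for the cond_and_str filter mini-language, assuming a
# hypothetical TSV 'events.tsv' with columns 'clicks', 'status' and 'ref'.
# Conditions are comma-separated and applied in order:
data = read_data_with_cond('events.tsv',
                           cond_and_str='clicks>=10,status==1,ref.isnull',
                           output_path_pre='out/')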
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table_txt_gz')
    parser.add_argument('--size_cutoff', default=3.8e9)
    args = parser.parse_args()

    # 50 MB block size; note dask cannot split gzip-compressed files into
    # blocks, so blocksize only takes effect on uncompressed input
    df = dd.read_table(args.table_txt_gz,
                       blocksize=50000000,
                       sep='\t',
                       header=None,
                       encoding='latin-1')
    base, ext = os.path.splitext(args.table_txt_gz)
    # One output file per partition via the '*' placeholder
    df.to_csv(base + '_part*.txt', header=False, index=False, sep='\t')
    return
def load_file_in_staging(file_path, table_name):
    """Load content of file inside *_stg tables."""
    tmp_df = dd.read_table(file_path, header=0, sep=';', dtype='str',
                           encoding='latin-1')
    logger.info('Uploading {} into {}.'.format(file_path, table_name))
    conn = f'sqlite:///{db_staging_file}'
    tmp_df.to_sql(table_name, conn, index=False, if_exists='replace',
                  chunksize=200000)
    # .shape[0] on a dask DataFrame is lazy; len() computes the row count
    logger.info('Imported {} rows.'.format(len(tmp_df)))
    del tmp_df
    return
def get_gc(label_file, genome_file):
    """Calculates gc content for all viable entries in an input dataframe.

    Arguments:
        label_file {dataframe} -- Dataframe containing genomic regions labeled
            as positive (1) or negative (0)
        genome_file {str} -- Path to a reference genome in FASTA format

    Returns:
        [dataframe] -- Dataframe containing viable entries and their
            respective gc content
    """
    bed_df = pybedtools.BedTool(label_file)
    # nuc() annotates each interval with its nucleotide content
    bed_gc = bed_df.nuc(genome_file)
    gc_df = dd.read_table(bed_gc.fn)
    # Keep the original user columns plus the GC and N-count columns
    gc_df = gc_df.loc[:, gc_df.columns.str.contains("usercol|gc|num_N")]
    colnames = generate_colnames(gc_df)
    gc_df.columns = colnames
    # Drop entries overlapping N bases, then express GC as a percentage
    gc_df = gc_df.loc[gc_df.num_N == 0].drop("num_N", axis=1)
    gc_df["gc"] = gc_df["gc"].astype("float32") * 100
    return gc_df
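# A minimal usage sketch for get_gc, assuming BED-format labels and an hg19
# FASTA on disk (both paths hypothetical)
gc = get_gc("labels.bed", "hg19.fa")
print(gc.head())  # the dask frame evaluates lazily; head() computes a sample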
def dask_read(option, file_path):
    # Python map for file type pattern
    file_type = {
        'parquet': file_path + '/*.parquet',
        'csv': file_path + '/*.csv',
        'json': file_path + '/*.json',
        'text': file_path + '/*.txt'
    }
    # Define reader type by pattern mapping
    file_pattern = file_type[option]
    # Map to reader callables rather than calling every reader eagerly;
    # the original dict invoked all four readers on construction, which
    # fails whenever any of the other file patterns has no matches
    dask_reader = {
        'parquet': lambda p: dask_df.read_parquet(p, engine='pyarrow'),
        'csv': dask_df.read_csv,
        'json': dask_df.read_json,
        'text': dask_df.read_table
    }
    return dask_reader[option](file_pattern)
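# Usage sketch: lazily read every CSV under a hypothetical directory
ddf = dask_read('csv', '/data/exports')
print(ddf.head())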
def read_sample(SAMPLE_FOLDER):
    '''
    read blast output
    '''
    col_names = [
        'qseqid', 'qlen', 'sseqid', 'slen', 'pident', 'length', 'mismatch',
        'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue'
    ]
    tRF_tab = SAMPLE_FOLDER + '/blast.tRF.tsv'
    samplename = os.path.basename(SAMPLE_FOLDER)
    print('Reading %s' % samplename)
    # Keep full-length hits only, then the single best hit per query
    tRF_df = dd.read_table(tRF_tab, names=col_names) \
        .repartition(npartitions=THREADS) \
        .query('slen == qlen') \
        .groupby('qseqid') \
        .apply(lambda d: d.nlargest(1, 'pident')) \
        .compute(workers=THREADS, scheduler='threads') \
        .reset_index(drop=True) \
        .assign(samplename=samplename)
    return tRF_df
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")
    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
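# Usage sketch: the config maps exactly one file_type key to reader options;
# every key except 'path' is forwarded to the matching dask reader (paths and
# options here are hypothetical)
config = {"table": {"path": "data/measurements.tsv", "sep": "\t"}}
ddf = dataframe_loader(None, config)  # _context is unused by this loader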
client = Client(processes=False,
                threads_per_worker=4,
                n_workers=4,
                memory_limit='2GB')


# cmu list split helper function
def cmu_list_split_helper(x, rhyme_length=3):
    splitted = x.split()
    return (splitted[0], splitted[-rhyme_length:])


# glove stuff
glove_data_file = "../data/glove/glove.6B.100d_1000lines.txt"
gloves = dd.read_table(glove_data_file,
                       sep=" ",
                       header=None,
                       quoting=csv.QUOTE_NONE)
gloves = gloves.set_index(0)
print(gloves.compute().head())

# cmu stuff
cmu_data_file = "../data/cmu/cmudict_1000lines.dict"
phone_seqs = db.read_text(cmu_data_file)
phone_seqs = phone_seqs.map(cmu_list_split_helper)
phone_seqs = phone_seqs.to_dataframe()
phone_seqs = phone_seqs.set_index(0)


# preprocess gloves and cmus together
def preprocess_df(df):
    return df
# Count (target, context) pair frequencies, chunk by chunk
from dask.distributed import Client
client = Client(n_workers=5,
                threads_per_worker=2,
                processes=False,
                memory_limit='2GB')

import dask.dataframe as dd
pair_data = dd.read_table(
    '/home/srawat/Documents/UMBC+Wiki/dsm_files/tuples.txt',
    header=None,
    names=['target', 'context'])
# NOTE: the original read inferred the first data row as the header (hence
# the odd rename(columns={2822: 'target', 80: 'context'}), whose result was
# also discarded); reading with header=None and explicit names avoids both.

# Tuple to dict
import pickle
from tqdm import tqdm

count_dict = dict()
with open('/home/srawat/Documents/UMBC+Wiki/dsm_files/tuples.txt', 'r') as g:
    for line in tqdm(g):
        target, context = line.split('\t')
        target = int(target)
        context = int(context)
        if (target, context) in count_dict:
            count_dict[(target, context)] += 1
        else:
            count_dict[(target, context)] = 1
    else:
        return remove_duplicated_white_space(raw_line[16:]).split(" ")[1].replace(":", "")


def data_raw_parser(*arg):
    print(*arg)
    if len(arg) == 2:
        return logtype_parser(arg[0], arg[1])
    elif len(arg) == 3:
        return tm_parser(arg[0], arg[1], arg[2])
    elif len(arg) > 3:
        return env_parser(arg)


row_index = [str(i) for i in range(22)]
parse_dates = {'logtype': [6, 17],
               'tm': [0, 1, 6],
               'env': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]}

s = time.time()
df = dd.read_table(
    "/data/datalake/fitslake/datalab_backup/vltlogs/raw_logs/2016/10/wcnnaco.2016-10-01.log",
    delim_whitespace=True,
    names=row_index,
    encoding='latin-1',
    skiprows=0)
df.compute()  # materialize once, for timing
#logtype = df.apply(logtype_parser, axis=1)
#tm = df.apply(tm_parser, axis=1)
#env = df.apply(env_parser, axis=1)
#logtype.compute()
#print(df)
#logtype.to_csv("/data/datalake/fitslake/datalab_backup/vltlogs/raw_logs/2016/10/prueba-*.csv")
print(time.time() - s)
#print(logtype.compute()[0:5])
#print(tm.compute()[0:5])
#print(env.compute()[10:20])
#print(df.compute()['6'])
#print(time.time()-s)
def run_pipeline(task_type: str) -> bool:
    task_map: Dict = task_type_map[task_type]  # renamed from `map` to avoid shadowing the builtin
    in_bucket: str = task_map['in']
    out_bucket: str = task_map['out']
    cols: Dict[str, str] = task_map['cols']
    converters: Dict[str, Callable] = task_map['converters']
    dtypes: Dict[str, str] = task_map['dtypes']
    index_col: str = task_map['index']['col']
    is_sorted: bool = task_map['index']['sorted']  # renamed from `sorted`
    row_op: Callable = task_map['row_op']
    diff: Dict = task_map['diff']
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    aggr_func: Callable = task_map['aggr_func']

    try:
        #client = Client(address='dscheduler:8786')
        s3_in_url: str = 's3://' + in_bucket + '/*.*'
        s3_options: Dict = ps.fetch_s3_options()
        #df = dd.read_table(path=s3_in_url, storage_options=s3_options)
        df = dd.read_table(urlpath='tmp/' + in_bucket + '/*.*',
                           header=0,
                           usecols=lambda x: x.upper() in list(cols.keys()),
                           skipinitialspace=True,
                           converters=converters)

        # rename columns
        df = df.rename(columns=cols)

        # map row-wise operations per partition (kept lazy; the original
        # called .compute() here, pulling everything into pandas)
        if is_sorted:
            df = df.map_partitions(
                lambda pdf: pdf.apply(func=row_op, axis=1), meta=dtypes)
        else:
            df = df.map_partitions(
                lambda pdf: pdf.set_index(index_col).sort_index().reset_index()
                .apply(func=row_op, axis=1), meta=dtypes)

        # diff
        if diff['compute']:
            df[diff['new_col']] = df[diff['col']].diff()

        # specific processing for transit
        if task_type == 'cl-transit':
            df = df.map_partitions(partial(remove_outliers, col='DELEXITS'),
                                   meta=dtypes)

        # drop na values
        df = df.dropna()

        # set index (assumes pre-sorted data)
        df = df.set_index(index_col, sorted=True)

        # filter; assumes the new index is a DatetimeIndex (the original
        # filtered on a column that no longer exists after set_index)
        if filter_by_key == 'weekday':
            df = df.map_partitions(
                lambda pdf: pdf[pdf.index.weekday == filter_by_val])

        # resample using frequency and aggregate function specified; the
        # original composed these without calling, which is assumed to mean
        # "apply the aggregate to the resampled frame"
        df = aggr_func(df.resample(resample_freq))

        # save in out bucket; to_parquet writes into a directory
        s3_out_url: str = 's3://' + out_bucket
        # dd.to_parquet(df=df, path=s3_out_url, storage_options=s3_options)
        dd.to_parquet(df=df, path='tmp/' + out_bucket)

    except Exception as err:
        print('error in run_pipeline %s' % str(err))
        raise err

    return True
colnames.drop(columns="to_drop", inplace=True) new_accession_list = list( colnames[-colnames["Run_accession"].isin(polish_samples)] ["Run_accession"]) leave_out = new_accession_list client = Client() for p in partitions: print("Partition " + p + " started being analyzed in client:") print(client) begin_time = time.time() #Read partition partition=dd.read_table("/pasteur/sonic/scratch/public/cduitama/RascovanProject/kmMatrices/combination/large_dataset_no_polish/matrices/matrix_"\ +p+".txt",header=None,sep=" ",names=["Kmer"]+list(colnames["Run_accession"])) #Drop K-mer column partition_array = partition.drop(["Kmer"], axis=1) #convert from dask array to np array partition_array = partition_array.values #Binarization transformer = Binarizer().fit(partition_array) # fit does nothing. partition_b = transformer.transform(partition_array) print("Finished reading and binarizing partition " + str(p)) for sink in leave_out:
# define arguments
parser = argparse.ArgumentParser(
    description='sum the number of variants per gene in an individual')
parser.add_argument('-v', '--variants', dest='variants', help='variant table')
parser.add_argument('-g', '--genes', dest='genes', help='genes of interest')
parser.add_argument('-o', '--out', dest='out', help='output file name')
args = parser.parse_args()

# if this isn't working for you, see https://distributed.dask.org/en/latest/setup.html
# or talk to your friendly IT professional
client = Client()
client.restart()

# read in table of variants
variants = ddf.read_table(args.variants, blocksize=50e6)  # 50 MB blocks
#print(variants.head())
print('variants read in')

# define list of genes of interest
with open(args.genes) as g:
    genes_of_interest = g.read().splitlines()

# make list of EUR proband IDs
# I know nothing about this size, but consider passing in an index, which will make operations later on faster
# if it makes sense for what you're trying to do
# e.g. I imagine indexing by gene might be useful
# see http://docs.dask.org/en/latest/dataframe-performance.html#use-the-index
master = ddf.read_table(
    "/scratch/ucgd/lustre/work/u0806040/data/15_Jan_19_Simons_master_ancestry_corrected_PRS.txt",
    blocksize=50e6)  # blocksize value truncated in the original; 50 MB assumed to match the read above
#Load colnames
colnames = pd.read_csv("kmMatrices/combination/large_dataset/combination_fof.txt",
                       sep=" : ",
                       header=None,
                       names=["Run_accession", "to_drop"],
                       engine="python")
colnames.drop(columns="to_drop", inplace=True)
leave_out = list(colnames["Run_accession"])
for p in partitions:
    print("Partition " + p + " started being analyzed in client:")
    print(client)
    begin_time = datetime.datetime.now()
    #Read partition
    partition = dd.read_table(
        "kmMatrices/combination/large_dataset/storage/matrix/partition_" + p
        + "/ascii_matrix" + p + ".mat",
        skiprows=8,
        header=None,
        sep=" ",
        names=["Kmer"] + list(colnames["Run_accession"]))
    #Drop K-mer column
    partition_array = partition.drop("Kmer", axis=1)
    #.values yields a (lazy) dask array
    partition_array = partition_array.values
    #Binarization
    transformer = Binarizer().fit(partition_array)  # fit does nothing.
    partition_b = transformer.transform(partition_array)
    print("Finished reading and binarizing partition " + str(p))
    for i in leave_out:
        sinks = [i]
import glob
import numpy as np
import pandas as pd
import os
import dask
import dask.dataframe as dd

repeats = dd.read_csv("repeats_hg19.csv")
anno = dd.read_table("RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.CATGAC.dan.anno")
# Join annotations with repeats on chromosome
df1 = dd.merge(anno, repeats, on="chr", how="outer", suffixes=("", "_repeat"))
df1.to_csv("find_repeatsTESToutput.csv", index=False)
# Keep rows whose annotation start falls inside the repeat interval.
# NOTE: the original filtered on `row.chr` (an undefined name left over from
# a row-wise version) and compared columns of the pre-merge frames; after the
# merge the comparison has to use the merged frame's own columns.
df1 = df1[(df1.start >= df1.begin) & (df1.start <= df1.end)]
df1 = dd.merge(anno, df1, on=["chr"])
# to_csv(compute=False) returns delayed writes; run them with 20 workers
# (the original chained .compute() onto to_csv's return value, which is not valid)
writes = df1.to_csv("find_repeatsTEST2.csv", index=False, compute=False)
dask.compute(*writes, num_workers=20)
def parse_gtf(filepath_or_buffer,
              npartitions=None,
              chunksize=1024 * 1024,
              features=None,
              intern_columns=["seqname", "source", "strand", "frame"],
              fix_quotes_columns=["attribute"]):
    """
    Args:
        filepath_or_buffer (str or buffer object):
        npartitions:
        chunksize (int):
        features (set or None): Drop entries which aren't one of these features
        intern_columns (list): These columns are short strings which should be interned
        fix_quotes_columns (list): Most commonly the 'attribute' column which had
            broken quotes on some Ensembl release GTF files.
    """
    if features is not None:
        features = set(features)

    def parse_frame(s):
        if s == ".":
            return 0
        else:
            return int(s)

    # GTF columns:
    # 1) seqname: str ("1", "X", "chrX", etc...)
    # 2) source : str
    #    Different versions of GTF use second column as of:
    #    (a) gene biotype
    #    (b) transcript biotype
    #    (c) the annotation source
    #    See: https://www.biostars.org/p/120306/#120321
    # 3) feature : str ("gene", "transcript", &c)
    # 4) start : int
    # 5) end : int
    # 6) score : float or "."
    # 7) strand : "+", "-", or "."
    # 8) frame : 0, 1, 2 or "."
    # 9) attribute : key-value pairs separated by semicolons
    # (see more complete description in docstring at top of file)

    read_kwargs = dict(sep="\t",
                       comment="#",
                       names=REQUIRED_COLUMNS,
                       skipinitialspace=True,
                       skip_blank_lines=True,
                       error_bad_lines=True,
                       warn_bad_lines=True,
                       engine="c",
                       dtype={
                           "start": np.int64,
                           "end": np.int64,
                           "score": np.float32,
                           "seqname": str,
                       },
                       na_values=".",
                       converters={"frame": parse_frame})

    def clean_dataframe(df):
        for intern_column in intern_columns:
            df[intern_column] = [intern(str(s)) for s in df[intern_column]]
        # compare feature strings after interning
        if features is not None:
            df = df[df["feature"].isin(features)]
        for fix_quotes_column in fix_quotes_columns:
            # Catch mistaken semicolons by replacing "xyz;" with "xyz"
            # Required to do this since the Ensembl GTF for Ensembl
            # release 78 has mistakes such as:
            # gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
            df[fix_quotes_column] = [
                s.replace(';\"', '\"').replace(";-", "-")
                for s in df[fix_quotes_column]
            ]
        return df

    if npartitions:
        logging.info(filepath_or_buffer)
        ddf = dd.read_table(filepath_or_buffer, **read_kwargs)
        # Iterating a dask DataFrame yields column names, not chunks (the
        # original loop silently did the wrong thing), so the cleanup is
        # applied lazily to each partition instead
        return ddf.map_partitions(clean_dataframe)

    chunk_iterator = pd.read_csv(filepath_or_buffer,
                                 chunksize=chunksize,
                                 **read_kwargs)
    dataframes = []
    try:
        for df in chunk_iterator:
            dataframes.append(clean_dataframe(df))
    except Exception as e:
        raise Exception("ParsingError:" + str(e))
    return pd.concat(dataframes)
parser.add_argument('-o', '--out', dest='out', help='output file name')
args = parser.parse_args()

# define list of genes of interest
with open(args.genes) as g:
    genes_of_interest = g.read().splitlines()

# make list of EUR proband IDs
master = pandas.read_table(
    "/scratch/ucgd/lustre/work/u0806040/data/15_Jan_19_Simons_master_ancestry_corrected_PRS.txt",
    dtype={'other_dx_axis_i': 'object',
           'other_dx_axis_ii': 'object',
           'other_dx_icd': 'object'})
probands = master.loc[master['family_member'] == 'p1']
eur_probands = probands.loc[probands['ancestry.prediction'] == 'EUR']
proband_ids = eur_probands['IID']

# read in table of variants
print('reading in variants')
variants = ddf.read_table(args.variants)
#print(variants.head())

# filter variants
print('setting up variant filters')
# medium and high impact
variants1 = variants[variants.impact.isin(['MED', 'HIGH'])]
# in genes of interest
variants2 = variants1[variants1.gene.isin(genes_of_interest)]

# convert back to pandas now that the data frame is small
print('computing and returning pandas data frame')
voi = variants2.compute()

# reorganize data frame so that rows are genes of interest, columns are IIDs
# and values are counts of variants
sources = list(colnames["Run_accession"])
classes = sorted(list(set(metadata["True_label"])))

#Sort metadata according to column order in matrix DataFrame
sorted_metadata = pd.DataFrame(columns=metadata.columns)
for j in sources:
    sorted_metadata = pd.concat(
        [sorted_metadata, metadata[metadata["Run_accession"] == j]])
sorted_metadata.reset_index(drop=True, inplace=True)

#Build result dataframe
result = pd.DataFrame(columns=classes + ["Unknown", "Running time", "Sink"])

#Load k-mer matrix of sources as dataframe
partition = dd.read_table("matrix_100.pa.txt",
                          header=None,
                          sep=" ",
                          names=["Kmer"] + list(colnames["Run_accession"]))

#Drop K-mer column
partition_array = partition.drop(["Kmer"], axis=1)

#Define M' matrix of sources
M_prime = partition_array.values
M_prime.compute_chunk_sizes()
#M_prime=M_prime.persist()
print("Chunk sizes for the M_prime matrix were computed")

#Create new vector for sink
s_t = dd.read_table(path_sink, header=None, names=["pa"])
s_t = s_t["pa"]
s_t = s_t.values