def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    tanimoto = args.tanimoto
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as inf2:
        inkey = generate_index_list(inf2)
    print('\nOk, processing input file...\n')
    with open(args.input, 'r') as in_csv2:
        headers = generate_chunk_list(in_csv2)
    c_list = list(cluster_map.keys())
    grabbed_clusters = []
    data_to_pool = []
    for cluster in c_list:
        # uses the name of the cluster to get a list of all ORFs for a particular unique cluster
        grab = pick_a_cluster(headers, cluster)
        if grab:
            grabbed_clusters.extend([cluster])
            with open(args.input, 'r') as inf3:
                # loads in only the columns from the grab list, i.e. all cols for a unique cluster
                mx = pd.read_csv(inf3, sep=',', header=0, usecols=grab, engine='c')
            mx.index = inkey  # reindexes the df with the ORF labels after importing with usecols
            data_to_pool.append(mx)
    dlen = len(data_to_pool)
    print('Built the data list of %s clusters' % dlen)
    # organizes all the arguments that the parallelized function needs into a list
    args_list = [cluster_map, c_list]
    print('\nSending data to Workers... work, Workers, work!\n')
    if __name__ == '__main__':
        if tanimoto:
            results = list(futures.map(partial(parallel_tanimoto, args_list=args_list),
                                       data_to_pool))
        else:
            results = list(futures.map(partial(parallel_minicluster, args_list=args_list),
                                       data_to_pool))
        outdf = pd.concat(results, axis=1)
    print('File processing complete; writing output file...\n')
    del data_to_pool
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        # names the columns (and index, next line) according to clusters in the order processed
        outdf.columns = grabbed_clusters
        outdf.index = c_list
        outdf.sort_index(axis=0, inplace=True)
        outdf.sort_index(axis=1, inplace=True)
        outdf = outdf.round(decimals=3)
        outdf.to_csv(outf)
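# NOTE: pick_a_cluster, generate_index_list, generate_chunk_list, build_cluster_map,
# parallel_tanimoto, and parallel_minicluster are defined elsewhere in this repo.
# As a minimal, hypothetical sketch of pick_a_cluster (an assumption, not the repo's
# actual code), assuming each ORF column header embeds the name of its cluster:
def pick_a_cluster(headers, cluster):
    """Return the subset of column headers (ORFs) whose names contain `cluster`."""
    return [h for h in headers if cluster in h]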
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    cpus = args.cpus
    num_cpus = cpu_count()
    tanimoto = args.tanimoto
    if cpus > num_cpus:
        print('\nError: Number of requested processors exceeds hardware available!')
        print('Maximum processors available is %s.\n' % num_cpus)
        sys.exit()
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as inf2:
        inkey = generate_index_list(inf2)
    with open(args.input, 'r') as in_csv2:
        headers = generate_chunk_list(in_csv2)
    c_list = list(cluster_map.keys())
    ct = len(c_list)
    print('Found %d clusters...' % ct)
    results_list = []
    grabbed_clusters = []
    j = 0
    for cluster in c_list:
        # uses the name of the cluster to get a list of all ORFs for a particular unique cluster
        grab = pick_a_cluster(headers, cluster)
        if grab:
            grabbed_clusters.extend([cluster])
            with open(args.input, 'r') as inf3:
                bigmat = big_cluster_completeness(inf3, grab, inkey, cluster, cluster_map,
                                                  cpus, tanimoto, c_list, j)
            results_list.append(bigmat)  # a list of dataframes, one for each cluster column
        j += 1
    print('File processing complete; writing output file...\n')
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        outdf = pd.concat(results_list, axis=1)
        # names the columns (and index, next line) according to clusters in the order processed
        outdf.columns = grabbed_clusters
        outdf.index = c_list
        outdf.sort_index(axis=0, inplace=True)
        outdf.sort_index(axis=1, inplace=True)
        outdf = outdf.round(decimals=3)
        outdf.to_csv(outf)
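# Hypothetical sketches of the two CSV-scanning helpers used above (the real versions
# live elsewhere in this repo). The assumption is that generate_index_list collects the
# row labels (first column, header skipped) and generate_chunk_list returns the header
# row minus the leading index column.
import csv


def generate_index_list(in_csv):
    reader = csv.reader(in_csv)
    next(reader)  # skip the header row
    return [row[0] for row in reader]


def generate_chunk_list(in_csv):
    reader = csv.reader(in_csv)
    return next(reader)[1:]  # column headers, without the index column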
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        print('\nOk, processing input file in pieces...\n')
        inkey = generate_index_list(in_csv)
    with open(args.input, 'r') as in_csv2:
        headers = generate_chunk_list(in_csv2)
    c_list = list(cluster_map.keys())
    data_to_pool = []
    grabbed_clusters = []
    for cluster in c_list:
        # uses the name of the cluster to get a list of all ORFs for a particular unique cluster
        grab = pick_a_cluster(headers, cluster)
        if grab:
            grabbed_clusters.extend([cluster])
            with open(args.input, 'r') as inf3:
                # loads in only the columns from the grab list, i.e. all cols for a unique cluster
                mx = pd.read_csv(inf3, sep=',', header=0, usecols=grab, engine='c')
            mx.index = inkey  # reindexes the df with the ORF labels after importing with usecols
            data_to_pool.append(mx)  # the list of dfs to map over for multiprocessing
    if __name__ == '__main__':
        print('\nSending data to Workers... work, Workers, work!')
        results = list(futures.map(partial(parallel_clustermean, c_list=c_list), data_to_pool))
        print('\nFile processing complete; writing output file...\n')
        del data_to_pool
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            outdf = pd.concat(results, axis=1)
            # names the columns according to clusters in the order they were processed
            outdf.columns = grabbed_clusters
            # ensure that the clusters are in order on cols and rows
            outdf.sort_index(axis=0, inplace=True)
            outdf.sort_index(axis=1, inplace=True)
            outdf.to_csv(outf)
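# The futures.map calls above follow the SCOOP pattern (from scoop import futures), so
# work is only distributed when the script is launched through SCOOP's runner. A usage
# sketch (the script name and flags are placeholders; the real flags come from
# make_arg_parser() elsewhere in this repo):
#
#   python -m scoop -n 8 cluster_script.py -i matrix.csv -m clusters.mpfa -o out.csv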
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    if args.synthesize:
        final_df = synthesize_chunks()
        with open(args.output, 'w') as outf:
            final_df.to_csv(outf)
        print('\nMerged data written to file... exiting...\n')
        sys.exit()
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        print('\nOk, processing input file...\n')
        big_df = pd.read_csv(in_csv, sep=',', header=0, index_col=0, engine='c')
    inkey = list(big_df.index)
    c_list = list(cluster_map.keys())
    ct = len(c_list)
    n = int(args.cutsize)
    print('Found %d clusters... Making groups of %d clusters...' % (ct, n))
    # Make a list of lists of clusters, to guide the breaking up of the csv
    bcl = [c_list[i:i + n] for i in range(0, len(c_list), n)]
    print('\nMaster list generated... now doing the splits!')
    p = 1
    for c in bcl:
        grab_chunk = []
        for cluster in list(c):
            # uses the name of the cluster to get a list of all ORFs for a particular unique cluster
            grab = pick_a_cluster(inkey, cluster)
            grab_chunk.extend(grab)
        chunk_df = big_df[grab_chunk]
        outf = args.output
        if outf.endswith('.csv'):
            outf = outf.replace('.csv', '')  # strip the extension before numbering the chunk
        outf = '_'.join([outf, str(p), '.csv'])
        chunk_df.to_csv(outf)
        print('\nSaved matrix chunk %d...' % p)
        del chunk_df
        p += 1
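# synthesize_chunks() is defined elsewhere in this repo; a minimal, hypothetical sketch
# consistent with the chunk files written above (column subsets of one matrix sharing a
# row index) would glob the chunk CSVs and join them column-wise. The glob pattern and
# the optional argument are assumptions, not the repo's actual signature.
import glob

import pandas as pd


def synthesize_chunks(pattern='*_*_.csv'):
    chunk_files = sorted(glob.glob(pattern))
    chunks = [pd.read_csv(f, header=0, index_col=0) for f in chunk_files]
    return pd.concat(chunks, axis=1)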
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    if args.synthesize:
        final_df = synthesize_chunks()
        with open(args.output, 'w') as outf:
            final_df.to_csv(outf)
        print('\nMerged data written to file... exiting...\n')
        sys.exit()
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        print('\nOk, processing input file in pieces...\n')
        inkey = generate_index_list(in_csv)
    c_list = list(cluster_map.keys())
    ct = len(c_list)
    n = int(args.cuts)
    print('Found %d clusters... Making %d cuts...' % (ct, n))
    bcl = [c_list[i:i + n] for i in range(0, len(c_list), n)]
    print('\nMaster list generated... now doing the splits!')
    p = 1
    for c in bcl:
        grab_chunk = []
        for cluster in list(c):
            # uses the name of the cluster to get a list of all ORFs for a particular unique cluster
            grab = pick_a_cluster(inkey, cluster)
            grab_chunk.extend(grab)
        with open(args.input, 'r') as inf3:
            # loads in only the columns from the grab list, i.e. all cols for this group of clusters
            mx = pd.read_csv(inf3, sep=',', header=0, usecols=grab_chunk, engine='c')
        mx.index = inkey  # reindexes the df with the ORF labels after importing with usecols
        outf = args.output
        if outf.endswith('.csv'):
            outf = outf.replace('.csv', '')  # strip the extension before numbering the chunk
        outf = '_'.join([outf, str(p), '.csv'])
        mx.to_csv(outf)
        print('\nSaved a matrix chunk...')
        p += 1
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    with open(args.mpfa, 'r') if args.mpfa != '-' else sys.stdin as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    intype = str(args.input).split('.')[-1]
    insize = os.stat(args.input).st_size
    with open(args.input, 'r') as inf2:
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            if not args.pieces:
                outdf = cluster_completeness(intype, cluster_map, inf2)
                outdf = outdf.round(decimals=3)
                outdf.to_csv(outf)
            else:
                print('\nOk, processing input file in pieces...\n')
                inkey = generate_index_list(inf2)
                c_list = list(cluster_map.keys())
                ct = len(c_list)
                print('Found %d clusters...' % ct)
                # initializes an array of the dimensions necessary to fit all cluster results
                mat = np.zeros((ct, ct))
                j = 0
                for cluster in c_list:
                    grab = pick_a_cluster(inkey, cluster)
                    with open(args.input, 'r') as inf3:
                        mat = big_cluster_completeness(grab, inkey, cluster, cluster_map,
                                                       c_list, inf3, mat, j)
                    j += 1
                print('File processing complete; writing output file...\n')
                outdf = pd.DataFrame(mat, dtype=float)
                # names the columns (and index, next line) according to clusters in the order processed
                outdf.columns = c_list
                outdf.index = c_list
                outdf.sort_index(axis=0, inplace=True)
                outdf.sort_index(axis=1, inplace=True)
                outdf = outdf.round(decimals=3)
                outdf.to_csv(outf)
def main():
    parser = make_arg_parser()
    args = parser.parse_args()  # Parse command line
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
            print('\nOk, processing input file in pieces...\n')
            inkey = generate_index_list(in_csv)
            c_list = list(cluster_map.keys())
            ct = len(c_list)
            print('Found %d clusters...' % ct)
            results_list = []
            j = 0
            for cluster in c_list:
                # uses the name of the cluster to get a list of all ORFs for a particular unique cluster
                grab = pick_a_cluster(inkey, cluster)
                with open(args.input, 'r') as inf3:
                    bigmat = big_cluster_v_cluster(inf3, grab, inkey, c_list, j)
                results_list.append(bigmat)  # a list of dataframes, one for each cluster column
                j += 1
            print('File processing complete; writing output file...\n')
            outdf = pd.concat(results_list, axis=1)
            # names the columns (and index, next line) according to clusters in the order processed
            outdf.columns = c_list
            outdf.index = c_list
            outdf.sort_index(axis=0, inplace=True)
            outdf.sort_index(axis=1, inplace=True)
            outdf = outdf.round(decimals=2)
            outdf.to_csv(outf)