def parse_SPADES(contigs, oldVersion=False, export_contig_data=None, export_contig_graph=None):
    result = []
    for c in contigs:
        spades_stats = parse_SPADES_contig(c, oldVersion=oldVersion)
        if spades_stats is not None:
            result.append(spades_stats)
        else:
            print("Failed to parse SPAdes contig")
    contig_table = pd.DataFrame(result)
    ### TODO: move this plotting/export logic to a higher level
    if has_plt:
        try:
            if isinstance(export_contig_graph, str):
                fig = contig_table.plot(kind='scatter', x='Contig_Size', y='Coverage', logx=True, logy=True)
                fig = fig.get_figure()
                fig.savefig(export_contig_graph)
        except Exception as e:
            print('Failed to save contig stats scatterplot at ' + export_contig_graph)
            utilities.printExceptionDetails(e)
    try:
        if isinstance(export_contig_data, str):
            contig_table[ContigHeaders[0:-1]].to_csv(export_contig_data, index=False)
    except Exception:
        print('Failed to save contig stats table at ' + export_contig_data)
        raise
    return contig_table
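## Usage sketch for parse_SPADES (hypothetical filenames; assumes
## parse_SPADES_contig accepts Biopython SeqRecords, consistent with the
## SeqIO-style parsing used elsewhere in this module):
# from Bio import SeqIO
# contigs = list(SeqIO.parse('spades_output/contigs.fasta', 'fasta'))
# contig_table = parse_SPADES(contigs,
#                             export_contig_data='sample_contigs.csv',
#                             export_contig_graph='sample_contigs.png')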
def BeforeAndAfter(pre_stats, post_stats):
    rename_raw = {x: x + '_raw' for x in pre_stats.columns}
    # Each frame should have a single index; merge raw (pre) and cleaned (post) stats side by side.
    merged_stats = pd.merge(pre_stats.rename(columns=rename_raw), post_stats,
                            left_index=True, right_index=True, how='outer')
    try:
        if ('Bases_In_Contigs' in merged_stats) and ('Bases_In_Contigs_raw' in merged_stats):
            ## These should always be integers
            merged_stats['Discarded_Bases'] = merged_stats.Bases_In_Contigs_raw.astype(int) - merged_stats.Bases_In_Contigs.astype(int)
            merged_stats['Discarded_Percent'] = 100 * merged_stats.Discarded_Bases.astype(int) / merged_stats.Bases_In_Contigs_raw.astype(int)
            if 'HalfCov_Contig_Bases' in merged_stats:
                merged_stats['HalfCov_Percent'] = 100 * merged_stats.HalfCov_Contig_Bases / merged_stats.Bases_In_Contigs.astype(int)
    except Exception as e:
        utilities.printExceptionDetails(e)
    # Fill missing values only after deriving the numeric columns; filling
    # first would poison the astype(int) casts above with 'N/A' strings.
    merged_stats.fillna('N/A', inplace=True)
    return merged_stats
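## Usage sketch for BeforeAndAfter (hypothetical variables): both frames must
## share index values so the outer merge lines raw and cleaned assemblies up
## row-for-row. In multiple(), below, the guide table is indexed by
## CleanedFile to match the recalculated stats indexed by Filename.
# merged = BeforeAndAfter(raw_stats.set_index('Filename'),
#                         clean_stats.set_index('Filename'))
# print(merged[['Discarded_Bases', 'Discarded_Percent']])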
def multiple(multi_args):
    if multi_args.force and multi_args.resume:
        print("Exiting: the options 'force' and 'resume' are incompatible. Use only 'force' if you want to overwrite prior files.")
        return 1
    output_dir = multi_args.output if multi_args.output else utilities.safeMakeOutputFolder(_outputBase)
    utilities.safeMakeDir(output_dir)
    logFile = os.path.join(output_dir, "AssemblyCleanup.log")
    resultFile = os.path.join(output_dir, "AssemblyCleanupTable.tab")
    tempFile = utilities.appendToFilename(resultFile, '_temp')
    sys.stdout = utilities.Logger(logFile)
    assembler_name = None if multi_args.assembler is None else multi_args.assembler.lower()
    print("Parameters:")
    for k, v in vars(multi_args).items():
        print('{} : {}'.format(k, v))
    draft_location = multi_args.draft_location
    if os.path.isfile(draft_location):
        guideFrame = pd.read_table(draft_location)
        print('Loaded guide table from ' + draft_location)
        print("\t table contains {} records".format(len(guideFrame)))
    elif os.path.isdir(draft_location):
        print("Searching for files in " + os.path.abspath(draft_location))
        deep_search = not multi_args.shallow_search_assemblies
        guideFrame = NGS_data_utilities.listGenomeFilesWithNames(draft_location, deep_search=deep_search, extension=multi_args.extension)
        ##Exclude reads
        size_limit = multi_args.size_limit
        if size_limit > 0:
            guideFrame['filesize'] = guideFrame.Filename.apply(os.path.getsize)
            small_enough = (guideFrame.filesize <= size_limit)
            if sum(small_enough) < len(guideFrame):
                print('Only {} of {} files pass the upper size limit of {}'.format(sum(small_enough), len(guideFrame), size_limit))
            guideFrame = guideFrame[small_enough].copy()
        guideFrame = guideFrame[NGS_data_utilities.dfHeaders].copy()
        if guideFrame is None or (len(guideFrame) == 0):
            print("Exiting. Failed to retrieve any files")
            return 1
        if assembler_name:
            guideFrame['assembler'] = assembler_name
            print('assigned assembler to be ' + assembler_name)
        else:  ##This is not passed to AssemblyStats
            for i in guideFrame.index:
                if 'spades' in guideFrame.loc[i, 'Filename'].lower():
                    guideFrame.loc[i, 'assembler'] = 'spades'
                    print('assigned assembler to be spades for {}'.format(guideFrame.loc[i, 'Lab_ID']))
        print('Calculating raw stats...')
        assemblyStats = AssemblyStats.calculateStats(guideFrame.Filename.tolist(), ass_format=assembler_name, image_dir=output_dir)  ##This will independently infer assembler from name unless given
        if assemblyStats is None or len(assemblyStats) == 0:
            print("Exiting. Failed to calculate assembly stats on input")
            return 1
        assemblyStats['Contig_Count'] = assemblyStats['Contig_Count'].astype(int)
        guideFrame = pd.merge(guideFrame, assemblyStats, how='left')  ##Should merge on Filename. Don't want confusion if they share other fields
        if multi_args.BCFB_PacBio_Name:
            print('interpreting BCFB PacBio names...')
            for i in guideFrame.index:
                guideFrame.loc[i, 'Gaps'] = False if '.ro1m.' in guideFrame.loc[i, 'Filename'] else True
        else:
            guideFrame['Gaps'] = True  ### Assume no closed genomes unless stated
    else:
        print("Exiting. Unable to find the location of draft files: {}".format(draft_location))
        return 1
    print('Loaded data...')
    process = None
    if multi_args.reorient:
        process = 'RO'
    elif multi_args.discard:
        process = 'DIS'
    elif multi_args.discard_then_reorient:
        process = 'DIS_RO'
    else:
        print("Exiting. No processing specified")
        return 1
    expectedArgs = set(['working_dir', 'report_file', 'assembler'])
    # circle_new_start=None, reverse_contig=None, closed_circle=None, broken_circle=None, circularize_with_Ns=0,
    # length=250, coverage=10, report_file=None, reference=None, assembler=None
    if 'RO' in process:
        expectedArgs.update(RO_argset)
        if not os.path.isfile(multi_args.reference):
            print("Cannot find reference file. Exiting")
            return 1
    if 'DIS' in process:
        expectedArgs.update(DIS_argset)
    tag = multi_args.tag if multi_args.tag else process
    print('Result files will have the tag "{}"'.format(tag))
    ##TODO: test columns here
    permitted_fields = req_fields + list(expectedArgs)
    keep_fields = [x for x in guideFrame.columns if x in permitted_fields]
    parameterFrame = guideFrame[keep_fields].copy()
    if len(parameterFrame) == 0:
        return 1  ##Failure
    fail_list = []
    for i, row in parameterFrame.iterrows():  ##Row gets converted to keyword arguments; shares index with guideFrame
        assembly_file = row['Filename']
        if not os.path.isfile(assembly_file):
            print("Error: unable to find file: {}".format(assembly_file))
            output_file = 'error'
        else:
            print("Working on " + os.path.basename(assembly_file))
            print("\tat {}".format(time.ctime()))
            del row['Filename']
            if 'Contig_Count' in row.index:
                if str(row['Contig_Count']) == str(1):
                    gaps = row['Gaps']
                    gap_bool = True  ##Safest default (will introduce contig breaks). But should probably skip reorientation
                    if isinstance(gaps, str):
                        if gaps.upper() == 'TRUE':
                            gap_bool = True
                        elif gaps.upper() == 'FALSE':
                            gap_bool = False
                        else:
                            print("unable to interpret 'gaps' notation: {}".format(gaps))
                            continue
                    elif isinstance(gaps, bool):
                        gap_bool = gaps
                    else:
                        print("unable to interpret 'gaps' notation: {}".format(gaps))
                        continue
                    if gap_bool:
                        row['broken_circle'] = True  ##NOTE: with our bacteria, we assume circle
                    else:
                        row['closed_circle'] = True
                del row['Gaps']
            assembly_basename = utilities.appendToFilename(os.path.basename(assembly_file), '_' + tag)
            output_file = os.path.join(output_dir, assembly_basename)
            report_file = os.path.join(output_dir, os.path.basename(assembly_file)) + '.report.txt'
            has_out = os.path.isfile(output_file)
            has_rpt = os.path.isfile(report_file)
            if has_out or has_rpt:
                if multi_args.force:
                    if has_out:
                        print("Removing pre-existing file: {}".format(output_file))
                        os.remove(output_file)
                    if has_rpt:
                        print("Removing pre-existing file: {}".format(report_file))
                        os.remove(report_file)
                else:
                    if not multi_args.resume:
                        print("Error: Refusing to overwrite pre-existing output files: \n\t{}\n\t{}".format(output_file, report_file))
                    continue
            try:
                open(output_file, 'a').close()
                os.remove(output_file)
            except IOError:
                print("Error. Do not have permission to write to output file \n\t{}".format(output_file))
                continue
            cleanup_args = vars(multi_args).copy()  ##TODO: put this up front?
            cleanup_args.update(row.to_dict())
            cleanup_args['working_dir'] = os.path.join(output_dir, 'work')
            cleanup_args = {k: v for k, v in cleanup_args.items() if k in expectedArgs}
            if 'Mean_Coverage' in row.index:
                proportion_cutoff = multi_args.coverage_proportion * row.loc['Mean_Coverage']
                min_coverage = max(multi_args.coverage, proportion_cutoff)
                cleanup_args['coverage'] = min_coverage
                del cleanup_args['Mean_Coverage']
            else:
                cleanup_args['coverage'] = multi_args.coverage  ##This should actually be irrelevant
            try:
                print("Arguments:")
                print(cleanup_args)
                if cleanupAndWrite(assembly_file, output_file, report_file=report_file, **cleanup_args) != 0:  ##TODO: return stats
                    output_file = 'error'
                    fail_list.append(assembly_file)
            except Exception as e:
                fail_list.append(assembly_file)
                output_file = 'error'
                warn = "Exception on cleanupAndWrite:"
                utilities.printExceptionDetails(e, warn)
            print()  ##Blank line
        guideFrame.loc[i, 'CleanedFile'] = output_file
        guideFrame.to_csv(tempFile, index=False, sep='\t')
    print("Errors on {} files: ".format(len(fail_list)))
    print("\n\t".join(fail_list))
    if process in ['DIS', 'DIS_RO']:  ##recalculate stats for filtered contig sets
        assemblyStats2 = AssemblyStats.calculateStats(guideFrame.CleanedFile.tolist(), ass_format=assembler_name)
        if assemblyStats2 is not None:
            # assemblyStats2.rename(columns={'Filename':'CleanedFile'},inplace=True)
            guideFrame = AssemblyStats.BeforeAndAfter(guideFrame.set_index("CleanedFile"), assemblyStats2.set_index('Filename'))
            # guideFrame = pd.merge(guideFrame,assemblyStats2,on='CleanedFile',suffixes=('_raw',''),how='outer')
    print("Reporting stats for {} genomes.".format(len(guideFrame)))
    guideFrame.fillna('N/A', inplace=True)
    utilities.safeOverwriteTable(resultFile, guideFrame, 'tab', index=False)
    return 0
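## Usage sketch for multiple() (hypothetical attribute values; the real
## namespace comes from this script's argparse parser, which may define more
## options than shown here):
# import argparse
# args = argparse.Namespace(force=False, resume=False, output='cleanup_out',
#                           assembler='spades', draft_location='drafts/',
#                           shallow_search_assemblies=False, extension=None,
#                           size_limit=0, BCFB_PacBio_Name=False, tag=None,
#                           reorient=False, discard=True,
#                           discard_then_reorient=False, reference=None,
#                           coverage=10, coverage_proportion=0.1)
# exit_code = multiple(args)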
def calculateStats(filelist_or_frame, out_file=None, ass_format=None, image_dir=None, save_details=False):
    if isinstance(filelist_or_frame, list):
        filelist = filelist_or_frame
        fileframe = None
    elif isinstance(filelist_or_frame, pd.DataFrame):
        filelist = filelist_or_frame.Filename
        fileframe = filelist_or_frame
    else:
        raise ValueError("can only calculate stats on a list of filenames or a DataFrame with a Filename field")
    if len(filelist) == 0:
        raise ValueError("AssemblyStats calculateStats requires a list of files with length > 0. Contact developer")
    assFrame = None
    if isinstance(image_dir, str):
        utilities.safeMakeDir(image_dir)
    assemblyList = []
    for filename in filelist:
        if isinstance(ass_format, str):
            assembler = ass_format
        elif 'spades' in filename:
            assembler = 'spades'
            print("Guessing assembler as {}".format(assembler))
        elif 'skesa' in filename:
            assembler = 'skesa'
            print("Guessing assembler as {}".format(assembler))
        else:
            assembler = None
        genome_format, _ = utilities.guessFileFormat(filename)
        AssInfo = {'Filename': filename}  ##This will report data for all files provided. Junk files will have 0 contigs and 0 size
        if genome_format is None:
            AssInfo['Note'] = 'Could not identify genome format'
        else:
            try:
                contig_list = seq_utilities.seqs_guess_and_parse2list(filename)
                if isinstance(contig_list, list) and len(contig_list) > 0:
                    contigFrame = getContigStats(contig_list, hasQual=(genome_format == 'fastq'), assembler=assembler)
                    if 'Coverage' in contigFrame.columns:
                        contigFrame['Coverage'] = contigFrame['Coverage'].astype(float)  ##Note: Coverage is cast to float in getSpadesStats, but somehow becomes string in this frame.
                    if 'Contig_Size' in contigFrame.columns:
                        contigFrame['Contig_Size'] = contigFrame['Contig_Size'].astype(int)
                    if save_details:
                        contig_file = utilities.setExt(utilities.appendToFilename(filename, '_contigs'), '.xlsx')
                        contigFrame.to_excel(contig_file)
                    assert len(contig_list) == len(contigFrame), "Not all contigs are in dataframe"
                    if isinstance(image_dir, str) and os.path.isdir(image_dir):
                        if has_plt:
                            if ('Coverage' in contigFrame.columns) and ('Contig_Size' in contigFrame.columns):
                                tempFrame = contigFrame[['Coverage', 'Contig_Size']].copy()
                                try:
                                    raw_filename = os.path.join(image_dir, os.path.basename(filename))
                                    image_file = utilities.setExt(raw_filename, 'png')
                                    if isinstance(image_file, str):
                                        fig = tempFrame.plot(kind='scatter', x='Contig_Size', y='Coverage', logx=True, logy=True)
                                        fig = fig.get_figure()
                                        fig.savefig(image_file)
                                except Exception as e:
                                    print('Failed to save contig stats scatterplot at ' + image_file)
                                    for c in tempFrame.columns:
                                        print(tempFrame[c])
                                    utilities.printExceptionDetails(e)
                                else:
                                    try:
                                        plt.close(fig)
                                    except Exception:
                                        print("Failed to close image...")
                            elif assembler in ['skesa', 'spades']:
                                print("Unable to produce contig stats scatterplot because necessary fields are not present ('Contig_Size' and 'Coverage')")
                    AssInfo['Contig_Count'] = str(len(contig_list))
                    contigSizes = contigFrame['Contig_Size'].astype(int)
                    assemblySize = sum(contigSizes)
                    AssInfo['Bases_In_Contigs'] = str(assemblySize)
                    largeContigs = contigSizes > 10000
                    AssInfo['Large_Contig_Count'] = str(sum(largeContigs))
                    AssInfo['Small_Contig_Count'] = str(sum(~largeContigs))
                    AssInfo['Bases_In_Large_Contigs'] = str(sum(contigSizes[largeContigs]))
                    AssInfo['Bases_In_Small_Contigs'] = str(sum(contigSizes[~largeContigs]))
                    emptyContigs = contigSizes == 0
                    if sum(emptyContigs) > 0:
                        print('\n#### WARNING #### EMPTY CONTIGS ########\n')
                        print('\n\t'.join(contigFrame[emptyContigs].Contig_Name.tolist()))
                        print('\n########################################\n')
                    if 'Coverage' in contigFrame.columns:
                        contigCoverage = contigFrame['Coverage']  ##should be float, but seems to get converted to a string with some versions
                        if len(contigCoverage[largeContigs]) > 0:
                            min_c = min(contigCoverage[largeContigs])
                            AssInfo['Min_Coverage_Large_Contigs'] = str(min_c)
                            max_c = max(contigCoverage[largeContigs])
                            AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = '{:0.2f}'.format(max_c / min_c)
                            lowC_contigs = contigFrame['Coverage'] < (min_c / 2)
                            AssInfo['Low_Coverage_Contig_Count'] = sum(lowC_contigs)
                            AssInfo['Low_Coverage_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs, 'Contig_Size'])
                        else:
                            AssInfo['Min_Coverage_Large_Contigs'] = 'N/A'
                            AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = 'N/A'
                            AssInfo['Low_Coverage_Contig_Count'] = 'N/A'
                            AssInfo['Low_Coverage_Contig_Bases'] = 'N/A'
                        coverageProduct = contigFrame['Contig_Size'].astype(int) * contigFrame['Coverage']
                        coverageProductSum = sum(coverageProduct)
                        meanCoverage = coverageProductSum / assemblySize
                        AssInfo['Mean_Coverage'] = meanCoverage
                        lowC_contigs = contigFrame['Coverage'] < (meanCoverage / 2)
                        AssInfo['HalfCov_Contig_Count'] = sum(lowC_contigs)
                        AssInfo['HalfCov_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs, 'Contig_Size'])
                    if feature_head in contigFrame:
                        featureCounts = contigFrame[feature_head].astype(int)
                        AssInfo[feature_head] = sum(featureCounts)
                    ### Sum ambiguous nucleotides
                    ambigCounts = contigFrame['Ambiguous_nucleotides'].astype(int)
                    AssInfo['Ambiguous_nucleotides'] = sum(ambigCounts)
                    ## Import the quality scores
                    for c in contigFrame.columns:
                        if c.startswith(quality_head):
                            AssInfo[c] = str(sum(contigFrame[c]))
                    ##Calculate N50, N75, and N90
                    N_stats = calcN50_stats(contigSizes.tolist(), thresholds=[50, 75, 90])
                    for n, size in N_stats.items():
                        header = "N{}".format(n)
                        AssInfo[header] = str(size)
                else:
                    print("failed to parse file: " + filename)
                    AssInfo['Note'] = 'No sequences parsed from file'
            except Exception as e:
                print("Warning: failed to assess file: " + filename)
                print("Exception: {}".format(e))
                raise
        if 'Bases_In_Contigs' not in AssInfo:
            AssInfo['Bases_In_Contigs'] = 0
        if 'Contig_Count' not in AssInfo:
            AssInfo['Contig_Count'] = 0
        assemblyList.append(AssInfo)
    if len(assemblyList) > 0:
        print("Stats for {} assemblies.".format(len(assemblyList)))
        assFrame = pd.DataFrame(assemblyList)
        if isinstance(fileframe, pd.DataFrame):
            saveFrame = pd.merge(fileframe, assFrame, on='Filename')
        else:
            saveFrame = assFrame.set_index('Filename')
        if out_file is not None:
            try:
                saveFrame.to_csv(out_file)
            except Exception as e:
                print(saveFrame.to_csv())
                print()
                print("Failed to print to target file {}. \nPrinted results to screen (above)".format(out_file))
                utilities.printExceptionDetails(e)
    else:
        print("Failed to evaluate assemblies...")
        print("attempted to evaluate the following files:\n" + "\n".join(filelist))
    return assFrame
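## Usage sketch for calculateStats (hypothetical filenames): pass either a
## plain list of assembly files or a DataFrame with a Filename column;
## image_dir triggers one size-vs-coverage scatterplot per assembly when
## matplotlib is available.
# stats = calculateStats(['isolateA_spades.fasta', 'isolateB_skesa.fasta'],
#                        out_file='assembly_stats.csv',
#                        image_dir='stat_plots')
# print(stats[['Filename', 'Contig_Count', 'N50']])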
def listReadFilesWithNames(directory, outfile=None, read_extension=None, verbose=False, doAssignReadSets=False,
                           deep_search=True, read_codes=None, useLabID=True, target_path='/'):
    if read_codes is None:
        read_codes = df_read_codes
    if read_extension is None:
        read_extension = read_ext
    extRE = re.compile(re.escape(read_extension))  ##All read files are compressed fastq
    fileList = []
    readDataFile = None
    ##Find all read files in this directory tree
    abs_dir = stayOnPath(directory, target_path)
    print("Directory is " + abs_dir)
    if deep_search:
        for rootdir, _, files in os.walk(abs_dir):
            if verbose:
                print("Scanning {}".format(rootdir))
            for filename in files:
                if extRE.search(filename):
                    fileList.append(stayOnPath(os.path.join(rootdir, filename), target_path))
                elif (rootdir == abs_dir) and filename.endswith('.xlsx') and (not filename.startswith('~')):
                    if readDataFile is None:
                        readDataFile = os.path.join(abs_dir, filename)
                    else:
                        print("Warning: found multiple excel files in top of directory. Not clear which is the demultiplexing file")
                elif verbose:
                    print("ignoring file: " + filename)
    else:
        all_files = os.listdir(abs_dir)
        first_file = True
        for filename in all_files:  ##TODO: refactor
            if extRE.search(filename):
                if first_file:
                    print("\t Collecting files from directory: " + abs_dir)
                    first_file = False
                fileList.append(os.path.join(abs_dir, filename))
            elif verbose:
                print("ignoring file: " + filename)
    if verbose:
        print("Identified {} files in {}".format(len(fileList), directory))
    #### Interpret the read filenames
    readFrame = pairReads(fileList, read_codes=read_codes, useLabID=useLabID)
    #### Append any additional information
    if readFrame is None:
        print("Failed to identify read files in {}".format(directory))
    else:
        try:
            readFrame['Date_Created'] = readFrame['Read1'].apply(lambda x: time.ctime(os.path.getctime(x)))
        except OSError as e:
            print("Failed to identify file creation times")
            utilities.printExceptionDetails(e)
        readFrame['Date_Ingested'] = time.ctime()
        if verbose:
            print("Returned {} read sets".format(len(readFrame)))
        if readDataFile is not None:  ###Append data to frame if available; filename was identified during directory search, so the file exists
            readDataFrame = openReadDataFile(readDataFile)
            if isinstance(readDataFrame, pd.DataFrame):
                readFrame = pd.merge(readFrame, readDataFrame, how='left')
            else:
                print("Error reading read data file")
        if outfile is not None:
            if os.path.isfile(outfile):
                priorFrame = pd.read_table(outfile)
                print("Appending read list to existing file: " + outfile)
                totalFrame = pd.concat([priorFrame, readFrame], ignore_index=True)
            else:
                totalFrame = readFrame
            print("Saving list to " + outfile)
            ##TODO: validate totalFrame before saving
            totalFrame.to_csv(outfile, sep='\t', index=False)
        if doAssignReadSets:
            assignReadSets(readFrame)
    return readFrame
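## Usage sketch for listReadFilesWithNames (hypothetical directory): walk a
## run folder for compressed fastq files, pair them into read sets, and append
## the result to a cumulative tab-delimited list.
# reads = listReadFilesWithNames('/data/run42/fastq',
#                                outfile='read_list.tab',
#                                verbose=True)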