def main():
    """Command-line entry point: tabulate NGS data files found on disk.

    When at least one of --read_dir, --assembly_dir or --MiSeq_dir is given,
    only the given directories are listed. Otherwise the module-level default
    locations (MiSeq runs, reads, assemblies and the read mirror) are scanned
    and a record count is printed for each.
    """
    epilog_pieces = (
        'If no NGS data directory is given, will automatically scan the following directories recursively: \n',
        '\tread directory: {}'.format(default_read_dir),
        '\n\tassembly directory: {}'.format(default_ass_dir),
        '\n\tMiSeq directories: {}'.format(','.join(BML_read_locations)),
    )
    parser = argparse.ArgumentParser(
        description='A program collect information about NGS data files',
        epilog=''.join(epilog_pieces))

    ### general info
    parser.add_argument(
        '--version', '-V', action='version',
        version='%(prog)s {}.{}'.format(SCRIPT_VERSION, SCRIPT_SUBVERSION))

    ### directory options (all optional); registered in display order
    directory_options = (
        ('--assembly_dir', '-ad', 'Directory with assemblies'),
        ('--read_dir', '-rd', 'Directory with reads'),
        ('--out_dir', '-od', 'Output directory'),
        ('--MiSeq_dir', '-md', 'Directory with MiSeq reads and sample sheets'),
    )
    for long_flag, short_flag, help_text in directory_options:
        parser.add_argument(long_flag, short_flag, help=help_text)
    parser.add_argument(
        '--misname_file', '-mf',
        help='Excel spreadsheet with name corrections',
        default=default_misname_file, type=str)

    args = parser.parse_args()

    # Fall back to a freshly made output folder when none was requested.
    out_dir = args.out_dir or utilities.safeMakeOutputFolder(_outputBase)
    ass_out = os.path.join(out_dir, assemblies_file)
    read_out = os.path.join(out_dir, reads_file)
    Mi_out = os.path.join(out_dir, MiSeq_files)
    mirror_out = os.path.join(out_dir, mirrored_reads)

    # Placeholder: the name-correction spreadsheet is located but not used yet.
    if isinstance(args.misname_file, str) and os.path.exists(args.misname_file):
        pass

    user_gave_directory = any(
        (args.read_dir, args.assembly_dir, args.MiSeq_dir))
    if not user_gave_directory:
        # Default scan of the standard locations.
        print("\nStarting BML MiSeq reads...")
        records = listReadsFromMiSeqToplevel(
            BML_read_locations, outfile=Mi_out, read_extension=read_ext,
            verbose=True, doAssignReadSets=False)
        print("\tReported {} records".format(len(records)))

        print("Starting BCFB reads...")
        records = listReadFilesWithNames(
            default_read_dir, outfile=read_out, read_extension=read_ext,
            verbose=False, doAssignReadSets=True)
        print("\tReported {} records".format(len(records)))

        print("\nStarting BCFB assemblies...")
        records = listGenomeFilesWithNames(
            default_ass_dir, outfile=ass_out, deep_search=True, verbose=False)
        print("\tReported {} records".format(len(records)))

        ##Get the NCBS stuff
        ##Note, this contains some files that were deleted from our main data directory
        print("Starting BCFB mirrored reads...")
        mirrored = listReadFilesWithNames(
            default_read_mirror, outfile=mirror_out, read_extension=read_ext,
            verbose=False, doAssignReadSets=True)
        print("\tReported {} mirrored reads".format(len(mirrored)))
    else:
        # Only list what the user pointed at.
        if args.read_dir:
            listReadFilesWithNames(
                args.read_dir, outfile=read_out, read_extension=read_ext,
                verbose=False, doAssignReadSets=True)
        if args.assembly_dir:
            listGenomeFilesWithNames(
                args.assembly_dir, outfile=ass_out, deep_search=True,
                verbose=False)
        if args.MiSeq_dir:
            listReadsFromMiSeqToplevel(
                args.MiSeq_dir, outfile=Mi_out, read_extension=read_ext,
                verbose=False, doAssignReadSets=False)
def __init__(self, out_directory, file_identifiers, output_basename='alleles.fasta'):
    """Create the output folder and reserve one sequence file per identifier.

    Args:
        out_directory: Folder name (str) passed to
            utilities.safeMakeOutputFolder; the returned path is stored in
            self.directory.
        file_identifiers: set of identifiers (iterated as loci); each one
            gets its own output file named '<locus>_<output_basename>'.
        output_basename: Shared filename suffix (str).

    Raises:
        AssertionError: if any argument has the wrong type.
    """
    assert isinstance(out_directory, str)
    assert isinstance(file_identifiers, set)
    # Validate the basename as well; it is formatted into every filename below.
    assert isinstance(output_basename, str)
    self.directory = utilities.safeMakeOutputFolder(
        out_directory)  ##Tacks on timestamp // safeMakeDir does not
    self.ids = file_identifiers
    self.basename = output_basename
    self.sequence_files = {}
    for locus in self.ids:
        filename = os.path.join(self.directory,
                                '{}_{}'.format(locus, self.basename))
        ## Will not overwrite file
        self.sequence_files[locus] = utilities.checkForOverwrite(filename)
def __init__(self,primer_file,working_dir=None,generate_output=False):
    """Load the primer table and prepare output / scratch locations.

    Args:
        primer_file: Passed to read_file_to_dict; the result is stored as
            self.primers_dict and its keys are used as locus names.
        working_dir: Parent directory for outputs and temporary files;
            the current working directory is used when None.
        generate_output: When true, an 'AmpliconExtractor' output folder is
            made under working_dir along with per-locus sequence file paths
            and an amplicon information table path. When false those
            attributes are all None.
    """
    ### Make writable directories
    base_dir = os.getcwd() if working_dir is None else working_dir
    self.generate_output = generate_output
    self.primers_dict = read_file_to_dict(primer_file)

    if not generate_output:
        self.outDir = None
        self.sequence_files = None
        self.amplicon_info_file = None
        temp_parent = base_dir
    else:
        self.outDir = utilities.safeMakeOutputFolder(
            os.path.join(base_dir, 'AmpliconExtractor'))
        per_locus_files = {}
        for locus in self.primers_dict.keys():
            per_locus_files[locus] = os.path.join(
                self.outDir,
                '{}_primer-extracted_sequences.fasta'.format(locus))
        self.sequence_files = per_locus_files
        self.amplicon_info_file = os.path.join(
            self.outDir, 'amplicon_information.tab')
        temp_parent = self.outDir

    # Scratch space lives inside the output folder when there is one,
    # otherwise directly under the working directory.
    self.tempDirObj = tempfile.TemporaryDirectory(
        suffix='_AmpExt', prefix='tmp', dir=temp_parent)
    self.amplicon_info_list = []
def multiple(multi_args):
    """Clean up a batch of draft assemblies and write a summary table.

    The drafts come from ``multi_args.draft_location``: either a
    tab-delimited guide table (a file) or a directory that is searched for
    genome files. Each assembly is passed to ``cleanupAndWrite`` with
    arguments assembled from the command line and the guide-table row.
    Standard output is replaced by a ``utilities.Logger`` writing to
    'AssemblyCleanup.log' in the output directory.

    Args:
        multi_args: Parsed command-line namespace (force, resume, output,
            assembler, draft_location, reorient/discard options, ...).

    Returns:
        int: 0 when the batch ran to completion (per-file failures are
        listed but are not fatal); 1 when setup failed (conflicting options,
        no input found, no processing selected, missing reference).
    """
    if multi_args.force and multi_args.resume:
        print(
            "Exiting: the options 'force' and 'resume' are incompatible. Use only 'force' if you want to overwrite prior files."
        )
        return 1
    output_dir = multi_args.output if multi_args.output else utilities.safeMakeOutputFolder(
        _outputBase)
    utilities.safeMakeDir(output_dir)
    logFile = os.path.join(output_dir, "AssemblyCleanup.log")
    resultFile = os.path.join(output_dir, "AssemblyCleanupTable.tab")
    tempFile = utilities.appendToFilename(resultFile, '_temp')
    sys.stdout = utilities.Logger(logFile)
    assembler_name = None if multi_args.assembler is None else multi_args.assembler.lower()
    print("Parameters:")
    for k, v in vars(multi_args).items():
        print('{} : {}'.format(k, v))

    draft_location = multi_args.draft_location
    if os.path.isfile(draft_location):
        guideFrame = pd.read_table(draft_location)
        print('Loaded guide table from ' + draft_location)
        print("\t table contains {} records".format(len(guideFrame)))
    elif os.path.isdir(draft_location):
        print("Searching for files in " + os.path.abspath(draft_location))
        deep_search = not multi_args.shallow_search_assemblies
        guideFrame = NGS_data_utilities.listGenomeFilesWithNames(
            draft_location,
            deep_search=deep_search,
            extension=multi_args.extension)
        # Check for "nothing found" before the frame is indexed below.
        if guideFrame is None or len(guideFrame) == 0:
            print("Exiting. Failed to retrieve any files")
            return 1
        ##Exclude reads
        size_limit = multi_args.size_limit
        if size_limit > 0:
            guideFrame['filesize'] = guideFrame.Filename.apply(os.path.getsize)
            small_enough = (guideFrame.filesize <= size_limit)
            if sum(small_enough) < len(guideFrame):
                print('Only {} of {} files pass the upper size limit of {}'.
                      format(sum(small_enough), len(guideFrame), size_limit))
                guideFrame = guideFrame[small_enough].copy()
        guideFrame = guideFrame[NGS_data_utilities.dfHeaders].copy()
        # The size filter may have removed every file.
        if len(guideFrame) == 0:
            print("Exiting. Failed to retrieve any files")
            return 1
        if assembler_name:
            guideFrame['assembler'] = assembler_name
            print('assigned assembler to be ' + assembler_name)
        else:  #This is not passed to AssemblyStats
            for i in guideFrame.index:
                if 'spades' in guideFrame.loc[i, 'Filename'].lower():
                    guideFrame.loc[i, 'assembler'] = 'spades'
                    print('assigned assembler to be spades for {}'.format(
                        guideFrame.loc[i, 'Lab_ID']))
        print('Calculating raw stats...')
        ##This will independently infer assembler from name unless given
        assemblyStats = AssemblyStats.calculateStats(
            guideFrame.Filename.tolist(),
            ass_format=assembler_name,
            image_dir=output_dir)
        # Validate the result before touching its columns.
        if assemblyStats is None or len(assemblyStats) == 0:
            print("Exiting failed to calculate assembly stats on input")
            return 1
        assemblyStats['Contig_Count'] = assemblyStats['Contig_Count'].astype(
            int)
        ##Should merge on Filename. Don't want confusion if they share other fields
        guideFrame = pd.merge(guideFrame, assemblyStats, how='left')
        if multi_args.BCFB_PacBio_Name:
            print('interpreting BCFB PacBio names...')
            for i in guideFrame.index:
                guideFrame.loc[i, 'Gaps'] = False if '.ro1m.' in guideFrame.loc[
                    i, 'Filename'] else True
        else:
            guideFrame['Gaps'] = True  ### Assume no closed genomes unless stated
    else:
        print("Exiting. Unable to find the location of draft files: {}".format(
            draft_location))
        return 1
    print('Loaded data...')

    process = None
    if multi_args.reorient:
        process = 'RO'
    elif multi_args.discard:
        process = 'DIS'
    elif multi_args.discard_then_reorient:
        process = 'DIS_RO'
    else:
        print("Exiting. No processing specified")
        return 1

    # Only these keys are forwarded to cleanupAndWrite.
    expectedArgs = set(['working_dir', 'report_file', 'assembler'])
    if 'RO' in process:
        expectedArgs.update(RO_argset)
        # A missing reference option (None) must not reach os.path.isfile.
        if not multi_args.reference or not os.path.isfile(
                multi_args.reference):
            print("Cannot find reference file. Exiting")
            return 1
    if 'DIS' in process:
        expectedArgs.update(DIS_argset)
    tag = multi_args.tag if multi_args.tag else process
    print('Result files will have the tag "{}"'.format(tag))

    ##TODO test columns here
    permitted_fields = req_fields + list(expectedArgs)
    keep_fields = [x for x in guideFrame.columns if x in permitted_fields]
    parameterFrame = guideFrame[keep_fields].copy()
    if len(parameterFrame) == 0:
        return 1  ##Failure

    fail_list = []
    ##Row gets converted to keyword arguments; shares index with guideFrame
    for i, row in parameterFrame.iterrows():
        assembly_file = row['Filename']
        if not os.path.isfile(assembly_file):
            print("Error: unable to find file: {}".format(assembly_file))
            output_file = 'error. '
        else:
            print("Working on " + os.path.basename(assembly_file))
            print("\tat {}".format(time.ctime()))
            del row['Filename']
            if 'Contig_Count' in row.index:
                if str(row['Contig_Count']) == str(1):
                    gaps = row['Gaps']
                    ##Safest default (will introduce contig breaks). But should probably skip reorientation
                    gap_bool = True
                    if isinstance(gaps, str):
                        if gaps.upper() == 'TRUE':
                            gap_bool = True
                        elif gaps.upper() == 'FALSE':
                            gap_bool = False
                        else:
                            print("unable to interpret 'gaps' notation: {}".
                                  format(gaps))
                            continue
                    elif isinstance(gaps, bool):
                        gap_bool = gaps
                    else:
                        print("unable to interpret 'gaps' notation: {}".format(
                            gaps))
                        continue
                    if gap_bool:
                        ##NOTE: with our bacteria, we assume circle
                        row['broken_circle'] = True
                    else:
                        row['closed_circle'] = True
                    del row['Gaps']
            assembly_basename = utilities.appendToFilename(
                os.path.basename(assembly_file), '_' + tag)
            output_file = os.path.join(output_dir, assembly_basename)
            report_file = os.path.join(
                output_dir, os.path.basename(assembly_file)) + '.report.txt'
            has_out = os.path.isfile(output_file)
            has_rpt = os.path.isfile(report_file)
            if has_out or has_rpt:
                if multi_args.force:
                    if has_out:
                        print(
                            "Removing prexisting file: {}".format(output_file))
                        os.remove(output_file)
                    if has_rpt:
                        print("Removing pre-existing file: {}".format(
                            report_file))
                        os.remove(report_file)
                else:
                    # NOTE(review): 'resume' is taken to mean "silently skip
                    # assemblies that already have output"; without it the
                    # skip is reported as an error. Confirm against intent.
                    if not multi_args.resume:
                        print(
                            "Error: Refusing to overwrite pre-existing output files: \n\t{}\n\t{}"
                            .format(output_file, report_file))
                    continue
            # Probe that the destination is writable, leaving nothing behind.
            try:
                open(output_file, 'a').close()
                os.remove(output_file)
            except IOError:
                print(
                    "Error. Do not have permission to write to output file \n\t{}"
                    .format(output_file))
                continue
            cleanup_args = vars(multi_args).copy()  ##TODO: put this up front?
            cleanup_args.update(row.to_dict())
            cleanup_args['working_dir'] = os.path.join(output_dir, 'work')
            cleanup_args = {
                k: v
                for k, v in cleanup_args.items() if k in expectedArgs
            }
            if 'Mean_Coverage' in row.index:
                proportion_cutoff = multi_args.coverage_proportion * row.loc[
                    'Mean_Coverage']
                min_coverage = max(multi_args.coverage, proportion_cutoff)
                cleanup_args['coverage'] = min_coverage
                # The key may already have been filtered out just above.
                cleanup_args.pop('Mean_Coverage', None)
            else:
                ##This should actually be irrelevant --
                cleanup_args['coverage'] = multi_args.coverage
            try:
                print("Arguments:")
                print(cleanup_args)
                if cleanupAndWrite(assembly_file,
                                   output_file,
                                   report_file=report_file,
                                   **cleanup_args) != 0:  ##TODO: return stats
                    output_file = 'error'
                    fail_list.append(assembly_file)
            except Exception as e:
                # Deliberately broad: one bad assembly must not stop the batch.
                fail_list.append(assembly_file)
                output_file = 'error'
                warn = "Exception on cleanupAndWrite:"
                utilities.printExceptionDetails(e, warn)
            print()  ##Blank line
        guideFrame.loc[i, 'CleanedFile'] = output_file
        # Checkpoint progress after every assembly.
        guideFrame.to_csv(tempFile, index=False, sep='\t')

    print("Errors on {} files: ".format(len(fail_list)))
    print("\n\t".join(fail_list))
    if process in ['DIS', 'DIS_RO']:
        ##recalculate stats for filtered contig sets
        assemblyStats2 = AssemblyStats.calculateStats(
            guideFrame.CleanedFile.tolist(), ass_format=assembler_name)
        if assemblyStats2 is not None:
            guideFrame = AssemblyStats.BeforeAndAfter(
                guideFrame.set_index("CleanedFile"),
                assemblyStats2.set_index('Filename'))
    print("Reporting stats for {} genomes.".format(len(guideFrame)))
    guideFrame.fillna('N/A', inplace=True)
    utilities.safeOverwriteTable(resultFile, guideFrame, 'tab', index=False)
    return 0
def single(args):
    """Clean up one assembly file as selected by the command-line options.

    Args:
        args: Parsed command-line namespace; reads ``assembly``, ``output``,
            ``force`` and the reorient / discard / discard_then_reorient
            switches, and forwards the options that ``cleanupAndWrite``
            expects for the chosen process.

    Returns:
        1 when the run cannot start (missing input, refusal to overwrite,
        unwritable output, no processing selected); otherwise whatever
        ``cleanupAndWrite`` returns.
    """
    assembly_file = args.assembly
    if not os.path.isfile(assembly_file):
        print("Exiting. Unable to find file {}".format(assembly_file))
        return 1

    source_name = os.path.basename(assembly_file)
    if not args.output:
        output_dir = utilities.safeMakeOutputFolder(_outputBase)
        output_file = os.path.join(
            output_dir, utilities.appendToFilename(source_name, '_RO'))
    else:
        output_file = args.output
        output_dir = os.path.dirname(output_file)

    # From here on, printed messages go through the log file wrapper.
    sys.stdout = utilities.Logger(
        os.path.join(output_dir, "AssemblyCleanup.log"))
    print(_outputBase)

    report_file = os.path.join(output_dir, source_name) + '.report.txt'
    output_exists = os.path.isfile(output_file)
    report_exists = os.path.isfile(report_file)
    if output_exists or report_exists:
        if not args.force:
            print(
                "Exiting. Refusing to overwrite pre-existing output files: \n\t{}\n\t{}"
                .format(output_file, report_file))
            return 1
        if output_exists:
            print("Removing prexisting file: {}".format(output_file))
            os.remove(output_file)
        if report_exists:
            print("Removing pre-existing file: {}".format(report_file))
            os.remove(report_file)

    # Touch the output file to prove it is writable.
    try:
        open(output_file, 'a').close()
    except IOError:
        print("Exiting. Do not have permission to write to output file")
        return 1

    ###########Should probably be a method
    # The first switch that is set wins, in this order.
    process = None
    for switch, code in (('reorient', 'RO'),
                         ('discard', 'DIS'),
                         ('discard_then_reorient', 'DIS_RO')):
        if getattr(args, switch):
            process = code
            break
    if process is None:
        print("Exiting. No processing specified")
        return 1

    expectedArgs = {'working_dir', 'report_file'}
    if 'RO' in process:
        expectedArgs.update(RO_argset)
    if 'DIS' in process:
        expectedArgs.update(DIS_argset)

    # Forward only the options cleanupAndWrite understands for this process.
    cleanup_args = {}
    for key, value in vars(args).items():
        if key in expectedArgs:
            cleanup_args[key] = value
    return cleanupAndWrite(assembly_file,
                           output_file,
                           report_file=report_file,
                           **cleanup_args)
def reorientClosedChromosome(raw_contig,
                             reference_file,
                             N_padding=-1,
                             working_dir=None,
                             set_steps=5,
                             set_window=5000):
    """Rotate a circular contig so it starts where the reference starts.

    Windows from the beginning of each reference sequence are BLASTed
    against the contigs. When consecutive windows hit one contig in a
    consistent order and orientation, that contig is shifted (and
    reverse-complemented if needed) via ``shiftCirclarChromosome``.

    Args:
        raw_contig: Sequence records to reorient (passed to SeqIO.to_dict
            and SeqIO.write).
        reference_file: Sequence file supplying the query windows.
        N_padding: Forwarded to ``shiftCirclarChromosome``.
        working_dir: Directory for intermediate files; a new
            'AssemblyCleanup_temp_' folder is made when it is not a string
            or cannot be created.
        set_steps: Number of query windows per reference sequence.
        set_window: Window length in bases.

    Returns:
        list of contig records after reorientation, or None when no start
        position could be anchored for a reference sequence (the BLAST
        table is then saved next to the BLAST output for inspection).

    Raises:
        Exception: whatever the BLAST command raises, after printing
            diagnostics.
    """
    temp_dir = None
    if isinstance(working_dir, str):
        try:
            utilities.safeMakeDir(working_dir)
            temp_dir = working_dir
        except IOError:
            pass  ##Leave temp_dir as None
    if temp_dir is None:
        temp_dir = utilities.safeMakeOutputFolder('AssemblyCleanup_temp_')

    ## Setup blast database for the sequences you are searching against
    raw_contig_dict = SeqIO.to_dict(raw_contig)
    raw_contig_file = os.path.join(
        temp_dir, utilities.makeSafeName("-".join(raw_contig_dict.keys())))
    SeqIO.write(raw_contig, raw_contig_file, 'fasta')
    db_name = os.path.join(temp_dir, os.path.basename(raw_contig_file))
    BLASThelpers.makeblastdb(raw_contig_file)

    ##Get several chunks near the beginning of the reference file, as query
    ref_seqs = seq_utilities.seqs_guess_and_parse2list(reference_file)
    for rs in ref_seqs:
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        rename = re.sub(r'\W', '_', rs.name)
        steps = set_steps
        window = set_window
        expected_search_length = steps * window - 1
        if len(rs) < expected_search_length:
            # Reference is short: use as many whole windows as fit, or a
            # single window spanning the entire sequence.
            steps = len(rs) // window
            if steps == 0:
                steps = 1
                window = len(rs)
            search_length = steps * window
            print(
                "Reference sequence is only {}bp; dropping search sequence from {} to {}"
                .format(len(rs), expected_search_length, search_length))
        else:
            search_length = expected_search_length

        SearchWindows = []
        for i in range(0, search_length, window):
            end_base = i + window
            contig = rs[i:end_base]
            contig.id = 'fragment_{}_to_{}'.format(i, end_base)
            SearchWindows.append(contig)
        query_basename = rename + '_WindowsQuery.fasta'
        query_filename = os.path.join(temp_dir, query_basename)
        with open(query_filename, 'wt') as seq_out:
            SeqIO.write(SearchWindows, seq_out, 'fasta')

        ##Run BLAST
        outfile = os.path.join(temp_dir,
                               rename + '_' + os.path.basename(db_name))
        ##Note: may need to have "high stringency" and "low stringency" options. This is low stringency (for mapping to distant relatives). High stringency would increase perc_identity here and the qcovs filtering of "results"
        blast_cline = NcbiblastnCommandline(query=shlex.quote(query_filename),
                                            db=shlex.quote(db_name),
                                            outfmt=_outfmt_str,
                                            out=shlex.quote(outfile),
                                            evalue=1E-100,
                                            perc_identity=80,
                                            qcov_hsp_perc=25,
                                            num_threads=2)
        stdout = stderr = None
        try:
            stdout, stderr = blast_cline()
        except Exception as e:
            print("Blast failed on {} with {}...output below...".format(
                rename, reference_file))
            print("\t{}".format(stdout))
            print("\t{}".format(stderr))
            print(e)
            raise

        results = pd.read_table(outfile, names=_outfmt_head)  #No headers in file
        # DataFrame.sort() no longer exists in pandas; sort_values() is the
        # supported equivalent.
        ##Should already be sorted by bitscore
        results = results[results[bh['qcovs']] > 50].sort_values(
            by=bh['bitscore'], ascending=False)

        full_start = full_end = 0  ##BLAST uses a 1 index
        first_hit = None
        coherent_fragments = 0
        for w in SearchWindows:
            window_hits = results[results[bh['qseqid']] == w.id]
            if len(window_hits) > 0:
                hit = window_hits.iloc[0]
                start = hit[bh['sstart']]
                end = hit[bh['send']]
                contig = hit[bh['sseqid']]
                forward = start < end
                if first_hit is None:  ##Serves as a sign that there was no prior hit
                    first_hit = hit
                    hit_contig = contig
                    full_start = start
                    full_end = end
                    full_forward = forward
                else:  ##Check that it is consistent with prior
                    in_order = (contig == hit_contig)
                    in_order &= full_forward == forward
                    in_order &= (full_end < start) == full_forward
                    in_order &= abs(end - full_end) < 2 * window
                    if in_order:
                        full_end = end
                        coherent_fragments += 1
                    else:
                        print("Warning: Contig {} is not in order. \nStopping".
                              format(w.id))
                        break  #For search windows
            else:
                print("Warning: Failed to find a match to fragment {}".format(
                    w.id))
                if coherent_fragments > 0:
                    print('Stopping since we have an anchor already')
                    break  #For search windows

        if coherent_fragments > 0:
            print("Shifting contig {} ({} bp)".format(
                hit_contig, len(raw_contig_dict[hit_contig])))
            new_contigs = shiftCirclarChromosome(raw_contig_dict[hit_contig],
                                                 full_start, not full_forward,
                                                 N_padding)
            # Replace the original record with the shifted one(s).
            del raw_contig_dict[hit_contig]
            for new_contig in new_contigs:
                raw_contig_dict[new_contig.id] = new_contig
                assert len(
                    new_contig
                ) != 0, 'Contig with length 0. Aborting. Contact developer'
            print('Rotating contig: {}'.format(hit_contig))
            print('Starting at {}'.format(full_start))
            if full_forward:
                print("keeping orientation")
            else:
                print("Reverse complement")
        else:
            print(
                'Aborting: Failed to identify the start position based on the reference genome.'
            )
            print(
                "\t Reorient contig by specifying args.circle_new_start and/or args.reverse_contig"
            )
            blast_results = outfile + '.tab'
            print('\t Saving BLAST results at ' + blast_results)
            results.to_csv(blast_results, sep='\t')
            return None
    return list(raw_contig_dict.values())
def main():
    """Command-line entry point for batched Mauve contig reordering.

    Every draft genome found in ``draft_dir`` is reordered against
    ``reference_genome`` through ``MauveHelper.reorder_contigs``; the
    per-genome results are written to 'reorderStats.tab' in the result
    directory.

    Raises:
        AssertionError: if draft_dir is not a directory or the reference
            genome file does not exist.
        SystemExit: if Mauve or its jar file cannot be located.
    """
    parser = argparse.ArgumentParser(
        description=
        'A program to perform batched Mauve contig reordering (and someday more)'
    )
    ### general info
    parser.add_argument('--version',
                        '-V',
                        action='version',
                        version='%(prog)s {}.{}'.format(
                            SCRIPT_VERSION, SCRIPT_SUBVERSION))
    ### controls
    parser.add_argument(
        '--find_mauve',
        action='store_true',
        help="Search for known Mauve directories if it is not on the path")
    parser.add_argument(
        '--search_subdirectories',
        action='store_true',
        help=
        "Search for draft genome files in subdirectories of specified folder")
    parser.add_argument(
        '-wd',
        '--working_directory',
        help=
        'Working directory for Mauve to align assemblies. Will make a new subdirectory if not specified, starting with: '
        + _outputBase)
    parser.add_argument(
        '-rd',
        '--result_directory',
        help=
        'Result directory for reoriented assemblies. Will use top level of working directory if not specified.'
    )
    ### required
    parser.add_argument('draft_dir', help='Location of draft assemblies')
    parser.add_argument('reference_genome',
                        help='Reference genome file to orient towards')
    args = parser.parse_args()

    assert os.path.isdir(args.draft_dir), "Draft_dir is not a directory"
    assert os.path.isfile(
        args.reference_genome), "Reference genome file does not exist"
    if args.working_directory:
        working_dir = os.path.abspath(args.working_directory)
    else:
        working_dir = utilities.safeMakeOutputFolder(_outputBase)
    if args.result_directory:
        result_dir = os.path.abspath(args.result_directory)
    else:
        result_dir = working_dir

    mh = MauveHelper(args.find_mauve)
    if mh.mauve_dir is None:
        sys.exit("Cannot Find the Mauve path")
    elif not os.path.isfile(mh.mauve_dir + mauve_jar):
        sys.exit("Cannot Find the Mauve jar file. Searched on this path: " +
                 mh.mauve_dir)
    else:
        reorder_stats = []
        draft_genomes = NGS_data_utilities.listGenomeFilesWithNames(
            os.path.abspath(args.draft_dir), None, args.search_subdirectories,
            True)
        if len(draft_genomes) > 0:
            reference_path = os.path.abspath(args.reference_genome)
            for draft_file in draft_genomes['Filename']:
                print('starting with {}'.format(draft_file))
                reorder_stats.append(
                    mh.reorder_contigs(reference_path, draft_file,
                                       working_dir, result_dir))
        else:
            print("Found no genomes. Exiting")
        try:
            statTable = pd.DataFrame(reorder_stats)
            # sep/index belong to to_csv, not os.path.join; with them inside
            # the join call the save always failed with a TypeError.
            statTable.to_csv(os.path.join(result_dir, "reorderStats.tab"),
                             sep='\t',
                             index=False)
        except Exception as e:
            # Best effort: a failed save is reported, not fatal.
            print("Failure to save statistics...")
            print(e)
def main():
    """Command-line entry point: tabulate NGS data files found on disk.

    When at least one of --read_dir, --assembly_dir or --MiSeq_dir is given,
    only the given directories are listed. Otherwise the module-level default
    locations are scanned; the assembly table is additionally annotated with
    associated motif, BLAST summary and circlator files, motif files not tied
    to a listed assembly are written to 'Motif_Extra.txt', and the read
    mirror is listed. Standard output is replaced by a ``utilities.Logger``
    writing to the log file in the output directory.
    """
    epi_text = ('If no NGS data directory is given, will automatically scan the following directories recursively: \n' +
                '\tread directory: {}'.format(default_read_dir) +
                '\n\tassembly directory: {}'.format(default_ass_dir) +
                '\n\tMiSeq directories: {}'.format(','.join(BML_read_locations)))
    parser = argparse.ArgumentParser(description='A program collect information about NGS data files', epilog=epi_text)
    ### general info
    parser.add_argument('--version', '-V', action='version', version='%(prog)s {}.{}'.format(SCRIPT_VERSION, SCRIPT_SUBVERSION))
    ### controls (all optional)
    parser.add_argument('--assembly_dir', '-ad', help='Directory with assemblies')
    parser.add_argument('--read_dir', '-rd', help='Directory with reads')
    parser.add_argument('--out_dir', '-od', help='Output directory')
    parser.add_argument('--MiSeq_dir', '-md', help='Directory with MiSeq reads and sample sheets')
    parser.add_argument('--misname_file', '-mf', help='Excel spreadsheet with name corrections', default=default_misname_file, type=str)
    args = parser.parse_args()

    out_dir = args.out_dir if args.out_dir else utilities.safeMakeOutputFolder(_outputBase)
    logFile = os.path.join(out_dir, default_logfile)
    print("LogFile is : " + logFile)
    sys.stdout = utilities.Logger(logFile)
    print(_outputBase)
    ass_out = os.path.join(out_dir, assemblies_file)
    read_out = os.path.join(out_dir, reads_file)
    Mi_out = os.path.join(out_dir, MiSeq_files)
    mirror_out = os.path.join(out_dir, mirrored_reads)
    motif_out = os.path.join(out_dir, 'Motif_Extra.txt')

    # Placeholder: the name-correction spreadsheet is located but not used yet.
    if isinstance(args.misname_file, str):
        if os.path.exists(args.misname_file):
            pass

    def _is_existing_file(path):
        # Non-string entries (e.g. NaN from a failed extract) are not files.
        return isinstance(path, str) and os.path.isfile(path)

    if args.read_dir or args.assembly_dir or args.MiSeq_dir:
        if args.read_dir:
            listReadFilesWithNames(args.read_dir, outfile=read_out, read_extension=read_ext, verbose=False, doAssignReadSets=True)
        if args.assembly_dir:
            listGenomeFilesWithNames(args.assembly_dir, outfile=ass_out, deep_search=True, verbose=False)
        if args.MiSeq_dir:
            listReadsFromMiSeqToplevel(args.MiSeq_dir, outfile=Mi_out, read_extension=read_ext, verbose=False, doAssignReadSets=False)
    else:
        print("\nStarting BML MiSeq reads...")
        df = listReadsFromMiSeqToplevel(BML_read_locations, outfile=Mi_out, read_extension=read_ext, verbose=True, doAssignReadSets=False)
        print("\tReported {} records".format(len(df)))
        print("Starting BCFB reads...")
        df = listReadFilesWithNames(default_read_dir, outfile=read_out, read_extension=read_ext, verbose=False, doAssignReadSets=True)
        print("\tReported {} records".format(len(df)))

        ##Assemblies and associated files
        print("\nStarting BCFB assemblies...")
        df = listGenomeFilesWithNames(default_ass_dir, outfile=ass_out, deep_search=True, verbose=False)
        print("\tReported {} records".format(len(df)))

        # Remove only a literal '.fasta' suffix. str.rstrip('.fasta') strips
        # any trailing run of the characters '.', 'f', 'a', 's', 't' and so
        # mangled stems such as 'isolate_sa.fasta'.
        fasta_suffix = '.fasta'
        motif_base = df.Filename.apply(
            lambda name: name[:-len(fasta_suffix)]
            if name.endswith(fasta_suffix) else name)
        for i in motif_base.index:
            f = motif_base[i]
            matches = glob.glob(f + "*" + motif_ext)
            if matches:
                if len(matches) > 1:
                    print("Warning, found multiple motif files for {}. Selecting first one by glob.".format(f))
                # Take the first match, as the warning promises.
                motif_base[i] = matches[0]
        motif_exists = motif_base.apply(_is_existing_file)
        print("\tIdentified {} associated motif files".format(sum(motif_exists)))
        if sum(motif_exists) > 0:
            df.loc[motif_exists, 'Motif_Data'] = motif_base.loc[motif_exists]

        # expand=False yields a Series, so the suffixes below build Series of
        # paths that can be tested element by element.
        basepath = df.Filename.str.extract(r'(^.*\/[^.]*\.[^.]*)[^/]', expand=False)
        summary_files = basepath + '.summary'
        summary_exists = summary_files.apply(_is_existing_file)
        if sum(summary_exists) > 0:
            df.loc[summary_exists, 'BLAST_summary'] = summary_files.loc[summary_exists]
        circlator_files = basepath + '.circlator.json'
        circlator_exists = circlator_files.apply(_is_existing_file)
        if sum(circlator_exists) > 0:
            df.loc[circlator_exists, 'Circulator'] = circlator_files.loc[circlator_exists]
        df.to_csv(ass_out, sep='\t', index=False)  ## overwrite file from listGenomeFilesWithNames

        ##Get all motif for comparison
        motif_list = []
        for rootdir, _, files in os.walk(default_ass_dir):
            motif_list += [os.path.join(rootdir, x) for x in files if x.endswith(motif_ext)]
        # The Motif_Data column only exists when at least one motif file was
        # matched; a set keeps the membership test cheap.
        if 'Motif_Data' in df.columns:
            known_motifs = set(df['Motif_Data'].dropna())
        else:
            known_motifs = set()
        extra_motif = [x for x in motif_list if x not in known_motifs]
        if len(extra_motif) > 0:
            print("Found extra motif files. Saving list to {}".format(motif_out))
            with open(motif_out, 'wt') as fout:
                for f in extra_motif:
                    print(f, file=fout)

        ##Get the NCBS stuff
        ##Note, this contains some files that were deleted from our main data directory
        print("Starting BCFB mirrored reads...")
        NCBS_raw = listReadFilesWithNames(default_read_mirror, outfile=mirror_out, read_extension=read_ext, verbose=False, doAssignReadSets=True)
        print("\tReported {} mirrored reads".format(len(NCBS_raw)))