def seq_guess_and_write(seqs, filename):
    """Write ``seqs`` to ``filename`` in the format guessed for that file.

    The format and compression flag come from ``utilities.guessFileFormat``.
    When no format can be guessed, a message is printed and nothing is
    written.
    """
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return
    with utilities.flexible_handle(filename, compressed, 'wt') as handle:
        SeqIO.write(seqs, handle, seq_format)
def cleanupAndWrite(assembly_file,output_file,length=None,coverage=None,image_file=None,base_ID=None):
    """Filter the contigs of an assembly with ``cleanup_SKESA`` and save them.

    Note: no sanity checks are performed on the arguments.

    Parameters:
        assembly_file: path of the assembly to load; its format and
            compression flag are guessed by ``utilities.guessFileFormat``.
        output_file: path the retained contigs are written to. The file is
            opened with plain ``open``, so it is never compressed even when
            its name is guessed as compressed (a warning is printed).
        length: minimum length handed to ``cleanup_SKESA``; ``None`` -> 0.
        coverage: minimum coverage handed to ``cleanup_SKESA``; ``None`` -> 0.
        image_file: forwarded as ``export_contig_graph``.
        base_ID: when not ``None``, contigs are first renamed through
            ``seq_utilities.standardize_contig_names``.

    Returns:
        0 after the output file was written, 1 when ``cleanup_SKESA``
        returned ``None``.
    """
    ## Load the assembly
    in_format, in_compressed = utilities.guessFileFormat(assembly_file)
    out_format, out_compressed = utilities.guessFileFormat(output_file)
    if in_format != out_format:
        print("Warning on cleanup: input and output formats do not match ({} and {})".format(in_format, out_format))
    with utilities.flexible_handle(assembly_file, in_compressed, 'rt') as handle:
        seqs = list(SeqIO.parse(handle, in_format))

    ## Optional renaming; only the first element of the returned pair is used
    if base_ID is not None:
        seqs, _ = seq_utilities.standardize_contig_names(seqs, base_ID)

    ## Missing thresholds mean "no threshold"
    length = 0 if length is None else length
    coverage = 0 if coverage is None else coverage

    print("Removing low quality contigs from SKESA assembly. Length < {}; coverage < {}".format(length, coverage))
    ## The discard file name is derived from the output file name
    discard_file = utilities.appendToFilename(output_file, '_discarded')
    updated_seqs = cleanup_SKESA(
        seqs,
        minimum_length=length,
        minimum_coverage=coverage,
        discard_file=discard_file,
        export_contig_graph=image_file,
    )

    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1

    print("Retained {} of {} contigs.".format(len(updated_seqs), len(seqs)))
    with open(output_file, 'wt') as fout:
        SeqIO.write(updated_seqs, fout, out_format)
    print('Saved reoriented assembly at {}'.format(output_file))
    if out_compressed:
        print("Warning. Compression not implemented. The file extension is misleading")
    return 0
def seqs_guess_and_parse2list(filename):
    """Parse ``filename`` into a list of records using its guessed format.

    Returns the list of records produced by ``SeqIO.parse``, or ``None``
    (after printing a message) when ``utilities.guessFileFormat`` cannot
    determine a format.
    """
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, compressed, 'rt') as handle:
        return list(SeqIO.parse(handle, seq_format))
def seqs_guess_and_parse2dict(filename):
    """Parse ``filename`` into a dict of records using its guessed format.

    Returns the mapping built by ``SeqIO.to_dict``, or ``None`` (after
    printing a message) when ``utilities.guessFileFormat`` cannot determine
    a format.

    Raises:
        TypeError: if ``filename`` is not a ``str``.
    """
    if not isinstance(filename, str):
        raise TypeError("Filename must be string, is {}".format(type(filename)))
    # Note: the existence of the file is not verified here.
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, compressed, 'rt') as handle:
        records = SeqIO.parse(handle, seq_format)
        return SeqIO.to_dict(records)
def describeSequences(sequenceFile):
    """Collect simple size statistics for a sequence file.

    Returns a ``defaultdict(int)`` holding ``'FileSize'`` (bytes on disk, from
    ``os.path.getsize``) and, for every record parsed, running totals under
    ``'Sequences'`` (record count) and ``'Nucleotides'`` (sum of ``len(record)``).
    The two counters are only created once at least one record is seen. When
    the format cannot be guessed, a message is printed and only ``'FileSize'``
    is set.
    """
    stats = defaultdict(int)
    stats['FileSize'] = os.path.getsize(sequenceFile)

    ## guess and parse
    seq_format, compressed = utilities.guessFileFormat(sequenceFile)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + sequenceFile)
        return stats

    with utilities.flexible_handle(sequenceFile, compressed, 'rt') as handle:
        for record in SeqIO.parse(handle, seq_format):
            stats['Sequences'] += 1
            stats['Nucleotides'] += len(record)
    ##TODO: add Q30 and such?
    return stats
def setupGenomeForBlastBasedExtraction(genome_name,genome_file,tempDir,file_format = '',is_compressed = None):
    """Prepare a genome for BLAST-based sequence extraction.

    Exports the genome as FASTA into ``tempDir`` under a name built from
    ``genome_name``, loads the sequences of the original file into memory, and
    builds a BLAST database from the exported FASTA.

    Parameters:
        genome_name: label for the genome; also the base name of the temporary
            FASTA file and BLAST database.
        genome_file: path of the original genome file.
        tempDir: directory receiving the temporary FASTA and the database.
        file_format: format string passed to ``genomeOrganizer.exportGenomeFASTA``
            and to ``SeqIO.parse``.
        is_compressed: compression flag passed to
            ``genomeOrganizer.exportGenomeFASTA`` and ``utilities.flexible_handle``.

    Returns:
        dict with keys ``'name'``, ``'original'``, ``'fasta'``, ``'seqs'``
        (record id -> record) and ``'db'`` (BLAST database path).

    Raises:
        IOError: if the exported FASTA file does not exist.
        ValueError: if no sequences were parsed from the genome.
    """
    ## Genome information
    genomeInfo = dict()
    genomeInfo['name'] = genome_name
    genomeInfo['original'] = genome_file  # just for reporting

    ## Some people use weird genome filenames, so copy to a name without special characters
    temp_genome = os.path.join(tempDir, genome_name + '.fasta')
    genomeOrganizer.exportGenomeFASTA(genome_file, temp_genome, file_format, is_compressed)
    genomeInfo['fasta'] = temp_genome
    if not os.path.isfile(genomeInfo['fasta']):
        raise IOError("Illegitimate file at " + genomeInfo['fasta'])

    ## Load the sequences of the original file. The context manager guarantees
    ## the handle is closed even if parsing fails or the emptiness check below
    ## raises (previously the handle leaked on those paths).
    with utilities.flexible_handle(genomeInfo['original'], is_compressed, 'rt') as genome_handle:
        genomeInfo['seqs'] = SeqIO.to_dict(SeqIO.parse(genome_handle, file_format))

    print("{} bp in {} contig(s)".format(
        sum(len(c) for c in genomeInfo['seqs'].values()),
        len(genomeInfo['seqs'])))
    if len(genomeInfo['seqs']) == 0:
        raise ValueError("No sequences parsed from file {}".format(genomeInfo['fasta']))

    ## Make the search database for the genome
    db_base = os.path.basename(genomeInfo['fasta'])
    genomeInfo['db'] = os.path.join(tempDir, db_base)
    makeblastdb(genomeInfo['fasta'], genomeInfo['db'])
    return genomeInfo
def cleanupAndWrite(assembly_file,
                    output_file,
                    circle_new_start=None,
                    reverse_contig=None,
                    closed_circle=None,
                    broken_circle=None,
                    circularize_with_Ns=0,
                    length=None,
                    coverage=None,
                    report_file=None,
                    reference=None,
                    assembler=None,
                    working_dir=None):
    """Load an assembly, shift/filter/reorient its contigs, and write the result.

    Note: no sanity checks are performed on the arguments.

    Two modes are selected by the arguments:

    * Explicit shift -- ``circle_new_start`` or ``reverse_contig`` is truthy.
      The assembly must not hold more than one contig. The first contig is
      passed to ``shiftCirclarChromosome`` with ``N_padding=0`` when
      ``closed_circle`` is truthy or ``N_padding=-1`` when ``broken_circle``
      is truthy. If neither is set, only a message is printed and the call
      ends with return code 1.
    * Otherwise -- unless ``closed_circle``/``broken_circle`` is truthy, the
      contigs are filtered (by length only when ``assembler`` is ``None``, via
      ``cleanup_SPADES`` when ``assembler`` is ``'SPADES'`` in any case). Then,
      if ``reference`` names an existing file, the contigs are reoriented with
      ``reorientClosedChromosome`` (circular) or ``reorientContigs``
      (non-circular).

    Parameters:
        assembly_file: path of the assembly; format and compression flag are
            guessed by ``utilities.guessFileFormat``.
        output_file: path written with plain ``open``; never compressed, even
            when its name is guessed as compressed (a warning is printed).
        circle_new_start, reverse_contig: forwarded to
            ``shiftCirclarChromosome``; either being truthy selects the
            explicit-shift mode.
        closed_circle, broken_circle: flags describing a circular contig.
        circularize_with_Ns: values > 0 only produce a
            'Scaffolding not implemented' message.
        length, coverage: minimum thresholds for filtering; ``None`` -> 0.
        report_file: used by the SPAdes filtering path, both to derive the
            discard file location and as ``export_contig_data``.
        reference: path of a reference used for reorientation.
        assembler: ``None`` or ``'SPADES'`` (case-insensitive); anything else
            is an error for non-circular assemblies.
        working_dir: passed to the reorientation helpers; for non-circular
            assemblies defaults to ``output_file`` without its extension.

    Returns:
        0 after the output file was written, 1 on any failure (a message is
        printed).
    """
    ##Note: no sanity checks
    ## Load the assemblies
    assembly_format, assembly_compressed = utilities.guessFileFormat(
        assembly_file)
    output_format, output_compressed = utilities.guessFileFormat(output_file)
    if assembly_format != output_format:
        print(
            "Warning on cleanup: input and output formats do not match ({} and {})"
            .format(assembly_format, output_format))
    with utilities.flexible_handle(assembly_file, assembly_compressed,
                                   'rt') as fin:
        seqs = [c for c in SeqIO.parse(fin, assembly_format)]
    # Precise manipulation of a single contig
    updated_seqs = None  # stays None on every path that fails to produce output
    if circle_new_start or reverse_contig:
        if len(seqs) > 1:
            print(
                "Error: User provided explicit reorientation instructions for a contig, but multiple contigs are present in assembly: \n"
                + assembly_file)
            return 1
        elif closed_circle:
            print("Shifting closed circle...")
            # NOTE(review): seqs[0] raises IndexError if the assembly is empty
            updated_seqs = shiftCirclarChromosome(seqs[0],
                                                  circle_new_start,
                                                  reverse_contig,
                                                  N_padding=0)
        elif broken_circle:
            print("Shifting broken circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0],
                                                  circle_new_start,
                                                  reverse_contig,
                                                  N_padding=-1)
        elif circularize_with_Ns > 0:
            # updated_seqs stays None, so the function returns 1 below
            print('Scaffolding not implemented')
        else:
            # updated_seqs stays None, so the function returns 1 below
            print(
                'To shift a chromosome, you must specify whether the circle is closed or broken'
            )
    else:  ## Complex criteria for manipulation
        if closed_circle and len(seqs) > 1:
            print(
                "Warning: Untested parameters. User specified 'closed circle' but multiple contigs are present in assembly"
            )
        ## Remove the low-quality contigs:
        ##TODO: consider if another parameter should be passed. At least specify if it came from SPAdes
        circular = closed_circle or broken_circle  ##Circles imply high-quality sequence
        if not circular:
            # Missing thresholds mean "no threshold"
            if length is None:
                length = 0
            if coverage is None:
                coverage = 0
            if assembler is None:
                print("Removing short contigs from assembly.")
                # Strictly longer than `length`; coverage is not used on this path
                updated_seqs = [x for x in seqs if len(x) > length]
            elif assembler.upper() == 'SPADES':
                print(
                    "Removing low quality contigs from SPADES assembly. Length < {}; coverage < {}"
                    .format(length, coverage))
                # Discarded contigs go next to the report file, named after the assembly.
                # NOTE(review): os.path.dirname(None) raises TypeError, so
                # report_file must be provided on this path.
                raw_filename = os.path.join(os.path.dirname(report_file),
                                            os.path.basename(assembly_file))
                image_file = None  # utilities.setExt(raw_filename, 'png') ##Note: this has been moved to the calculateStats routine
                discard_file = utilities.appendToFilename(
                    raw_filename,
                    '_discarded')  ##ext is same as assembly file
                updated_seqs = cleanup_SPADES(seqs,
                                              minimum_length=length,
                                              minimum_coverage=coverage,
                                              export_contig_data=report_file,
                                              discard_file=discard_file,
                                              export_contig_graph=image_file)
            else:
                print(
                    "Error: assembler ({}) unknown for non-circular assembly. Not attempting to cleanup contigs in file: \n{}"
                    .format(assembler, assembly_file))
                return 1
        ## Reorient to reference if requested
        # NOTE(review): the nesting of this step was reconstructed; it is kept
        # inside the `else` branch because `circular` is only bound there --
        # confirm against the original file.
        if reference:
            # Work on the filtered contigs when filtering produced any result
            input_seqs = updated_seqs if updated_seqs is not None else seqs
            if os.path.isfile(reference):
                if circular:
                    if len(input_seqs) > 1:
                        print(
                            'Warning: multiple contigs in "circular" assembly. Only one contig will be reoriented and I cannot tell you which one. Untested.'
                        )
                    if len(input_seqs) > 0:
                        N_padding = -1  ##Do not religate
                        if closed_circle:
                            N_padding = 0
                        elif circularize_with_Ns > 0:
                            print('Scaffolding not implemented')
                            return 1
                        print(
                            "Reorienting circular chromosome to reference...")
                        updated_seqs = reorientClosedChromosome(
                            input_seqs,
                            reference,
                            N_padding=N_padding,
                            working_dir=working_dir
                        )  #Note: only treated as closed if N_padding >= 0
                    else:  ## Len == 0
                        print(
                            "None of {} contigs passed your exclusion criteria. Exiting "
                            .format(len(seqs)))
                        return 1
                else:
                    if working_dir is None:
                        working_dir = os.path.splitext(output_file)[0]
                    draft_name = os.path.splitext(
                        os.path.basename(assembly_file))[0]
                    print("Reorienting contigs")
                    reorder_stats = reorientContigs(
                        input_seqs,
                        reference,
                        working_dir,
                        name=draft_name,
                        input_format=assembly_format
                    )  ##Will be genbank format
                    if isinstance(reorder_stats,
                                  dict) and ('ReorderedDraft'
                                             in reorder_stats):
                        updated_seqs = seq_utilities.seqs_guess_and_parse2list(
                            reorder_stats['ReorderedDraft']
                        )  ##Excessive to reload... but it fits in this flow
                    else:
                        # Reordering failed; discard any earlier filtering result
                        updated_seqs = None
            else:
                # Only a message: updated_seqs keeps whatever filtering produced
                print(
                    "Unable to realign to reference because there is no refernce file: {}"
                    .format(reference))
    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1
    else:
        # Plain open(): the output is written uncompressed regardless of its name
        with open(output_file, 'wt') as fout:
            SeqIO.write(updated_seqs, fout, output_format)
        print('Saved cleaned assembly at {}'.format(output_file))
        if output_compressed:
            print(
                "Warning. Compression not implemented. The file extension is misleading"
            )
    return 0