def run(self):
    """Build ``<prefix>.genome``, a zip archive describing the given genome.

    Every entry of ``self.filenames`` is classified as a sequence file
    and/or an annotation file. The method then writes, inside the directory
    ``self.prefix`` (created if missing):

    * ``property.txt`` -- key=value description of the genome,
    * ``<base>.gff`` -- produced by ``trivia.As_gff`` from the annotation
      files, excluding ``gene`` and ``source`` features,
    * ``<base>_cytoband.txt`` -- one ``name<TAB>0<TAB>length`` line per
      sequence,
    * ``<name>.txt`` -- the raw sequence, one file per sequence.

    The first three files are zipped (``zip -j``) into
    ``self.prefix + '.genome'`` and then deleted; the per-sequence ``.txt``
    files stay in the directory.

    Raises:
        AssertionError: if a file is neither a sequence nor an annotation
            file, or a sequence name contains ``/``.
    """
    base = os.path.split(self.prefix)[1]

    annotations = []
    sequences = []
    for filename in self.filenames:
        # A file may legitimately be both a sequence and an annotation file.
        recognized = False
        if io.is_sequence_file(filename):
            sequences.append(filename)
            recognized = True
        if annotation.is_annotation_file(filename):
            annotations.append(filename)
            recognized = True
        assert recognized, 'File is neither a recognized sequence or annotation file'

    cytoband_filename = os.path.join(self.prefix, base + '_cytoband.txt')
    property_filename = os.path.join(self.prefix, 'property.txt')
    gff_filename = os.path.join(self.prefix, base + '.gff')
    output_filenames = [cytoband_filename, property_filename, gff_filename]

    if not os.path.exists(self.prefix):
        os.mkdir(self.prefix)

    # ``with`` guarantees the handle is closed even if a write fails.
    with open(property_filename, 'wb') as f:
        f.write('ordered=true\n')
        f.write('id=%s\n' % base)
        f.write('name=%s\n' % (self.name or base))
        f.write('cytobandFile=%s_cytoband.txt\n' % base)
        f.write('geneFile=%s.gff\n' % base)
        f.write('sequenceLocation=%s\n' % base)

    trivia.As_gff(
        output=gff_filename,
        filenames=annotations,
        exclude=['gene', 'source'],
    ).run()

    with open(cytoband_filename, 'wb') as f_cyt:
        for filename in sequences:
            for name, seq in io.read_sequences(filename):
                # The name becomes a file name, so it must not contain a
                # path separator.
                assert '/' not in name
                with open(os.path.join(self.prefix, name + '.txt'), 'wb') as f_seq:
                    f_seq.write(seq)
                f_cyt.write('%s\t0\t%d\n' % (name, len(seq)))

    genome_filename = self.prefix + '.genome'
    # Remove any stale archive first so zip does not update it in place.
    if os.path.exists(genome_filename):
        os.unlink(genome_filename)
    io.execute(
        ['zip', '-j', io.abspath(genome_filename)]
        + [io.abspath(item) for item in output_filenames]
    )

    # The zipped files are now redundant copies.
    for filename in output_filenames:
        if os.path.exists(filename):
            os.unlink(filename)
def open_bam(filename):
    """Return the stdout stream of ``samtools view -h`` run on *filename*.

    The ``-h`` flag makes samtools include the header lines in its output.
    Only the stream is returned; the process object created by ``io.run``
    is not handed back to the caller.
    """
    command = ['samtools', 'view', '-h', io.abspath(filename)]
    return io.run(command).stdout
def run(self):
    """Import an existing SAM/BAM file into the working directory.

    Sets up the workspace reference, records ``snp_cost``, and produces a
    name-sorted BAM under the ``alignments`` prefix in ``self.output_dir``.
    When ``self.input`` is not already BAM (per ``sam.is_bam``) it is first
    streamed through ``samtools view -S -b -`` into ``temp.bam``, which is
    deleted after sorting.
    """
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost=self.snp_cost)

    bam_prefix = io.abspath(self.output_dir, "alignments")

    if sam.is_bam(self.input):
        sort_input_filename = self.input
        temp_filename = None
    else:
        temp_filename = io.abspath(self.output_dir, "temp.bam")
        sort_input_filename = temp_filename

        writer = io.Pipe_writer(temp_filename, ["samtools", "view", "-S", "-b", "-"])
        try:
            # Copy in 1 MiB chunks; the ``with`` block closes the input even
            # if the writer fails part-way through.
            with open(self.input, "rb") as f:
                for data in iter(lambda: f.read(1 << 20), b""):
                    writer.write(data)
        finally:
            # Always close the pipe so the samtools process is not left
            # waiting on its stdin.
            writer.close()

    grace.status("Sort")
    sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)

    if temp_filename is not None:
        os.unlink(temp_filename)

    grace.status("")
def bam_headers(filename):
    """Return the header text of *filename* as printed by ``samtools view -H``.

    Raises:
        AssertionError: if samtools exits with a non-zero status.
    """
    process = io.run(['samtools', 'view', '-H', io.abspath(filename)])
    headers = process.stdout.read()

    # Call wait() outside the assert: assert statements are stripped under
    # ``python -O``, which would otherwise leave the child process unreaped.
    status = process.wait()
    assert status == 0, '"samtools view -H ..." failed'

    return headers
def run(self):
    """Import an existing SAM/BAM file into the working directory.

    Sets up the workspace reference, records ``snp_cost``, and produces a
    name-sorted BAM under the ``alignments`` prefix in ``self.output_dir``.
    Input that ``sam.is_bam`` does not recognise as BAM is first piped
    through ``samtools view -S -b -`` into ``temp.bam``; that temporary file
    is removed once sorting has finished.
    """
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost=self.snp_cost)

    bam_prefix = io.abspath(self.output_dir, 'alignments')

    if sam.is_bam(self.input):
        sort_input_filename = self.input
        temp_filename = None
    else:
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        sort_input_filename = temp_filename

        writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
        try:
            with open(self.input, 'rb') as f:
                while True:
                    data = f.read(1 << 20)  # 1 MiB per chunk
                    if not data:
                        break
                    writer.write(data)
        finally:
            # Close the pipe even on failure so samtools sees end-of-input
            # instead of being left blocked on stdin.
            writer.close()

    grace.status('Sort')
    sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)

    if temp_filename is not None:
        os.unlink(temp_filename)

    grace.status('')
def run(self):
    """Import an existing SAM/BAM file into the working directory.

    Sets up the workspace reference, records ``snp_cost``, and runs
    ``samtools sort -n`` to write a name-sorted BAM under the ``alignments``
    prefix in ``self.output_dir``. Input that ``sam.is_bam`` does not
    recognise as BAM is first piped through ``samtools view -S -b -`` into
    ``temp.bam``; that temporary file is removed after sorting.
    """
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost=self.snp_cost)

    bam_prefix = io.abspath(self.output_dir, 'alignments')

    if sam.is_bam(self.input):
        sort_input_filename = self.input
        temp_filename = None
    else:
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        sort_input_filename = temp_filename

        writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
        try:
            # The ``with`` block closes the input file on every exit path.
            with open(self.input, 'rb') as f:
                chunk = f.read(1 << 20)
                while chunk:
                    writer.write(chunk)
                    chunk = f.read(1 << 20)
        finally:
            # Close the pipe even on failure so the samtools process is not
            # left waiting for more input.
            writer.close()

    grace.status('Sort')
    io.execute(['samtools', 'sort', '-n', sort_input_filename, bam_prefix])

    if temp_filename is not None:
        os.unlink(temp_filename)

    grace.status('')
def __init__(self, filename):
    """Open *filename* for reading, decoding BAM through samtools if needed.

    After construction ``self.file`` is the stream to read from and
    ``self.process`` is the ``samtools view`` process for BAM input, or
    ``None`` when the file is opened directly.

    Raises:
        AssertionError: if *filename* does not exist.
    """
    assert os.path.exists(filename), filename + ' does not exist'

    if not is_bam(filename):
        # Not BAM: read it directly, no helper process involved.
        self.process = None
        self.file = io.open_possibly_compressed_file(filename)
        return

    # BAM: let samtools decode it and read from the process's stdout.
    # (An earlier hack that wrapped stdout in io.process_buffer is disabled.)
    self.process = io.run(['samtools', 'view', io.abspath(filename)])
    self.file = self.process.stdout
def run(self):
    """Align every configured read set with SHRiMP and write a name-sorted BAM.

    Steps, in order:

    1. Require SHRiMP 2 and samtools, and check that references and at
       least one kind of reads were supplied.
    2. Set up the workspace reference and record ``snp_cost=25``.
    3. Run one SHRiMP command per read set, streaming its SAM output into
       ``temp.bam``; SHRiMP's stderr goes to ``shrimp_log.txt``.
    4. Sort ``temp.bam`` by read name into the ``alignments`` prefix and
       delete the temporary file.

    Raises:
        AssertionError: if references or reads are missing, or a ``pairs``
            entry does not contain exactly two files.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # One (filenames, is_paired) entry per SHRiMP invocation.
    read_sets = []
    read_sets.extend(([item], False) for item in self.reads)
    read_sets.extend((item, True) for item in self.pairs)
    read_sets.extend(([item], True) for item in self.interleaved)

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    # Result unused; the call is kept in case it has side effects that are
    # not visible from this method.
    reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Flags appended to the user's options unless already given there.
    # A value of None marks a flag that takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')

    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them
    # (Python 2 has no ``nonlocal``).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Forward SAM lines to the BAM writer. Header lines ('@...') are
        # only passed on from the first command; later ones are dropped.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip pairing flags: '-p' and '-I' are removed together with
        # their argument, '--half-paired' on its own.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    try:
        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A 3-item first record is taken to mean qualities are present.
            # NOTE(review): an empty reads file makes next() raise
            # StopIteration inside this generator expression; confirm the
            # intended handling.
            has_qualities = all(
                len(next(io.read_sequences(filename, qualities=True))) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            if '--qv-offset' not in self.shrimp_options:
                default_options['--qv-offset'] = str(
                    io.guess_quality_offset(*filenames))

            read_group = workspace.name.replace(',', '_')
            default_options['--read-group'] = '%s,%s' % (read_group, read_group)

            for flag, value in default_options.items():
                if flag not in options:
                    options.append(flag)
                    if value is not None:
                        options.append(value)

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            sys.stderr.write('Running ' + ' '.join(full_param) + '\n')

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)
    finally:
        # Release the log file and the BAM writer even when a SHRiMP run
        # fails part-way through.
        log_file.close()
        sam_eater.close()

    grace.status('Sort')

    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

    os.unlink(temp_filename)

    grace.status('')
def absolutize(filename):
    """Return *filename* with ``options.prefix`` prepended, or made absolute.

    When ``options.prefix`` is set it is concatenated in front of *filename*
    as plain text (no path separator is inserted); otherwise the result is
    ``io.abspath(filename)``.
    """
    if options.prefix is None:
        return io.abspath(filename)
    return options.prefix + filename
def run(self):
    """Align every configured read set with SHRiMP and write a name-sorted BAM.

    Steps, in order:

    1. Require SHRiMP 2 and samtools, and check that references and at
       least one kind of reads were supplied.
    2. Set up the workspace reference and record ``snp_cost=25``.
    3. Run one SHRiMP command per read set, streaming its SAM output into
       ``temp.bam``; SHRiMP's stderr goes to ``shrimp_log.txt``.
    4. Run ``samtools sort -n`` from ``temp.bam`` into the ``alignments``
       prefix and delete the temporary file.

    Raises:
        AssertionError: if references or reads are missing, a ``pairs``
            entry does not contain exactly two files, or the files of one
            read set yield different quality-offset guesses.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, "No reference sequences given"
    assert self.reads or self.pairs or self.interleaved, "No reads given"
    for pair in self.pairs:
        assert len(pair) == 2, "Two files required in each pair: section"

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # One (filenames, is_paired) entry per SHRiMP invocation.
    read_sets = []
    read_sets.extend(([item], False) for item in self.reads)
    read_sets.extend((item, True) for item in self.pairs)
    read_sets.extend(([item], True) for item in self.interleaved)

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    # Result unused; the call is kept in case it has side effects that are
    # not visible from this method.
    reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Flags appended to the user's options unless already given there.
    # A value of None marks a flag that takes no argument.
    default_options = {
        "-E": None,
        "-T": None,
        "-N": str(cores),
        "-n": "2",
        "-w": "200%",
        "-p": "opp-in",
        "-I": "0,500",
        "-X": None,
    }
    if self.sam_unaligned:
        default_options["--sam-unaligned"] = None
    if self.half_paired:
        default_options["--half-paired"] = None
    else:
        default_options["--no-half-paired"] = None

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, "alignments")
    temp_filename = io.abspath(self.output_dir, "temp.bam")
    log_filename = io.abspath(self.output_dir, "shrimp_log.txt")

    log_file = open(log_filename, "wb")
    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them
    # (Python 2 has no ``nonlocal``).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Forward SAM lines to the BAM writer. Header lines ('@...') are
        # only passed on from the first command; later ones are dropped.
        for line in f:
            if line.startswith("@"):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip pairing flags: '-p' and '-I' are removed together with
        # their argument, '--half-paired' on its own.
        for flag in ["-p", "-I"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ["--half-paired"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    try:
        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A 3-item first record is taken to mean qualities are present.
            # NOTE(review): an empty reads file makes next() raise
            # StopIteration inside this generator expression; confirm the
            # intended handling.
            has_qualities = all(
                len(next(io.read_sequences(filename, qualities=True))) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append("--fastq")

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

            if "--qv-offset" not in self.shrimp_options:
                # Every file of the read set must agree on the offset.
                guesses = [io.guess_quality_offset(filename) for filename in filenames]
                assert (
                    len(set(guesses)) == 1
                ), "Conflicting quality offset guesses, please specify --qv-offset manually."
                default_options["--qv-offset"] = str(guesses[0])

            read_group = workspace.name.replace(",", "_")
            default_options["--read-group"] = "%s,%s" % (read_group, read_group)

            for flag, value in default_options.items():
                if flag not in options:
                    options.append(flag)
                    if value is not None:
                        options.append(value)

            if not is_paired:
                options = remove_pair_options(options)

            grace.status("")

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            sys.stderr.write("Running " + " ".join(full_param) + "\n")

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)
    finally:
        # Release the log file and the BAM writer even when a SHRiMP run
        # fails part-way through.
        log_file.close()
        sam_eater.close()

    grace.status("Sort")

    io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status("")
def run(self):
    """Align every configured read set with SHRiMP and write a name-sorted BAM.

    Steps, in order:

    1. Require SHRiMP 2 and samtools, and check that references and at
       least one kind of reads were supplied.
    2. Set up the workspace reference and record ``snp_cost=25``.
    3. Run one SHRiMP command per read set, streaming its SAM output into
       ``temp.bam``; SHRiMP's stderr goes to ``shrimp_log.txt``.
    4. Sort ``temp.bam`` by read name into the ``alignments`` prefix and
       delete the temporary file.

    Raises:
        AssertionError: if references or reads are missing, or a ``pairs``
            entry does not contain exactly two files.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    # One (filenames, is_paired) entry per SHRiMP invocation.
    read_sets = [([item], False) for item in self.reads]
    read_sets += [(item, True) for item in self.pairs]
    read_sets += [([item], True) for item in self.interleaved]

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    # Result unused; the call is kept in case it has side effects that are
    # not visible from this method.
    reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    # Flags appended to the user's options unless already given there.
    # A value of None marks a flag that takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')

    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them
    # (Python 2 has no ``nonlocal``).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        # Forward SAM lines to the BAM writer. Header lines ('@...') are
        # only passed on from the first command; later ones are dropped.
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip pairing flags: '-p' and '-I' are removed together with
        # their argument, '--half-paired' on its own.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    try:
        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A 3-item first record is taken to mean qualities are present.
            # NOTE(review): an empty reads file makes next() raise
            # StopIteration inside this generator expression; confirm the
            # intended handling.
            has_qualities = all(
                len(next(io.read_sequences(filename, qualities=True))) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            if '--qv-offset' not in self.shrimp_options:
                default_options['--qv-offset'] = str(
                    io.guess_quality_offset(*filenames))

            read_group = workspace.name.replace(',', '_')
            default_options['--read-group'] = '%s,%s' % (read_group, read_group)

            for flag, value in default_options.items():
                if flag not in options:
                    options.append(flag)
                    if value is not None:
                        options.append(value)

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            sys.stderr.write('Running ' + ' '.join(full_param) + '\n')

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)
    finally:
        # Release the log file and the BAM writer even when a SHRiMP run
        # fails part-way through.
        log_file.close()
        sam_eater.close()

    grace.status('Sort')

    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

    os.unlink(temp_filename)

    grace.status('')
def run(self):
    """Align every configured read set with SHRiMP and write a name-sorted BAM.

    Steps, in order:

    1. Require SHRiMP 2 and samtools, and check that references and at
       least one kind of reads were supplied.
    2. Set up the workspace reference.
    3. Unless the user passed ``--qv-offset``, guess one quality offset
       shared by all read files.
    4. Run one SHRiMP command per read set, streaming its SAM output into
       ``temp.bam``; SHRiMP's stderr goes to ``shrimp_log.txt``.
    5. Run ``samtools sort -n`` from ``temp.bam`` into the ``alignments``
       prefix and delete the temporary file.

    Raises:
        AssertionError: if references or reads are missing, a ``pairs``
            entry does not contain exactly two files, the quality-offset
            guesses disagree, or a SHRiMP process exits non-zero.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    # One (filenames, is_paired) entry per SHRiMP invocation.
    read_sets = []
    read_sets.extend(([item], False) for item in self.reads)
    read_sets.extend((item, True) for item in self.pairs)
    read_sets.extend(([item], True) for item in self.interleaved)

    # Flags appended to the user's options unless already given there.
    # A value of None marks a flag that takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(grace.how_many_cpus()),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    reference = workspace.get_reference()
    # Result unused; the call is kept in case it has side effects that are
    # not visible from this method.
    reference.reference_fasta_filename()

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')

    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them
    # (Python 2 has no ``nonlocal``).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(process):
        # Forward SAM lines to the BAM writer. Header lines ('@...') are
        # only passed on from the first command; later ones are dropped.
        for line in process.stdout:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)

        # Call wait() outside the assert so the child is always reaped, even
        # when asserts are stripped by ``python -O``.
        status = process.wait()
        assert status == 0, 'shrimp failed'
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip pairing flags: '-p' and '-I' are removed together with
        # their argument, '--half-paired' on its own.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    try:
        if '--qv-offset' not in self.shrimp_options:
            # All read files, across all read sets, must agree on the offset.
            guesses = [
                io.guess_quality_offset(filename)
                for filenames, is_paired in read_sets
                for filename in filenames
            ]
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A 3-item first record (name, seq, qual) means qualities exist.
            # NOTE(review): an empty reads file makes next() raise
            # StopIteration inside this generator expression; confirm the
            # intended handling.
            has_qualities = all(
                len(next(io.read_sequences(filename, qualities=True))) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            for flag, value in default_options.items():
                if flag not in options:
                    options.append(flag)
                    if value is not None:
                        options.append(value)

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            sys.stderr.write('Running ' + ' '.join(full_param) + '\n')

            p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
            eat(p)
    finally:
        # Release the log file and the BAM writer even when a SHRiMP run
        # fails part-way through.
        log_file.close()
        sam_eater.close()

    grace.status('Sort')

    io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status('')
def run(self):
    """Align every configured read set with SHRiMP and write a name-sorted BAM.

    Steps, in order:

    1. Require SHRiMP 2 and samtools, and check that references and at
       least one kind of reads were supplied.
    2. Set up the workspace reference.
    3. Unless the user passed ``--qv-offset``, guess one quality offset
       shared by all read files.
    4. Run one SHRiMP command per read set, streaming its SAM output into
       ``temp.bam``; SHRiMP's stderr goes to ``shrimp_log.txt``.
    5. Run ``samtools sort -n`` from ``temp.bam`` into the ``alignments``
       prefix and delete the temporary file.

    Raises:
        AssertionError: if references or reads are missing, a ``pairs``
            entry does not contain exactly two files, the quality-offset
            guesses disagree, or a SHRiMP process exits non-zero.
    """
    grace.require_shrimp_2()
    grace.require_samtools()

    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    # One (filenames, is_paired) entry per SHRiMP invocation.
    read_sets = [([item], False) for item in self.reads]
    read_sets += [(item, True) for item in self.pairs]
    read_sets += [([item], True) for item in self.interleaved]

    # Flags appended to the user's options unless already given there.
    # A value of None marks a flag that takes no argument.
    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(grace.how_many_cpus()),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }
    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None
    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    reference = workspace.get_reference()
    # Result unused; the call is kept in case it has side effects that are
    # not visible from this method.
    reference.reference_fasta_filename()

    # Run shrimp
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    temp_filename = io.abspath(self.output_dir, 'temp.bam')
    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')

    log_file = open(log_filename, 'wb')
    sam_eater = sam.Bam_writer(temp_filename)

    # One-element lists so the nested function can update them
    # (Python 2 has no ``nonlocal``).
    sam_header_sent = [False]
    n_seen = [0]

    def eat(process):
        # Forward SAM lines to the BAM writer. Header lines ('@...') are
        # only passed on from the first command; later ones are dropped.
        for line in process.stdout:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)

        # Call wait() outside the assert so the child is always reaped, even
        # when asserts are stripped by ``python -O``.
        status = process.wait()
        assert status == 0, 'shrimp failed'
        sam_header_sent[0] = True

    def remove_pair_options(options):
        # Strip pairing flags: '-p' and '-I' are removed together with
        # their argument, '--half-paired' on its own.
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    try:
        if '--qv-offset' not in self.shrimp_options:
            # All read files, across all read sets, must agree on the offset.
            guesses = []
            for filenames, is_paired in read_sets:
                guesses.extend(io.guess_quality_offset(filename) for filename in filenames)
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            # A 3-item first record (name, seq, qual) means qualities exist.
            # NOTE(review): an empty reads file makes next() raise
            # StopIteration inside this generator expression; confirm the
            # intended handling.
            has_qualities = all(
                len(next(io.read_sequences(filename, qualities=True))) == 3
                for filename in filenames
            )
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            for flag, value in default_options.items():
                if flag not in options:
                    options.append(flag)
                    if value is not None:
                        options.append(value)

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            sys.stderr.write('Running ' + ' '.join(full_param) + '\n')

            p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
            eat(p)
    finally:
        # Release the log file and the BAM writer even when a SHRiMP run
        # fails part-way through.
        log_file.close()
        sam_eater.close()

    grace.status('Sort')

    io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status('')