def test_fasta_get_sequence(self):
    f = parser.FastaIterator(self.handle, index=self.index)
    out = f.get_sequence('c1', 5, 30)
    digest = hashlib.sha224(out).hexdigest()
    self.assertEqual(
        'ddb5a96ada0f651bffeb8ef856c76faf610ca669a68be904b0acb8b8', digest,
        "Fasta get_sequence #1 Failure")
    f.fasta_file.close()
def test_fasta_index_build(self):
    f = parser.FastaIterator(self.handle)
    f.build_fasta_index()
    # read the index inside a with-block so the handle is not leaked
    with open(self.index, 'rb') as index_handle:
        out = '\n'.join([row.strip() for row in index_handle])
    digest = hashlib.sha224(out).hexdigest()
    self.assertEqual(
        'e071a4ec04e59d55231dc667e06b81b17d96fad0d40fe2ac883e9fe3', digest,
        "Fasta Index Build Failure")
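
# A minimal sketch of why a FASTA index makes get_sequence a seek instead of
# a file scan. This assumes a samtools faidx-style layout (record offset,
# bases per line, bytes per line); the actual index written by
# parser.FastaIterator may differ.
def _indexed_lookup_demo(offset, line_bases, line_width, start):
    # byte position of the 0-based base `start` within an indexed record
    return offset + (start // line_bases) * line_width + (start % line_bases)

# with sequence data starting at byte 4, 60 bases per line, and 61 bytes per
# line (newline included), base 130 lives at byte 4 + 2*61 + 10 = 136
assert _indexed_lookup_demo(4, 60, 61, 130) == 136
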
def main():
    args = parser.parse_args()
    file_name = args.fasta
    vcf = args.vcf
    snps = args.no_snps
    dels = args.dels
    ins = args.ins
    homs = args.no_homozygous
    hets = args.heterozygous
    individual = args.individual - 1
    fasta_file = fasta.FastaIterator(file_name)
    vcf_file = gp.VCFIterator(vcf)
    # store our vcf entries first, keyed by chromosome and 0-based position
    entries = {}
    to_append = 'chr' if args.append_chromosome else ''
    for info in vcf_file:
        checked = False
        valid_variant = False
        if homs:
            if info.is_homozygous()[individual]:
                if ((snps and not info.has_snp(individual=individual)) and
                        (dels and not info.has_deletion(individual=individual)) and
                        (ins and not info.has_insertion(individual=individual))):
                    checked = True
                    continue
                valid_variant = True
                try:
                    entries['%s%s' % (to_append, info.chrom)][int(info.pos) - 1] = info
                except KeyError:
                    entries['%s%s' % (to_append, info.chrom)] = {int(info.pos) - 1: info}
        if hets:
            if info.is_heterozygous()[individual]:
                if ((not valid_variant and not checked) and
                        (snps and not info.has_snp(individual=individual)) and
                        (dels and not info.has_deletion(individual=individual)) and
                        (ins and not info.has_insertion(individual=individual))):
                    continue
                try:
                    entries['%s%s' % (to_append, info.chrom)][int(info.pos) - 1] = info
                except KeyError:
                    entries['%s%s' % (to_append, info.chrom)] = {int(info.pos) - 1: info}
    with args.out as o:
        for header, sequence in fasta_file:
            d = entries.get(header, None)
            if d:
                bases = d.keys()
                bases.sort(reverse=True)
                sequence = list(sequence)
                # we go from the back of the sequence so we don't have to
                # bother with offsets if we are inserting/deleting bases
                for i in bases:
                    var_info = d[i]
                    ref = var_info.ref
                    alt = var_info.get_alt(individual=individual)[0]
                    sequence[i:i + len(ref)] = list(alt)
                sequence = ''.join(sequence)
            o.write('>%s\n%s\n' % (header, sequence))
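
# A minimal sketch (toy data, not part of the tool) of why variants are
# applied back-to-front above: editing from the highest position first means
# the coordinates of the remaining variants stay valid even when an indel
# changes the sequence length.
def _apply_variants_demo():
    sequence = list('ACGTACGT')
    # 0-based position -> (ref, alt); one deletion and one insertion
    variants = {1: ('CG', 'C'), 5: ('C', 'CTT')}
    for pos in sorted(variants, reverse=True):
        ref, alt = variants[pos]
        sequence[pos:pos + len(ref)] = list(alt)
    return ''.join(sequence)  # 'ACTACTTGT'
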
def test_fasta_iterator(self):
    out = ""
    f = parser.FastaIterator(self.handle, delimiter='>')
    assert isinstance(f, parser.FastaIterator)
    for header, sequence in f:
        out += ">%s\n%s\n" % (header, sequence)
    digest = hashlib.sha224(out).hexdigest()
    self.assertEqual(
        'a4b6987095e97824cbcb36674f9757c4ccfad161eeb9fd8a993e851a', digest,
        "Fasta Iterator Failure")
def main():
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    from_met_keep = args.from_met_keep
    if from_met_keep:
        from_met = True
        no_met = True

    def write_sequence(handle, header, protein_index, protein_sequence):
        # strand and i are taken from the enclosing loop at call time
        header1 = '>%s F:%s%d Orf:%d' % (header, strand, i + 1, protein_index + 1)
        protein_sequences = [(header1, protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                return
            header2 = '>%s(%d upstream removed) F:%s%d Orf:%d' % (
                header, pos, strand, i + 1, protein_index + 1)
            protein_sequences.append((header2, protein_sequence[pos:]))
        for protein_header, protein_sequence in protein_sequences:
            if len(protein_sequence) >= orf_min and (
                    no_met or protein_sequence[0] == 'M'):
                handle.write('%s\n%s\n' % (protein_header, protein_sequence))

    with args.out as o:
        for header, sequence in fasta_file:
            for i in xrange(3):
                strand = '+'
                translation = fasta._translate(sequence[i:])
                translation = translation.split('*')
                for protein_index, protein_sequence in enumerate(translation):
                    write_sequence(o, header, protein_index, protein_sequence)
                if negative_strand:
                    strand = '-'
                    translation = fasta._translate(
                        fasta._reverse_complement(sequence)[i:])
                    # split on stop codons here as well; the original iterated
                    # over the raw translation string one residue at a time
                    translation = translation.split('*')
                    for protein_index, protein_sequence in enumerate(translation):
                        write_sequence(o, header, protein_index, protein_sequence)
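
# A minimal sketch (hypothetical helpers with a toy codon table, not the
# pythomics fasta API) of how three-frame translation and ORF splitting
# interact above: each frame offsets the start by one nucleotide, and '*'
# (a stop codon) delimits the ORFs within a frame.
_DEMO_CODONS = {'ATG': 'M', 'AAA': 'K', 'TAA': '*', 'GCT': 'A'}

def _translate_demo(seq):
    return ''.join(_DEMO_CODONS.get(seq[n:n + 3], 'X')
                   for n in range(0, len(seq) - 2, 3))

def _three_frame_orfs(seq):
    for frame in range(3):
        for orf in _translate_demo(seq[frame:]).split('*'):
            yield frame, orf

# frame 0 of 'ATGAAATAAGCT' translates to 'MK*A', yielding ORFs 'MK' and 'A'
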
def test_protein_digestion(self):
    out = ""
    f = parser.FastaIterator(self.handle, delimiter='>')
    assert isinstance(f, parser.FastaIterator)
    enzyme = digest.Enzyme(enzyme='trypsin')
    assert isinstance(enzyme, digest.Enzyme)
    for __, sequence in f:
        out += sequence
    peptides = ''.join(enzyme.cleave(out, min=7, max=30))
    hash_sum = hashlib.sha224(peptides).hexdigest()
    self.assertEqual(
        '31c6612b85dcea10c26e35826f4e5577b674624725477eb5202b18bb', hash_sum,
        "Protein Digestion With Trypsin Failure")
    enzyme = digest.Enzyme(enzyme='lysc')
    peptides = ''.join(enzyme.cleave(out, min=0, max=9999, unique=True))
    hash_sum = hashlib.sha224(peptides).hexdigest()
    self.assertEqual(
        '2b5e17ce606e9a296095d8b4b9cf75d44ba662d5eb3531e0a187def4', hash_sum,
        "Unique Protein Digestion with Lys-C Failure")
def main():
    global protein_sequences
    global fasta_headers
    global il_convert
    peptides_mapped = Value('i', 0)
    args = parser.parse_args()
    cores = args.p
    fasta_file = fasta.FastaIterator(args.fasta)
    peptide_column = args.peptide_col
    try:
        peptide_index = int(peptide_column) - 1
        peptide_column = peptide_index
    except ValueError:
        peptide_index = None
    tsv_file = args.tsv
    il_convert = not args.no_equality
    out_file = args.out
    header_lines = args.header
    delimiter = args.delimiter
    inference = not args.no_inference
    inferred_name = args.inferred_name
    digest_min = args.min
    digest_max = args.max
    normalize = args.normalize
    ibaq = args.ibaq
    ibaq_redundant = not args.non_redundant
    case_sens = args.case_sensitive
    mod_site = args.modification_site
    unique = args.unique_only
    out_position = args.position
    if mod_site:
        case_sens = True
        inference = True
    precursor_columns = [i for i in args.precursors.split(',')] if args.precursors else None
    if ibaq:
        enzyme = digest.Enzyme(enzyme=args.enzyme[0]
                               if isinstance(args.enzyme, list) else args.enzyme)
    sys.stderr.write("Reading in Fasta file.\n")
    fasta_headers, protein_sequences = zip(
        *[(header.replace(';', ''), sequence) for header, sequence in fasta_file])
    # replace headers with parsed ones
    if args.regex:
        regex = re.compile(args.regex)
        total_headers = len(fasta_headers)
        fasta_headers = [regex.search(header) for header in fasta_headers]
        protein_sequences = [
            protein_sequences[i] for i, v in enumerate(fasta_headers) if v
        ]
        fasta_headers = [' '.join(i.groups()) for i in fasta_headers if i]
        # report discards against the pre-filter total; comparing the two
        # post-filter lists (as before) always yields zero
        sys.stderr.write(
            '{0} header sequences did not match regex {1} and have been discarded.\n'
            .format(total_headers - len(fasta_headers), args.regex))
    if ibaq:
        ibaq_protein_sequence = {
            header: sequence
            for header, sequence in zip(fasta_headers, protein_sequences)
        }
        cleaved = {}
    protein_sequences = '\n'.join([
        '{}\t{}'.format(header, sequence)
        for header, sequence in zip(fasta_headers, protein_sequences)
    ])
    peptide_history = {}
    mod_grouping = {}  # ordered by protein, site, type
    pep_count = 0
    pep_set = set([])
    mod_col = args.mod_col
    correlator = ColumnFunctions(args)
    motif_search = args.motifs
    if motif_search:
        motif_window = args.motif_window
        motif_unique = args.motif_unique
        if args.motif_out:
            motif_out = open(args.motif_out, 'wb')
        elif args.out:
            motif_out = open('{0}_motif'.format(args.out.name), 'wb')
        else:
            sys.stderr.write(
                "You must provide an output name for motif-out if you are piping to stdout.\n")
            return -1
    mod_col_func = getattr(correlator, args.mod_col_func, correlator.concat)
    ibaq_col_func = getattr(correlator, args.ibaq_function, correlator.concat)
    with tsv_file as f:
        reader = csv.reader(f, delimiter=delimiter)
        for line_num, entry in enumerate(reader):
            if line_num < header_lines:
                # we assume the first header line is the one we care about
                if peptide_index is None:
                    for i, v in enumerate(entry):
                        if v.lower() == args.peptide_col.lower():
                            peptide_column = i
                            break
                if mod_col is not None and mod_col.isdigit():
                    mod_col = int(mod_col) - 1
                elif mod_col is not None:
                    for i, v in enumerate(entry):
                        if v.lower() == args.mod_col.lower():
                            mod_col = i
                if not precursor_columns:
                    precursor_columns = [
                        i for i, v in enumerate(entry) if 'precursor' in v.lower()
                    ]
                try:
                    precursor_columns = [int(i) for i in precursor_columns]
                except ValueError:
                    precursor_columns = [entry.index(i) for i in precursor_columns]
                normalizations = [0 for i in precursor_columns]
            else:
                peptide = entry[peptide_column]
                pep_count += 1
                if not case_sens:
                    peptide = peptide.upper()
                pep_set.add(peptide)
                if peptide not in peptide_history:
                    peptide_history[peptide] = {
                        'intensities': dict([(i, set([])) for i in xrange(len(precursor_columns))]) if precursor_columns is not None else {},
                    }
                if precursor_columns:
                    for n_i, e_i in enumerate(precursor_columns):
                        if entry[e_i]:
                            try:
                                intensity = decimal.Decimal(entry[e_i])
                            except decimal.InvalidOperation:
                                intensity = decimal.Decimal(0)
                            peptide_history[peptide]['intensities'][n_i].add(intensity)
                if mod_col is not None:
                    peptide_history[peptide]['mod_col'] = entry[mod_col]
    if ibaq and normalize and precursor_columns:
        for peptide in peptide_history:
            for i, v in peptide_history[peptide]['intensities'].iteritems():
                normalizations[i] += sum(v)
    else:
        normalizations = [decimal.Decimal(1) for i in normalizations]
    # map our peptides in a multi-cored manner
    pool = Pool(cores)
    # break into groups of peptides_per_core (empirically the fastest mapping
    # size); the stepped xrange already covers the tail of the list
    peptides = list(set([i.upper() for i in peptide_history.keys()]))
    subpeptides = [
        peptides[n:n + peptides_per_core]
        for n in xrange(0, len(peptides), peptides_per_core)
    ]
    num_peps = len(peptides)
    progress_finish()
    sys.stderr.write('Mapping Peptides.\n')
    results = pool.map_async(mapper, subpeptides)
    results.wait()
    mapped_peptides = dict((k, v) for d in results.get() for (k, v) in d.items())
    sys.stderr.write('\nPeptides mapped.\n')
    protein_grouping = {}
    peptide_grouping = {}
    stats = {'peptides': pep_count}
    stats['peptides_found'] = len(pep_set)
    proteins_mapped = set([])
    peptide_out = []
    empty_dict = {
        'proteins': '',
        'positions': [],
        'accessions': [],
        'matches': [],
        'unique': True
    }
    for index, (peptide, d) in enumerate(peptide_history.iteritems()):
        try:
            peptide_dict = peptide_grouping[peptide]
        except KeyError:
            peptide_dict = {'intensities': {}}
            peptide_grouping[peptide] = peptide_dict
        if not index % 100:
            progress_update(index, len(peptide_history))
        mapped_info = mapped_peptides.get(peptide.upper(), empty_dict)
        precursor_int = float(
            sum([sum(d['intensities'][i]) for i in d['intensities']]))
        entry = [
            peptide,
            sum([len(d['intensities'][i]) for i in d['intensities']]),
            precursor_int
        ]
        if 'inference' not in peptide_dict:
            peptide_dict['inference'] = {'proteins': ''}
        if inference:
            proteins = mapped_info['proteins']
            accessions = mapped_info['accessions']
            start_positions = mapped_info['positions'] if mod_site else []
            proteins_mapped |= set(proteins)
            if unique:
                proteins = make_unique(proteins)
            if len(proteins) > 1:
                mapped_info['unique'] = False
            matches = ';'.join(proteins)
            peptide_dict['inference']['proteins'] = matches
            if not unique or mapped_info['unique']:
                entry.append(matches)
            else:
                entry.append('')
            for protein_index, protein in enumerate(proteins):
                try:
                    protein_grouping[protein][peptide] = d
                except KeyError:
                    protein_grouping[protein] = {peptide: d}
            if mod_site:
                mod_site_additions = []
                motifs_found = {}
                find_motif = False
                if motif_search and (len(proteins) == 1 or not motif_unique):
                    find_motif = True
                for start_position, protein in zip(start_positions, accessions):
                    mod_site_addition = []
                    for j, k in enumerate(peptide):
                        if k.islower():
                            mod_pos = start_position + j
                            mod_key = '%s:%d' % (k, mod_pos)
                            if find_motif:
                                motif_sequences = [
                                    protein_sequences[i + j - motif_window:
                                                      i + j + motif_window + 1]
                                    for i in mapped_info['matches']
                                ]
                                for motif_sequence in motif_sequences:
                                    # reset the pivot per window; the original
                                    # decremented a shared position across windows
                                    motif_pos = motif_window
                                    # remove any record boundaries to the left of us
                                    cut = motif_sequence[:motif_pos].rfind('\t')
                                    if cut != -1:
                                        motif_sequence = motif_sequence[cut + 1:]
                                        motif_pos -= (cut + 1)
                                    cut = motif_sequence[motif_pos + 1:].rfind('\t')
                                    if cut != -1:
                                        motif_sequence = motif_sequence[:motif_pos + cut]
                                    found = motifs_found.get(mod_key, [])
                                    motifs_found[mod_key] = make_unique(
                                        found + [motif_sequence])
                            mod_site_addition.append(mod_key)
                            if mod_col or mod_site:
                                try:
                                    mod_values = mod_grouping[protein][mod_key]['values']
                                    mod_peptides = mod_grouping[protein][mod_key]['peptides']
                                    if mod_col:
                                        mod_values.append(d['mod_col'])
                                    mod_grouping[protein][mod_key]['values'] = make_unique(mod_values)
                                    mod_grouping[protein][mod_key]['peptides'] = make_unique(mod_peptides + [peptide])
                                except KeyError:
                                    try:
                                        mod_grouping[protein][mod_key] = {
                                            'values': make_unique([d['mod_col']]) if mod_col else '',
                                            'peptides': make_unique([peptide])
                                        }
                                    except KeyError:
                                        mod_grouping[protein] = {
                                            mod_key: {
                                                'values': make_unique([d['mod_col']]) if mod_col else '',
                                                'peptides': make_unique([peptide])
                                            }
                                        }
                    mod_site_additions.append(
                        '%s(%s)' % (protein, ','.join(mod_site_addition)))
                peptide_dict['inference']['mod_sites'] = ';'.join(mod_site_additions)
                peptide_dict['inference']['motifs'] = motifs_found
                peptide_dict['inference']['matched_positions'] = ','.join(
                    str(i) for i in start_positions)
        if ibaq:
            ibaqs = []
            intensities = [sum(d['intensities'][i]) for i in d['intensities']]
            try:
                precursor_int = sum([
                    intensities[i] / normalizations[i]
                    for i in xrange(len(normalizations))
                ])
            except decimal.InvalidOperation:
                precursor_int = 0
            entry.append(precursor_int)
            for protein_index in mapped_info['accessions']:
                peptides = cleaved.get(protein_index, None)
                if peptides is None:
                    if ibaq_redundant:
                        peptides = sum([
                            len(enzyme.cleave(ibaq_protein_sequence[protein_accession],
                                              min=digest_min, max=digest_max))
                            for protein_accession in mapped_info['accessions']
                        ])
                    else:
                        peptides = len(set([
                            peptide for tryptic_peptides in [
                                enzyme.cleave(
                                    ibaq_protein_sequence[possible_protein_index],
                                    min=digest_min, max=digest_max)
                                for possible_protein_index in mapped_info['indices']
                            ] for peptide in tryptic_peptides
                        ]))
                    cleaved[protein_index] = peptides
                if not peptides:
                    ibaqs.append(0)
                    continue
                # this divides the precursor intensity of the given peptide by the
                # number of theoretically possible cleaved peptides per protein.
                # If the user is grouping things at a higher level, say the gene
                # level, this will output the iBAQ per mapped isoform if that
                # gene has isoforms.
                ibaqs.append(precursor_int / peptides
                             if peptides and precursor_int else 0)
            peptide_dict['inference']['iBAQ'] = ibaq_col_func(
                [int(IBAQ_NORMALIZATION * i) for i in ibaqs]) if ibaqs else 0
            entry.append(peptide_dict['inference']['iBAQ']
                         if not unique or mapped_info['unique'] else '')
        if out_position:
            entry.append(peptide_dict['inference'].get('matched_positions', '')
                         if not unique or mapped_info['unique'] else '')
        if mod_site:
            entry.append(peptide_dict['inference'].get('mod_sites', '')
                         if not unique or mapped_info['unique'] else '')
        if motif_search:
            entry.append(';'.join([
                '{}({})'.format(motif_site, ';'.join(motifs))
                for motif_site, motifs in
                peptide_dict['inference'].get('motifs', {}).iteritems()
            ]))
        peptide_out.append(entry)
    progress_finish()
    with args.peptide_out as o:
        writer = csv.writer(o, delimiter=delimiter)
        header = ['Peptide', 'PSMS', 'Total Precursor Area']
        if inference:
            header.append(inferred_name)
        if ibaq:
            if normalize:
                header.append('Normalized Precursor Intensity')
            header.append('iBAQ')
        if out_position:
            header.append('Peptide %s Position' % inferred_name)
        if mod_site:
            header.append('Modification Positions')
        if motif_search:
            header.append('Motif')
        writer.writerow(header)
        for i in peptide_out:
            writer.writerow(i)
    if motif_search:
        with motif_out as o:
            writer = csv.writer(o, delimiter=delimiter)
            header = ['Residue', 'Motif']
            if inference:
                header.append(inferred_name)
            writer.writerow(header)
            for peptide, peptide_dict in peptide_grouping.iteritems():
                for motif_key, motifs in peptide_dict['inference'].get(
                        'motifs', {}).iteritems():
                    writer.writerow([
                        motif_key, ';'.join(motifs),
                        peptide_dict['inference']['proteins']
                    ])
    stats['proteins_mapped'] = len(proteins_mapped)
    if inference:
        with args.protein_out as o:
            writer = csv.writer(o, delimiter=delimiter)
            header = [inferred_name, 'Peptides', 'Total Precursor Area']
            if mod_site:
                header.append('Modification Positions')
            if ibaq:
                if normalize:
                    header.append('Normalized Precursor Intensity')
                header.append('iBAQ')
            writer.writerow(header)
            for protein in protein_grouping:
                entry = [protein]
                intensities = []
                precursor_int = 0
                peptide_psm_count = []
                mods = set([])
                for peptide in protein_grouping[protein]:
                    if mod_site:
                        peptide_dict = peptide_grouping.get(peptide, False)
                        if peptide_dict:
                            mod_proteins = peptide_dict['inference']['mod_sites']
                            # mod_proteins looks like:
                            # WBGene00004829(y:467,k:471);WBGene00019361(m:68);...
                            for mod_protein in mod_proteins.split(';'):
                                mod_prots = mod_protein.split(';')
                                for mod_prot_ in mod_prots:
                                    if not mod_prot_:
                                        continue
                                    mod_prot, mod_prot_sites = mod_prot_.rsplit('(', 1)
                                    if mod_prot == protein:
                                        # the trailing ')' is already stripped by
                                        # mod_prot_sites[:-1]; the site token itself
                                        # needs no further trimming
                                        for mod_prot_site in mod_prot_sites[:-1].split(','):
                                            if mod_prot_site:
                                                mod_aa, mod_prot_site = mod_prot_site.split(':')
                                                mods.add((mod_aa, mod_prot_site))
                    d = protein_grouping[protein][peptide]
                    if not unique or mapped_peptides.get(peptide, {}).get('unique'):
                        peptide_psm_count.append((peptide, sum([
                            len(d['intensities'][i]) for i in d['intensities']
                        ])))
                        intensities += [
                            sum(d['intensities'][i]) for i in d['intensities']
                        ]
                if ibaq and normalize:
                    try:
                        precursor_int += sum([
                            intensities[i] / normalizations[i]
                            for i in xrange(len(normalizations))
                        ])
                    except decimal.InvalidOperation:
                        pass
                entry.append(';'.join(
                    ['%s(%s)' % (i, j) for i, j in peptide_psm_count]))
                entry.append(sum(intensities))
                if mod_site:
                    mods = list(mods)
                    mods.sort(key=lambda x: x[1])
                    entry.append(';'.join(['%s%s' % (i, j) for i, j in mods]))
                if ibaq:
                    if normalize:
                        entry.append(precursor_int)
                    # the original read a leaked loop variable (protein_index)
                    # here; the cleaved counts are keyed by the mapped protein
                    peptides = cleaved.get(protein, None)
                    ibaq_value = [
                        int(IBAQ_NORMALIZATION * precursor_int / peptides)
                        if peptides and precursor_int else 0
                    ]
                    entry.append(ibaq_col_func(ibaq_value))
                writer.writerow(entry)
    tsv_file = open(tsv_file.name)
    with tsv_file as f:
        reader = csv.reader(f, delimiter=delimiter)
        mod_stats = {}
        with out_file as o:
            out_writer = csv.writer(o, delimiter=delimiter)
            total_mods = Counter()
            for line_num, entry in enumerate(reader):
                if line_num < header_lines:
                    # we assume the first header line is the one we care about
                    if inference:
                        entry.append(inferred_name)
                    if out_position:
                        entry.append('Peptide %s Position' % inferred_name)
                    if mod_site:
                        entry.append('Modification Position')
                    if ibaq:
                        entry.append('iBAQ')
                else:
                    peptide = entry[peptide_column]
                    if not case_sens:
                        peptide = peptide.upper()
                    d = peptide_grouping.get(peptide, False)
                    total_mods.update([k for k in peptide if k.islower()])
                    if d:
                        if inference:
                            entry.append(
                                d['inference']['proteins'] if not unique or
                                mapped_peptides.get(peptide, {}).get('unique') else '')
                        if out_position:
                            entry.append(
                                d['inference']['matched_positions'] if not unique or
                                mapped_peptides.get(peptide, {}).get('unique') else '')
                        if mod_site:
                            mod_proteins = d['inference']['mod_sites']
                            peptide_mods = {}
                            mod_entry = []
                            if not unique or mapped_peptides.get(peptide, {}).get('unique'):
                                for mod_protein in mod_proteins.split(';'):
                                    # mod_protein looks like: protein(aa:site,...)
                                    mod_prots = mod_protein.split(';')
                                    for mod_prot_ in mod_prots:
                                        if not mod_prot_:
                                            continue
                                        mod_prot, mod_prot_sites = mod_prot_.rsplit('(', 1)
                                        for mod_prot_site in mod_prot_sites[:-1].split(','):
                                            if mod_prot_site:
                                                mod_aa, mod_prot_site = mod_prot_site.split(':')
                                                try:
                                                    peptide_mods[mod_prot].add(
                                                        (mod_aa, mod_prot_site))
                                                except KeyError:
                                                    peptide_mods[mod_prot] = set(
                                                        [(mod_aa, mod_prot_site)])
                                                try:
                                                    mod_stats[mod_aa].add(
                                                        (mod_prot, mod_prot_site))
                                                except KeyError:
                                                    mod_stats[mod_aa] = set(
                                                        [(mod_prot, mod_prot_site)])
                            for mod_prot, mods in peptide_mods.iteritems():
                                modl = list(mods)
                                modl.sort(key=lambda x: x[1])
                                mod_entry.append('%s(%s)' % (mod_prot, ' '.join(
                                    ['%s:%s' % (i, j) for i, j in modl])))
                            entry.append(';'.join(mod_entry))
                        if ibaq:
                            entry.append(
                                d['inference'].get('iBAQ', 0) if not unique or
                                mapped_peptides.get(peptide, {}).get('unique') else 0)
                out_writer.writerow(entry)
    stats['modifications'] = mod_stats
    mod_out = args.mod_out if args.mod_out else open(
        '{}_mods'.format(out_file.name), 'wb')
    with mod_out as o:
        writer = csv.writer(o, delimiter=delimiter)
        header = ['Site', inferred_name, 'Peptide']
        if mod_col:
            header.append(args.mod_col)
        writer.writerow(header)
        # mod_grouping[protein] = {'aa:site': {'values': set(...), 'peptides': set(...)}}
        for protein, sites_dict in mod_grouping.iteritems():
            for site, site_dict in sites_dict.iteritems():
                entry = [site, protein, ';'.join(site_dict.get('peptides'))]
                if mod_col:
                    entry.append(mod_col_func(site_dict.get('values', [])))
                writer.writerow(entry)
    # write stats
    sys.stderr.write('Peptides Searched: %s\n' % stats['peptides'])
    sys.stderr.write('Unique Peptides Found: %s\n' % stats['peptides_found'])
    sys.stderr.write('%s Mapped to: %s\n' % (inferred_name, stats['proteins_mapped']))
    if stats['modifications']:
        sys.stderr.write('Modifications:\n')
        for site, sites in stats['modifications'].iteritems():
            sys.stderr.write(
                '  %s: %s found with %d potential sites (%d mappings)\n' %
                (site, total_mods[site], len(sites),
                 len(set([i[0] for i in sites]))))
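
# A minimal sketch (made-up numbers, not this script's API) of the iBAQ idea
# used above: a protein's summed precursor intensity is divided by the count
# of theoretically observable peptides from an in-silico digest, so large
# proteins are not over-weighted simply for yielding more peptides.
def _ibaq_demo(precursor_intensity, theoretical_peptides):
    if not theoretical_peptides or not precursor_intensity:
        return 0
    return precursor_intensity / theoretical_peptides

# e.g. 1.2e9 total intensity over 24 theoretical tryptic peptides -> 5e7
assert _ibaq_demo(1.2e9, 24) == 5e7
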
def main():
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    enzyme_pattern = args.enzyme_pattern
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # if we're splitting a genome
    if genome:
        import re
        regex = re.compile(r'([\*])')
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    if enzyme_pattern:
        enzymes = [digest.Enzyme(pattern=enzyme_pattern)]
    elif enzyme_choice:
        enzymes = [digest.Enzyme(enzyme=protease) for protease in enzyme_choice]
    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                if genome:
                    slen = len(sequence)
                for i in xrange(digest_frame):
                    strand = '+'
                    translation = fasta._translate(sequence[i:])
                    if genome:
                        position = i + 1
                        # keep stop codons by splitting with a capturing regex
                        # and re-joining each ORF with its trailing '*'
                        translation = [j for j in regex.split(translation)]
                        translation = [
                            ''.join(j) for j in itertools.izip_longest(
                                translation[0::2], translation[1::2], fillvalue='')
                        ]
                    else:
                        translation = translation.split('*')
                    for protein_index, protein_sequence in enumerate(translation):
                        if genome:
                            enzyme_kwargs = {
                                'min': 0,
                                'max': 999999,
                                'unique': unique_digest
                            }
                        else:
                            enzyme_kwargs = {
                                'min': digest_min,
                                'max': digest_max,
                                'unique': unique_digest
                            }
                        peptides = enzymes[0].cleave(protein_sequence, **enzyme_kwargs)
                        for enzyme in enzymes[1:]:
                            peptides = [
                                sub_seq for peptide_sequence in peptides
                                for sub_seq in enzyme.cleave(peptide_sequence,
                                                             **enzyme_kwargs)
                            ]
                        for peptide_index, peptide in enumerate(peptides):
                            if genome:
                                if len(peptide) >= digest_min:
                                    if peptide.endswith('*'):
                                        o.write('>%s F:%s%d Start:%d End:%d \n%s\n' %
                                                (header, strand, i + 1, position,
                                                 position + len(peptide) * 3 - 1,
                                                 peptide[:-1]))
                                    else:
                                        o.write('>%s F:%s%d Start:%d End:%d \n%s\n' %
                                                (header, strand, i + 1, position,
                                                 position + len(peptide) * 3 - 1,
                                                 peptide))
                                # advance the coordinate for every peptide, even
                                # those filtered out by the length check
                                position += len(peptide) * 3
                            else:
                                o.write('>%s F:%s%d Orf:%d Pep:%d \n%s\n' %
                                        (header, strand, i + 1, protein_index + 1,
                                         peptide_index + 1, peptide))
                    if digest_negative:
                        strand = '-'
                        translation = fasta._translate(
                            fasta._reverse_complement(sequence)[i:])
                        if genome:
                            position = slen - i
                            translation = [j for j in regex.split(translation)]
                            translation = [
                                ''.join(j) for j in itertools.izip_longest(
                                    translation[0::2], translation[1::2], fillvalue='')
                            ]
                        else:
                            translation = translation.split('*')
                        for protein_index, protein_sequence in enumerate(translation):
                            if genome:
                                enzyme_kwargs = {
                                    'min': 0,
                                    'max': 999999,
                                    'unique': unique_digest
                                }
                            else:
                                enzyme_kwargs = {
                                    'min': digest_min,
                                    'max': digest_max,
                                    'unique': unique_digest
                                }
                            peptides = enzymes[0].cleave(protein_sequence,
                                                         **enzyme_kwargs)
                            for enzyme in enzymes[1:]:
                                peptides = [
                                    sub_seq for peptide_sequence in peptides
                                    for sub_seq in enzyme.cleave(peptide_sequence,
                                                                 **enzyme_kwargs)
                                ]
                            for peptide_index, peptide in enumerate(peptides):
                                if genome:
                                    if len(peptide) >= digest_min:
                                        if peptide.endswith('*'):
                                            o.write('>%s F:%s%d Start:%d End:%d \n%s\n' %
                                                    (header, strand, i + 1,
                                                     position - len(peptide) * 3 + 1,
                                                     position, peptide[:-1]))
                                        else:
                                            o.write('>%s F:%s%d Start:%d End:%d \n%s\n' %
                                                    (header, strand, i + 1,
                                                     position - len(peptide) * 3 + 1,
                                                     position, peptide))
                                    position -= (len(peptide) * 3)
                                else:
                                    o.write('>%s F:%s%d Orf:%d Pep:%d \n%s\n' %
                                            (header, strand, i + 1,
                                             protein_index + 1,
                                             peptide_index + 1, peptide))
        else:
            for header, sequence in fasta_file:
                enzyme_kwargs = {
                    'min': digest_min,
                    'max': digest_max,
                    'unique': unique_digest
                }
                peptides = enzymes[0].cleave(sequence, **enzyme_kwargs)
                for enzyme in enzymes[1:]:
                    peptides = [
                        sub_seq for peptide_sequence in peptides
                        for sub_seq in enzyme.cleave(peptide_sequence,
                                                     **enzyme_kwargs)
                    ]
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n' %
                            (header, peptide_index + 1, peptide))
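
# A minimal regex-based sketch (a stand-in for digest.Enzyme, not its real
# implementation) of the serial multi-protease digestion used above: the
# first enzyme cleaves the whole sequence, and every later enzyme re-cleaves
# each fragment in turn.
import re

def _cleave_demo(sequence, pattern):
    # cut after each residue matched by pattern, e.g. r'[KR]' for trypsin-like
    return [p for p in re.sub('(%s)' % pattern, r'\1\n', sequence).split('\n') if p]

def _serial_digest_demo(sequence, patterns):
    peptides = [sequence]
    for pattern in patterns:
        peptides = [frag for pep in peptides for frag in _cleave_demo(pep, pattern)]
    return peptides

# trypsin-like then Lys-C-like (redundant here, shown for the mechanics):
# _serial_digest_demo('MKTAYIAKQR', [r'[KR]', r'K']) -> ['MK', 'TAYIAK', 'QR']
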
def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme
    peptides_found = {}
    retained = {}
    total = 0
    proteinMap = {}
    coverageMap = {}
    aas = config.RESIDUE_MASSES.keys()
    aas.sort()
    tlen = 0
    parallel = args.parallel
    for protease_index, protease in enumerate(enzymes):
        if parallel or protease_index == 0:
            fasta_file = fasta.FastaIterator(args.fasta)
        enzyme = digest.Enzyme(enzyme=protease)
        sys.stderr.write('processing %s\n' % protease)
        # if digesting in series, this iterator is exhausted after the first
        # protease, so the loop below only runs for protease_index == 0
        for header, sequence in fasta_file:
            if protease_index == 0:
                total += 1
                proteinMap[header] = sequence
                tlen += len(sequence)
            for peptide in set(enzyme.cleave(sequence, min=digest_min, max=999999)):
                if len(peptide) > digest_max:
                    # too long to be observed; keep it for re-digestion in series
                    if not parallel:
                        try:
                            retained[header].add(peptide)
                        except KeyError:
                            retained[header] = set([peptide])
                else:
                    # we see this one
                    try:
                        peptides_found[peptide].add(header)
                    except KeyError:
                        peptides_found[peptide] = set([header])
                    try:
                        coverageMap[header].add(peptide)
                    except KeyError:
                        coverageMap[header] = set([peptide])
        if not parallel and protease_index > 0:
            for header in retained:
                sequences = copy.deepcopy(retained[header])
                for sequence in sequences:
                    for peptide in set(enzyme.cleave(sequence, min=digest_min,
                                                     max=999999)):
                        if len(peptide) > digest_max:
                            if not parallel:
                                retained[header].add(peptide)
                        else:
                            try:
                                peptides_found[peptide].add(header)
                            except KeyError:
                                peptides_found[peptide] = set([header])
                            try:
                                coverageMap[header].add(peptide)
                            except KeyError:
                                coverageMap[header] = set([peptide])
        sys.stderr.write('%d total peptides after digesting with %s\n' %
                         (len(peptides_found), protease))
        if parallel:
            args.fasta.seek(0)
    unique_proteins = set([])
    for peptide in peptides_found:
        if len(peptides_found[peptide]) == 1:
            unique_proteins |= peptides_found[peptide]
    with args.out as o:
        o.write('Protein\tDetectable Length\tTotal Length\tCoverage%%\tUnique ID\t%s\n'
                % '\t'.join(aas))
        sys.stderr.write('%d proteins found out of %d total proteins in database.\n' %
                         (len(coverageMap), total))
        sys.stderr.write('%d of these detectable proteins may be uniquely identified.\n' %
                         (len(unique_proteins)))
        # figure out coverage
        covered = {}
        found_proteins = set([])
        inum = 0
        for peptide in peptides_found:
            inum += 1
            if inum % 50000 == 0:
                sys.stderr.write('%d peptides processed\n' % inum)
            for header in peptides_found[peptide]:
                found_proteins.add(header)
                sequence = proteinMap[header]
                # 'found' holds the positions NOT yet covered by any peptide
                found = covered.get(header, set(xrange(len(sequence))))
                # escape the peptide in case it carries regex metacharacters
                # such as '*'
                sites = [match.start()
                         for match in re.finditer(re.escape(peptide), sequence)]
                for match_position in sites:
                    found -= set(xrange(match_position, match_position + len(peptide)))
                covered[header] = found
        avg_cov = 0
        missed_len = 0
        detected = 0
        for header in coverageMap:
            total_len = len(proteinMap[header])
            found_len = total_len - len(covered.get(header, []))
            perc_cov = float(found_len) / float(total_len)
            o.write('%s\t%d\t%d\t%d\t%s\t' %
                    (header, found_len, total_len, perc_cov * 100.0,
                     str(header in unique_proteins)))
            # which amino acids do we miss?
            aas_missed = ''.join(proteinMap[header][i] for i in covered[header])
            missed = [aas_missed.count(j) for j in aas]
            missed_len += sum(missed)
            o.write('%s\n' % '\t'.join([str(i) for i in missed]))
            avg_cov += perc_cov
            if header in found_proteins:
                detected += 1
    sys.stderr.write('average coverage is %0.4f over entire proteome\n' %
                     (float(tlen - missed_len) / float(tlen)))
    sys.stderr.write('average coverage is %0.4f over detected proteins\n' %
                     (avg_cov / detected))
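
# A minimal sketch (toy data) of the coverage bookkeeping above: start with
# every residue position marked uncovered, subtract the span of each peptide
# match, and whatever remains is the undetectable portion of the protein.
def _coverage_demo(sequence, peptides):
    import re
    uncovered = set(range(len(sequence)))
    for peptide in peptides:
        for match in re.finditer(re.escape(peptide), sequence):
            uncovered -= set(range(match.start(), match.start() + len(peptide)))
    return 1.0 - float(len(uncovered)) / len(sequence)

# e.g. _coverage_demo('MKTAYIAKQR', ['TAYIAK']) -> 0.6
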