def test_read_and_write_fasta_short(self):
    with tempfile.TemporaryFile(mode='r+') as new_fasta_file:
        fasta.write(fasta.read(self.fasta_file, ignore_comments=True), new_fasta_file)
        new_fasta_file.seek(0)
        new_entries = list(fasta.read(new_fasta_file, ignore_comments=True))
        self.assertEqual(new_entries, self.fasta_entries_short)
def getAllProts(fastaDir):
    fastaData = fas.read(fastaDir)
    listOfProts = []
    for prot in fastaData:
        listOfProts.append(prot.sequence)
    return listOfProts
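# Usage sketch (not from the original source): 'proteins.fasta' is a
# hypothetical path, and `fas` is assumed to be the pyteomics fasta module.
from pyteomics import fasta as fas

sequences = getAllProts('proteins.fasta')
print('Loaded {} protein sequences'.format(len(sequences)))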
def process_fasta(df, path_to_fasta, decoy_prefix, decoy_infix=False):
    protsS = dict()
    decoy_check_flag = False
    for x in fasta.read(path_to_fasta):
        dbname = x[0].split(' ')[0]
        if not decoy_check_flag:
            if (not decoy_infix and dbname.startswith(decoy_prefix)) or (
                    decoy_infix and decoy_infix in dbname):
                decoy_check_flag = True
        protsS[dbname] = x[1]
    df['sequence'] = df['dbname'].apply(
        lambda x: protsS.get(x, protsS.get(x.split(' ')[0], '')))
    if not decoy_check_flag:
        if not decoy_infix:
            df['sequence'] = df.apply(
                lambda x: x['sequence'] if x['sequence'] else protsS.get(
                    x['dbname'].replace(decoy_prefix, ''),
                    protsS.get(x['dbname'].split(' ')[0].replace(decoy_prefix, ''), '')),
                axis=1)
        else:
            df['sequence'] = df.apply(
                lambda x: x['sequence'] if x['sequence'] else protsS.get(
                    x['dbname'].replace(decoy_infix, ''),
                    protsS.get(x['dbname'].split(' ')[0].replace(decoy_infix, ''), '')),
                axis=1)
    return df
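# Usage sketch (illustrative, not from the original): a toy DataFrame with a
# 'dbname' column plus a hypothetical 'db.fasta'; 'DECOY_' is an assumed prefix.
import pandas as pd

psms = pd.DataFrame({'dbname': ['sp|P12345|ALBU_HUMAN', 'DECOY_sp|P12345|ALBU_HUMAN']})
psms = process_fasta(psms, 'db.fasta', 'DECOY_')
print(psms[['dbname', 'sequence']])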
def peptide_db_graph(peps, db, id_regex=None):
    ''' search a set of peptides against a FASTA database '''
    g = nx.Graph()
    prot_dict = dict()
    for header, seq in fasta.read(db):
        seq = seq.replace('I', 'L').upper()  # convert DB sequence I -> L
        prot_id = header.split()[0]
        if id_regex is not None:
            find_id = re.findall(id_regex, header)
            if len(find_id) > 0:
                prot_id = find_id[0]
        prot_dict[prot_id] = seq

    def _map_seq(p):
        pairs = []
        for prot_id, seq in prot_dict.items():
            if p in seq:
                pairs.append([p, prot_id])
        return pairs

    for p in peps:
        ppps = _map_seq(p)
        if len(ppps):
            g.add_edges_from(ppps)
    return g
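# Usage sketch (illustrative, not from the original): query peptides are assumed
# to be I/L-normalized to match the I -> L conversion applied to the database;
# 'human.fasta' and the UniProt-style id_regex are hypothetical.
import re
import networkx as nx

g = peptide_db_graph(['PEPTLDEK', 'LLGNEK'], 'human.fasta', id_regex=r'\|(\w+)\|')
print(g.number_of_nodes(), 'nodes,', g.number_of_edges(), 'edges')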
def ingest_fasta(input_filename):
    """Ingest a FASTA file given its name and return a DataFrame of the file."""
    with fasta.read(input_filename) as reader:
        prot_list = [[item.description.split("|")[0] + ":" + item.description.split("|")[1],
                      item.description.split("|")[3],
                      item.description.split("|")[4],
                      item.sequence]
                     for item in reader]
    df = pd.DataFrame(prot_list, columns=["GeneInfo ID", "Accession", "Description", "Sequence"])
    return df
def make_reverse_fasta(input_file, output_file):
    '''Take a FASTA file as input, drop all _REVERSED proteins and create
    new _REVERSED decoy proteins.'''
    prots = []
    for prot_desc, prot_seq in fasta.read(input_file):
        if not prot_desc.endswith('_REVERSED'):
            prots.append((prot_desc, prot_seq))
            prots.append((prot_desc + '_REVERSED', smart_reverse(prot_seq)))
    fasta.write(prots, output_file, file_mode='w')
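# `smart_reverse` is not defined in this snippet. A minimal sketch of one
# plausible implementation, assuming the intent is to reverse the sequence
# while keeping K/R residues anchored so the decoy preserves the tryptic
# peptide length distribution; the original helper may differ.
def smart_reverse(seq):
    fixed = {i for i, aa in enumerate(seq) if aa in 'KR'}  # anchored cleavage sites
    movable = [aa for i, aa in enumerate(seq) if i not in fixed][::-1]
    it = iter(movable)
    return ''.join(aa if i in fixed else next(it) for i, aa in enumerate(seq))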
def _load_fasta(db, id_regex):
    global prot_dict
    prot_dict = dict()
    for header, seq in fasta.read(db):
        seq = seq.replace('I', 'L').upper()  # convert DB sequence I -> L
        prot_id = header.split()[0]
        if id_regex is not None:
            find_id = re.findall(id_regex, header)
            if len(find_id) > 0:
                prot_id = find_id[0]
        prot_dict[prot_id] = seq
def digestProteinFromFASTA():
    sequenceIter = fasta.read(source=options.fasta)
    uniquePeptides = set()
    for s in sequenceIter:
        newPeptides = parser.cleave(s.sequence, 'trypsin',
                                    missed_cleavages=options.missed,
                                    min_length=options.minLength)
        uniquePeptides.update(newPeptides)
    uniquePeptides = list(uniquePeptides)
    return [Peptide(x) for x in uniquePeptides]
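# Usage sketch (illustrative): the function reads its parameters from a
# module-level `options` object and wraps peptides in a `Peptide` class;
# neither is shown in the snippet, so hypothetical stand-ins are used here.
from collections import namedtuple

Peptide = namedtuple('Peptide', ['sequence'])
Options = namedtuple('Options', ['fasta', 'missed', 'minLength'])
options = Options(fasta='proteome.fasta', missed=1, minLength=7)

peptides = digestProteinFromFASTA()
print('{} unique peptides'.format(len(peptides)))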
def split_fasta_decoys(db, decoy_prefix, decoy_infix=None):
    decoy_dbnames = set()
    with fasta.read(db) as f:
        for protein in f:
            dbname = protein.description.split()[0]
            if (decoy_infix and decoy_infix in dbname) or dbname.startswith(decoy_prefix):
                decoy_dbnames.add(dbname)
    decoy_dbnames = sorted(decoy_dbnames)
    random.seed(SEED)
    all_decoys_2 = set(random.sample(decoy_dbnames, len(decoy_dbnames) // 2))
    logger.debug('Marking %s out of %s decoys as decoy2',
                 len(all_decoys_2), len(decoy_dbnames))
    return all_decoys_2
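# Usage sketch (assumed setup): SEED and `logger` are module-level globals in
# the original; 'combined.fasta' and the 'DECOY_' prefix are hypothetical.
import logging
import random

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
SEED = 42  # assumption: the original module defines its own seed

decoys2 = split_fasta_decoys('combined.fasta', 'DECOY_')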
def test_write_decoy_db(self):
    with tempfile.TemporaryFile(mode='r+') as decdb:
        fasta.write_decoy_db(self.fasta_file, decdb,
                             decoy_only=False, prefix='PREFIX_')
        decdb.seek(0)
        all_entries = list(fasta.read(decdb, False))
    self.assertEqual(
        all_entries,
        self.fasta_entries_long +
        [('PREFIX_' + a, b[::-1]) for a, b in self.fasta_entries_long])
def generate_db(fasta_file, bins):
    peptides = set()
    with fasta.read(fasta_file) as db:
        for _, protein in db:
            peptides |= generate_peptides(protein)
    mzs = list()
    for peptide in peptides:
        mzs.append(compute_mass_spectrum(peptide))
    intensity_matrix = lil_matrix((len(mzs), bins), dtype=numpy.int8)
    for i, intensity in enumerate(mzs):
        intensity_matrix[i, :] = bin_spectrum(mzs[i], bins=bins)
    peptides_vector = numpy.array(list(peptides))
    mzs_vector = numpy.array(mzs)
    return peptides_vector, mzs_vector, intensity_matrix
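# `generate_peptides`, `compute_mass_spectrum` and `bin_spectrum` are not shown
# in this snippet. A minimal sketch of `generate_peptides` alone, assuming a
# tryptic digest via pyteomics with one missed cleavage (illustrative only;
# the original may differ):
from pyteomics import parser

def generate_peptides(protein):
    # parser.cleave returns a set, so `peptides |= ...` above works directly
    return parser.cleave(protein, parser.expasy_rules['trypsin'], missed_cleavages=1)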
def background_maker(args):
    # print('Making background DB')
    # we want to build the background from the identified proteins
    bg_fasta = dict()
    # bg = defaultdict()
    background = set()
    with fasta.read(args.fasta) as f:
        for name, sequence in f:
            name_id = name.split('|')[1]
            extended_seq = ''.join(['-' * args.interval_length, sequence,
                                    '-' * args.interval_length])
            bg_fasta[name_id] = extended_seq
            mod_aa_indexes = re.finditer(args.modification_site, extended_seq)
            bg_intervals = [extended_seq[i.span()[0] - args.interval_length:
                                         i.span()[0] + args.interval_length + 1]
                            for i in mod_aa_indexes]
            # bg[name_id] = bg_intervals
            background.update(bg_intervals)
    logging.info(u'Set of %s background intervals is created', len(background))
    logging.debug(u'Background DB is ready')
    with open('bg.csv', 'w') as f:
        f.write('\n'.join(background))
    return pd.DataFrame([list(i) for i in background],
                        columns=range(-args.interval_length, args.interval_length + 1)), bg_fasta
def read_fasta_sequences(fasta_file):
    """Read sequence records from a FASTA file."""
    sequence_records = []
    for description, sequence in fasta.read(fasta_file):
        # Initialize sequence record with sequence string.
        sequence_record = {'sequence': sequence}
        # Get sequence info.
        description_parts = description.split()
        sequence_record['id'] = description_parts[0]
        # Get the sequence's peptides.
        sequence_record['peptides'] = parser.cleave(
            sequence,
            parser.expasy_rules['trypsin'],
            missed_cleavages=1  # max no. of missed cleavages
        )
        # Save the sequence record.
        sequence_records.append(sequence_record)
    return sequence_records
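# Usage sketch with a hypothetical file name; each record carries its tryptic
# peptides alongside the raw sequence.
records = read_fasta_sequences('yeast.fasta')
print(records[0]['id'], len(records[0]['peptides']))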
def build(fasta_file: str) -> Database:
    '''Create a Database namedtuple from a fasta file

    :param fasta_file: the full path to a fasta database file
    :type fasta_file: str
    :returns: a Database object with the fasta file and protein fields filled in
    :rtype: Database
    '''
    db = Database(fasta_file)
    prots = defaultdict(list)

    # pull the name out
    get_name = lambda x: x.split('|')[-1].split()[0]

    for entry in fasta.read(fasta_file):
        p_name = get_name(entry.description)
        prots[p_name].append(entry)

    db = db._replace(proteins=prots)
    return db
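# `Database` is referenced but not defined in this snippet. A minimal sketch
# consistent with how build() uses it (a positional fasta_file field plus a
# `proteins` field filled via _replace); the real definition may carry more fields.
from collections import namedtuple

Database = namedtuple('Database', ['fasta_file', 'proteins'], defaults=[None])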
def describe(args):
    """Read database and produce a summary"""
    logger.debug('describe called with: %s', args)
    try:
        dlist = [d for d, seq in fasta.read(args.file)]
    except Exception as e:
        logger.info('Not a valid FASTA file.')
        logger.debug('Exception: %s', e)
    else:
        logger.info('Found %s FASTA entries.', len(dlist))
        n = len(dlist)
        if n:
            logger.debug('First entry: %s', dlist[0])
            if n > 2:
                dlist.sort()
                prefix_1 = os.path.commonprefix(dlist[:n // 2])
                prefix_2 = os.path.commonprefix(dlist[n // 2 + 1:])
                if prefix_1 != prefix_2:
                    logger.info('Common prefixes: %s, %s', prefix_1, prefix_2)
                else:
                    logger.info('Common prefix: %s', prefix_1)
            formats = []
            for flavor in fasta.std_parsers:
                try:
                    fasta.parse(dlist[0], flavor=flavor)
                except Exception as e:
                    logger.debug('Header: %s; parsing exception: %s', dlist[0], e)
                else:
                    formats.append(flavor)
            k = len(formats)
            if not k:
                logger.info('Unknown header format.')
            elif k == 1:
                logger.info('Suggested header format: %s', formats[0])
            else:
                logger.info('Possible header formats: %s', ', '.join(formats))
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 22 11:28:43 2013

@author: ilya
"""
from pyteomics import fasta, mgf, parser
import pylab

fasta_file = '/home/ilya/src/pyteomics/RhoEcoli.fasta'
mgf_file = '/home/ilya/src/pyteomics/MultiConsensus.mgf'

peptides = set()
with open(fasta_file) as fi:
    for description, sequence in fasta.read(fi):
        new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
        peptides.update(new_peptides)
print "UNIQUE PEPTIDES"
print peptides

with open(mgf_file) as fi:
    for spectrum in mgf.read(fi):
        pylab.figure()
        pylab.xlabel('m/z, Th')
        pylab.ylabel('Intensity, rel.units')
        pylab.bar(spectrum['m/z array'], spectrum['intensity array'],
                  width=0.1, linewidth=2, edgecolor='black')
        pylab.show()
        inp = raw_input("Show more?")
        if inp != "yes":
            break
def prepare_decoy_db(args):
    add_decoy = args['ad']
    if add_decoy:
        prefix = args['prefix']
        db = args['d']
        out1, out2 = os.path.splitext(db)
        out_db = out1 + '_shuffled' + out2
        logger.info('Creating decoy database: %s', out_db)

        extra_check = False
        if '{' in args['e']:
            extra_check = True
        if extra_check:
            banned_pairs = set()
            banned_aa = set()
            for enzyme_local in args['e'].split(','):
                if '{' in enzyme_local:
                    lpart, rpart = enzyme_local.split('|')
                    for aa_left, aa_right in itertools.product(lpart[1:-1], rpart[1:-1]):
                        banned_aa.add(aa_left)
                        banned_aa.add(aa_right)
                        banned_pairs.add(aa_left + aa_right)
            logger.debug(banned_aa)
            logger.debug(banned_pairs)

        enzyme = get_enzyme(args['e'])
        cleave_rule_custom = enzyme + '|' + '([BXZUO])'
        # cleave_rule_custom = '([RKBXZUO])'
        logger.debug(cleave_rule_custom)

        shuf_map = dict()
        prots = []
        for p in fasta.read(db):
            if not p[0].startswith(prefix):
                target_peptides = [x[1] for x in parser.icleave(p[1], cleave_rule_custom, 0)]
                checked_peptides = set()
                sample_list = []
                for idx, pep in enumerate(target_peptides):
                    if len(pep) > 2:
                        pep_tmp = pep[1:-1]
                        if extra_check:
                            for bp in banned_pairs:
                                if bp in pep_tmp:
                                    pep_tmp = pep_tmp.replace(bp, '')
                                    checked_peptides.add(idx)
                        sample_list.extend(pep_tmp)
                random.shuffle(sample_list)
                idx_for_shuffle = 0

                decoy_peptides = []
                for idx, pep in enumerate(target_peptides):
                    if len(pep) > 2:
                        if pep in shuf_map:
                            tmp_seq = shuf_map[pep]
                        else:
                            if not extra_check or idx not in checked_peptides:
                                tmp_seq = pep[0]
                                for pep_aa in pep[1:-1]:
                                    tmp_seq += sample_list[idx_for_shuffle]
                                    idx_for_shuffle += 1
                                tmp_seq += pep[-1]
                            else:
                                max_l = len(pep)
                                tmp_seq = ''
                                ii = 0
                                while ii < max_l - 1:
                                    # for ii in range(max_l-1):
                                    if pep[ii] in banned_aa and pep[ii + 1] in banned_aa \
                                            and pep[ii] + pep[ii + 1] in banned_pairs:
                                        tmp_seq += pep[ii] + pep[ii + 1]
                                        ii += 1
                                    else:
                                        if ii == 0:
                                            tmp_seq += pep[ii]
                                        else:
                                            tmp_seq += sample_list[idx_for_shuffle]
                                            idx_for_shuffle += 1
                                        ii += 1
                                tmp_seq += pep[max_l - 1]
                            shuf_map[pep] = tmp_seq
                    else:
                        tmp_seq = pep
                    decoy_peptides.append(tmp_seq)
                assert len(target_peptides) == len(decoy_peptides)

                prots.append((p[0], ''.join(target_peptides)))
                prots.append(('DECOY_' + p[0], ''.join(decoy_peptides)))

        fasta.write(prots, open(out_db, 'w')).close()
        args['d'] = out_db
        args['ad'] = 0
    return args
                 sep='\t', header=None,
                 names=['sb1', 'sb2', 'genome', 'start', 'end', 'maxlen']).drop(0)
    dataset = os.listdir(args.orgs + org + '/All/')
    for refsta in strains:
        for strain in dataset:
            if '.DS_Store' in strain:
                continue
            if refsta in strain[:-12]:
                print('Analyzing {} ...'.format(refsta))
                edge_cake = ''
                cake = ''
                seq = ''
                SBs = ''
                coord_g = coord.loc[coord.genome == strain[:-12]]
                fasta = read(args.orgs + org + '/All/' + strain)
                for line in fasta:
                    seq += line.sequence
                for i in coord_g.index:
                    if i != 0:
                        if int(coord_g.start[i]) > int(coord_g.end[i]):
                            seq_i = seq[int(coord_g.start[i]):int(coord_g.end[i]):-1]
                        else:
                            seq_i = seq[int(coord_g.start[i]):int(coord_g.end[i])]
                        if args.cut[-1] == '%':
                            if len(seq_i) < 50:
                                continue
                            left_edge = seq_i[:int(len(seq_i) * float(args.cut[:-1]) * 0.01)]
def run():
    parser = argparse.ArgumentParser(
        description='run DirectMS1quant for ms1searchpy results',
        epilog='''

Example usage
-------------
$ directms1quant -S1 sample1_1_proteins_full.tsv sample1_n_proteins_full.tsv
    -S2 sample2_1_proteins_full.tsv sample2_n_proteins_full.tsv
-------------
''',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-S1', nargs='+', help='input files for S1 sample', required=True)
    parser.add_argument('-S2', nargs='+', help='input files for S2 sample', required=True)
    parser.add_argument('-out', help='name of DirectMS1quant output file',
                        default='directms1quant_out')
    parser.add_argument('-min_samples',
                        help='minimum number of samples for peptide usage. 0 means 50%% of input files',
                        default=0)
    parser.add_argument('-fold_change', help='FC threshold standard deviations',
                        default=3.0, type=float)
    parser.add_argument('-fold_change_abs',
                        help='Use absolute log2 scale FC threshold instead of standard deviations',
                        action='store_true')
    parser.add_argument('-qval', help='qvalue threshold', default=0.05, type=float)
    parser.add_argument('-intensity_norm',
                        help='Intensity normalization: 0-none, 1-median',
                        default=1, type=int)
    parser.add_argument('-all_proteins',
                        help='use all proteins instead of FDR controlled',
                        action='store_true')
    parser.add_argument('-all_pfms', help='use all PFMs instead of ML controlled',
                        action='store_true')
    parser.add_argument('-output_peptides', help='Add output table with peptides',
                        action='store_true')
    parser.add_argument('-d', '-db', help='path to uniprot fasta file for gene annotation')
    args = vars(parser.parse_args())

    logging.basicConfig(format='%(levelname)9s: %(asctime)s %(message)s',
                        datefmt='[%H:%M:%S]', level=logging.INFO)
    logger = logging.getLogger(__name__)

    replace_label = '_proteins_full.tsv'
    fold_change = float(args['fold_change'])
    df_final = False
    all_s_lbls = {}

    allowed_prots = set()
    allowed_prots_all = set()
    allowed_peptides = set()

    for i in range(1, 3, 1):
        sample_num = 'S%d' % (i, )
        if args[sample_num]:
            for z in args[sample_num]:
                if not args['all_proteins']:
                    df0 = pd.read_table(z.replace('_proteins_full.tsv', '_proteins.tsv'))
                    allowed_prots.update(df0['dbname'])
                    allowed_prots.update(['DECOY_' + z for z in df0['dbname'].values])
                else:
                    df0 = pd.read_table(z)
                    allowed_prots.update(df0['dbname'])

                df0 = pd.read_table(z.replace('_proteins_full.tsv', '_PFMs_ML.tsv'))
                if not args['all_pfms']:
                    df0 = df0[df0['qpreds'] <= 10]
                allowed_peptides.update(df0['seqs'])

    logger.info('Total number of TARGET protein GROUPS: %d', len(allowed_prots) / 2)

    for i in range(1, 3, 1):
        sample_num = 'S%d' % (i, )
        if args.get(sample_num, 0):
            for z in args[sample_num]:
                df3 = pd.read_table(z.replace(replace_label, '_PFMs.tsv'))
                df3 = df3[df3['sequence'].apply(lambda x: x in allowed_peptides)]
                df3_tmp = df3[df3['proteins'].apply(
                    lambda x: any(z in allowed_prots for z in x.split(';')))]
                for dbnames in set(df3_tmp['proteins'].values):
                    for dbname in dbnames.split(';'):
                        allowed_prots_all.add(dbname)

    for i in range(1, 3, 1):
        sample_num = 'S%d' % (i, )
        if args.get(sample_num, 0):
            all_s_lbls[sample_num] = []
            for z in args[sample_num]:
                label = z.replace(replace_label, '')
                all_s_lbls[sample_num].append(label)
                df3 = pd.read_table(z.replace(replace_label, '_PFMs.tsv'))
                df3 = df3[df3['sequence'].apply(lambda x: x in allowed_peptides)]

                # allowed_prots2 = set()
                # df3_tmp = df3[df3['proteins'].apply(lambda x: any(z in allowed_prots for z in x.split(';')))]
                # for dbnames in set(df3_tmp['proteins'].values):
                #     for dbname in dbnames.split(';'):
                #         allowed_prots2.add(dbname)
                # print('!', len(allowed_prots), len(allowed_prots2))
                # df3 = df3[df3['proteins'].apply(lambda x: any(z in allowed_prots for z in x.split(';')))]

                df3 = df3[df3['proteins'].apply(
                    lambda x: any(z in allowed_prots_all for z in x.split(';')))]
                df3['proteins'] = df3['proteins'].apply(
                    lambda x: ';'.join([z for z in x.split(';') if z in allowed_prots_all]))
                ### df3['proteins'] = df3['proteins'].apply(lambda x: ';'.join([z for z in x.split(';') if z in allowed_prots]))

                df3['origseq'] = df3['sequence']
                df3['sequence'] = df3['sequence'] + df3['charge'].astype(int).astype(str) \
                    + df3['ion_mobility'].astype(str)
                df3 = df3.sort_values(by='Intensity', ascending=False)
                df3 = df3.drop_duplicates(subset='sequence')
                df3[label] = df3['Intensity']
                df3['protein'] = df3['proteins']
                df3['peptide'] = df3['sequence']
                df3 = df3[['origseq', 'peptide', 'protein', label]]

                if df_final is False:
                    df_final = df3.reset_index(drop=True)
                else:
                    df_final = df_final.reset_index(drop=True).merge(
                        df3.reset_index(drop=True), on='peptide', how='outer')
                    df_final.protein_x.fillna(value=df_final.protein_y, inplace=True)
                    df_final.origseq_x.fillna(value=df_final.origseq_y, inplace=True)
                    df_final['protein'] = df_final['protein_x']
                    df_final['origseq'] = df_final['origseq_x']
                    df_final = df_final.drop(columns=['protein_x', 'protein_y'])
                    df_final = df_final.drop(columns=['origseq_x', 'origseq_y'])

    logger.info('Total number of peptide sequences used in quantitation: %d',
                len(set(df_final['origseq'])))
    # print('Total number of proteins used in quantitation %d' % (len(allowed_prots_all), ))

    df_final = df_final.assign(protein=df_final['protein'].str.split(';')) \
        .explode('protein').reset_index(drop=True)
    df_final = df_final.set_index('peptide')
    df_final['proteins'] = df_final['protein']
    df_final = df_final.drop(columns=['protein'])
    # cols = df_final.columns.tolist()
    cols = [z for z in df_final.columns.tolist() if not z.startswith('mz_')]
    cols.remove('proteins')
    cols.insert(0, 'proteins')
    df_final = df_final[cols]

    all_lbls = all_s_lbls['S1'] + all_s_lbls['S2']
    df_final_copy = df_final.copy()

    custom_min_samples = int(args['min_samples'])
    if custom_min_samples == 0:
        custom_min_samples = int(len(all_lbls) / 2)

    df_final = df_final_copy.copy()
    max_missing = len(all_lbls) - custom_min_samples
    logger.info('Allowed max number of missing values: %d', max_missing)
    df_final['nummissing'] = df_final.isna().sum(axis=1)
    df_final['nonmissing'] = df_final['nummissing'] <= max_missing
    df_final = df_final[df_final['nonmissing']]
    logger.info('Total number of PFMs passed missing values threshold: %d', len(df_final))

    df_final['S2_mean'] = df_final[all_s_lbls['S2']].mean(axis=1)
    df_final['S1_mean'] = df_final[all_s_lbls['S1']].mean(axis=1)
    df_final['FC_raw'] = np.log2(df_final['S2_mean'] / df_final['S1_mean'])
    FC_max = df_final['FC_raw'].max()
    FC_min = df_final['FC_raw'].min()
    df_final.loc[(pd.isna(df_final['S2_mean'])) & (~pd.isna(df_final['S1_mean'])), 'FC_raw'] = FC_min
    df_final.loc[(~pd.isna(df_final['S2_mean'])) & (pd.isna(df_final['S1_mean'])), 'FC_raw'] = FC_max

    if args['intensity_norm'] == 1:
        for cc in all_lbls:
            # print(cc, df_final[cc].median())
            df_final[cc] = df_final[cc] / df_final[cc].median()

    df_final['S2_mean'] = df_final[all_s_lbls['S2']].mean(axis=1)
    df_final['S1_mean'] = df_final[all_s_lbls['S1']].mean(axis=1)

    for cc in all_lbls:
        df_final[cc] = df_final[cc].fillna(df_final[cc].min())

    df_final['p-value'] = list(
        ttest_ind(df_final[all_s_lbls['S1']].values.astype(float),
                  df_final[all_s_lbls['S2']].values.astype(float),
                  axis=1, nan_policy='omit', equal_var=True)[1])
    df_final['p-value'] = df_final['p-value'].astype(float)
    df_final['p-value'] = df_final['p-value'].fillna(1.0)

    p_val_threshold = 0.05
    df_final['sign'] = df_final['p-value'] <= p_val_threshold

    df_final['intensity_median'] = df_final[all_s_lbls['S1'] + all_s_lbls['S2']].median(axis=1)
    df_final['FC'] = np.log2(df_final['S2_mean'] / df_final['S1_mean'])

    df_final_for_calib = df_final.copy()
    df_final_for_calib = df_final_for_calib[~pd.isna(df_final_for_calib['S1_mean'])]
    df_final_for_calib = df_final_for_calib[~pd.isna(df_final_for_calib['S2_mean'])]
    df_final_for_calib = df_final_for_calib[~df_final_for_calib['sign']]

    FC_max = df_final['FC'].max()
    FC_min = df_final['FC'].min()
    df_final.loc[(pd.isna(df_final['S2_mean'])) & (~pd.isna(df_final['S1_mean'])), 'FC'] = FC_min
    df_final.loc[(~pd.isna(df_final['S2_mean'])) & (pd.isna(df_final['S1_mean'])), 'FC'] = FC_max

    df_final['decoy'] = df_final['proteins'].apply(
        lambda x: all(z.startswith('DECOY_') for z in x.split(';')))

    from scipy.stats import scoreatpercentile
    from scipy.optimize import curve_fit
    from scipy import exp

    def noisygaus(x, a, x0, sigma, b):
        return a * exp(-(x - x0) ** 2 / (2 * sigma ** 2)) + b

    def calibrate_mass(bwidth, mass_left, mass_right, true_md):
        bbins = np.arange(-mass_left, mass_right, bwidth)
        H1, b1 = np.histogram(true_md, bins=bbins)
        b1 = b1 + bwidth
        b1 = b1[:-1]
        popt, pcov = curve_fit(noisygaus, b1, H1, p0=[1, np.median(true_md), 1, 1])
        mass_shift, mass_sigma = popt[1], abs(popt[2])
        return mass_shift, mass_sigma, pcov[0][0]

    try:
        FC_mean, FC_std, covvalue_cor = calibrate_mass(
            0.05, -df_final_for_calib['FC'].min(),
            df_final_for_calib['FC'].max(), df_final_for_calib['FC'])
    except:
        FC_mean, FC_std, covvalue_cor = calibrate_mass(
            0.1, -df_final_for_calib['FC'].min(),
            df_final_for_calib['FC'].max(), df_final_for_calib['FC'])
    # print('df_final_FC', FC_mean, FC_std)

    # FC_l = FC_mean-fold_change
    # FC_r = FC_mean+fold_change

    if not args['fold_change_abs']:
        fold_change = FC_std * fold_change
        logger.info('Absolute FC threshold = %.2f', fold_change)

    FC_l = -fold_change
    FC_r = fold_change

    df_final['up'] = df_final['sign'] * (df_final['FC'] >= FC_r)
    df_final['down'] = df_final['sign'] * (df_final['FC'] <= FC_l)

    df_final = df_final.sort_values(by=['nummissing', 'intensity_median'],
                                    ascending=(True, False))
    df_final = df_final.drop_duplicates(subset=('origseq', 'proteins'))

    up_dict = df_final.groupby('proteins')['up'].sum().to_dict()
    down_dict = df_final.groupby('proteins')['down'].sum().to_dict()
    ####### !!!!!!! #######
    df_final['up'] = df_final.apply(
        lambda x: x['up'] if up_dict.get(x['proteins'], 0) >= down_dict.get(x['proteins'], 0)
        else x['down'], axis=1)

    protsN = df_final.groupby('proteins')['up'].count().to_dict()
    prots_up = df_final.groupby('proteins')['up'].sum()

    N_decoy_total = df_final['decoy'].sum()
    upreg_decoy_total = df_final[df_final['decoy']]['up'].sum()

    p_up = upreg_decoy_total / N_decoy_total
    names_arr = np.array(list(protsN.keys()))

    logger.info('Total number of proteins used in quantitation: %d',
                sum(not z.startswith('DECOY_') for z in names_arr))
    logger.info('Total number of peptides: %d', len(df_final))
    logger.info('Total number of decoy peptides: %d', N_decoy_total)
    logger.info('Total number of significantly changed decoy peptides: %d', upreg_decoy_total)
    logger.info('Probability of random peptide to be significantly changed: %.3f', p_up)
    # print(N_decoy_total, upreg_decoy_total, p_up)

    if args['output_peptides']:
        df_final.to_csv(path_or_buf=args['out'] + '_quant_peptides.tsv',
                        sep='\t', index=False)

    v_arr = np.array(list(prots_up.get(k, 0) for k in names_arr))
    n_arr = np.array(list(protsN.get(k, 0) for k in names_arr))

    all_pvals = calc_sf_all(v_arr, n_arr, p_up)

    total_set = set()
    total_set_genes = set()

    FC_up_dict_basic = df_final.groupby('proteins')['FC'].median().to_dict()
    FC_up_dict_raw_basic = df_final.groupby('proteins')['FC_raw'].median().to_dict()

    df_final = df_final[df_final['up'] > 0]
    df_final['bestmissing'] = df_final.groupby('proteins')['nummissing'].transform('min')

    FC_up_dict = df_final[df_final['bestmissing'] == df_final['nummissing']].groupby(
        'proteins')['FC'].median().to_dict()
    FC_up_dict_raw = df_final[df_final['bestmissing'] == df_final['nummissing']].groupby(
        'proteins')['FC_raw'].median().to_dict()
    # FC_up_dict = df_final.groupby('proteins')['FC'].median().to_dict()

    df_out = pd.DataFrame()
    df_out['score'] = all_pvals
    df_out['dbname'] = names_arr
    df_out['FC'] = df_out['dbname'].apply(lambda x: FC_up_dict.get(x))
    df_out['FC_raw'] = df_out['dbname'].apply(lambda x: FC_up_dict_raw.get(x))

    df_out.loc[pd.isna(df_out['FC']), 'FC'] = df_out.loc[
        pd.isna(df_out['FC']), 'dbname'].apply(lambda x: FC_up_dict_basic.get(x))
    df_out.loc[pd.isna(df_out['FC_raw']), 'FC_raw'] = df_out.loc[
        pd.isna(df_out['FC_raw']), 'dbname'].apply(lambda x: FC_up_dict_raw_basic.get(x))

    df_out['v_arr'] = v_arr
    df_out['n_arr'] = n_arr

    df_out['decoy'] = df_out['dbname'].str.startswith('DECOY_')
    df_out = df_out[~df_out['decoy']]

    df_out['FC_pass'] = (df_out['FC'].abs() >= fold_change) & (df_out['v_arr'] > 0)

    df_out_BH_multiplier = df_out['FC_pass'].sum()
    qval_threshold = args['qval']

    df_out['p-value'] = 1.0
    df_out['BH_pass'] = False
    df_out = df_out.sort_values(by='score', ascending=False)

    df_out.loc[df_out['FC_pass'], 'BH_threshold'] = -np.log10(
        df_out.loc[df_out['FC_pass'], 'score'].rank(ascending=False, method='max')
        * qval_threshold / df_out_BH_multiplier)
    df_out.loc[df_out['FC_pass'], 'BH_pass'] = \
        df_out.loc[df_out['FC_pass'], 'score'] > df_out.loc[df_out['FC_pass'], 'BH_threshold']
    df_out.loc[df_out['FC_pass'], 'p-value'] = 10 ** (-df_out.loc[df_out['FC_pass'], 'score'])

    score_threshold = df_out[df_out['BH_pass']]['score'].min()
    df_out.loc[df_out['FC_pass'], 'BH_pass'] = \
        df_out.loc[df_out['FC_pass'], 'score'] >= score_threshold

    df_out.to_csv(path_or_buf=args['out'] + '_quant_full.tsv', sep='\t', index=False)

    df_out_f = df_out[(df_out['BH_pass']) & (df_out['FC_pass'])]
    df_out_f.to_csv(path_or_buf=args['out'] + '.tsv', sep='\t', index=False)

    genes_map = {}
    if args['d']:
        for prot, protseq in fasta.read(args['d']):
            try:
                prot_name = prot.split('|')[1]
            except:
                prot_name = prot
            try:
                gene_name = prot.split('GN=')[1].split(' ')[0]
            except:
                gene_name = prot
            genes_map[prot_name] = gene_name

    for z in set(df_out_f['dbname']):
        try:
            prot_name = z.split('|')[1]
        except:
            prot_name = z
        gene_name = genes_map.get(prot_name, prot_name)
        total_set.add(prot_name)
        total_set_genes.add(gene_name)

    logger.info('Total number of significantly changed proteins: %d', len(total_set))
    logger.info('Total number of significantly changed genes: %d', len(total_set_genes))

    f1 = open(args['out'] + '_proteins_for_stringdb.txt', 'w')
    for z in total_set:
        f1.write(z + '\n')
    f1.close()

    f1 = open(args['out'] + '_genes_for_stringdb.txt', 'w')
    for z in total_set_genes:
        f1.write(z + '\n')
    f1.close()
plt.rcParams["font.family"] = "Times New Roman"

parser = argparse.ArgumentParser(description='Use to find kmers')
parser.add_argument('-modifa', type=str, help='modifying_fasta', required=True)
parser.add_argument('-reffa', type=str, help='reference_fasta', required=True)
parser.add_argument('-save_way', type=str, help='path_of_save', required=True)
parser.add_argument('-image_save', type=str, help='image_save_path', required=True)
args = parser.parse_args()

ref_path = args.reffa
mpl.rcParams['figure.figsize'] = [12, 8]
print('Analyzing motif abundance of E. coli ...\n')
genome = read(args.reffa)
syntheny_bloks = read(args.modifa)
alphabet = ['A', 'T', 'G', 'C']
new_genome = ''
for line in genome:
    new_genome += line.sequence


def create_lib(r):
    return set([''.join(i) for i in permutations(alphabet * r, r=r)])


all_ends = ''
for line in syntheny_bloks:
    all_ends += line.sequence
check_list = {}
checker = []
for i in range(len(p)):
    if str(p['describtion'][i]) != 'nan':
        if str(p['describtion'][i]) != 'Description':
            check_list[str(p['describtion'][i])] = {}
            checker.append(str(p['describtion'][i]))
    else:
        break

strains = args.refpa
for sta in strains:
    if '.DS_Store' in sta:
        continue
    op_pa = read(args.refpa + '/All/{}'.format(sta))
    for line in op_pa:
        for num in check_list:
            if num in line.description:
                check_list[num] = [sta[:-12]]

last = []
for ind in check_list:
    last.append(check_list[ind])

print('Write graph ...')
for seq_id in graph:
    graph[seq_id]['blocks'].sort()
    blocks = graph[seq_id]['blocks']
    for i in range(len(blocks) - 1):
        if args.refpa is not None:
            out.write('sb{}\tsb{}\t{}\t{}\t{}\t{}\n'.format(
ratio_all_kmers = []
ratio_all_GATC = []
all_p_values_kmers = []
all_p_values_GATC = []
for kmer in create_lib(4):
    if kmer != 'GATC':
        for org in organisms:
            cake = ''
            end_genome = ''
            mid_genome = ''
            if '.DS_Store' in org:
                continue
            print('Analyzing {} ...'.format(org))
            mid_genome = read('{}{}/{}_middle_SB.fasta'.format(args.orgs, org, org))
            for line in mid_genome:
                mid_genome = line.sequence
            end_genome = read('{}{}/{}_ENDS_SB.fasta'.format(args.orgs, org, org))
            for line in end_genome:
                end_genome = line.sequence
            print('Counting {} ...'.format(kmer))
            lable_k.append(kmer)
            mid = mid_genome.count(kmer) * 100 / (len(mid_genome) - 5)
            end = end_genome.count(kmer) * 100 / (len(end_genome) - 5)
            ratio = end / mid
            ratio_all_kmers.append(ratio)
            cake += 'Frequency in the ends for every 4-mers\t{}\n'.format(end)
def test_read_and_write_long(self):
    with tempfile.TemporaryFile(mode='r+') as new_fasta_file:
        fasta.write(fasta.read(self.fasta_file), new_fasta_file)
        new_fasta_file.seek(0)
        new_entries = list(fasta.read(new_fasta_file))
        self.assertEqual(new_entries, self.fasta_entries_long)
def run():
    parser = argparse.ArgumentParser(
        description='calculate NSAF for scavager results',
        epilog='''

Example usage
-------------
$ scav2nsaf -S1 sample1_1_proteins.tsv sample1_n_proteins.tsv
    -S2 sample2_1_proteins.tsv sample2_n_proteins.tsv
-------------
''',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-S1', nargs='+', metavar='FILE',
                        help='input files for S1 sample', required=True)
    parser.add_argument('-S2', nargs='+', metavar='FILE', help='input files for S2 sample')
    parser.add_argument('-S3', nargs='+', metavar='FILE', help='input files for S3 sample')
    parser.add_argument('-S4', nargs='+', metavar='FILE', help='input files for S4 sample')
    parser.add_argument('-u', '--union', help='pool the files together for the samples',
                        action='store_true')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-a', '--autolabel',
                       help='in union mode, derive sample labels from common name prefixes',
                       action='store_true')
    group.add_argument('--labels', nargs='+', metavar='LABEL',
                       help='labels for samples in union mode (same number as samples)')
    parser.add_argument('-db', metavar='FILE', help='path to fasta file', required=True)
    parser.add_argument('-out', metavar='FILE', help='name of nsaf output file',
                        default='nsaf_out.txt')
    parser.add_argument('-version', action='version',
                        version='%s' % (pkg_resources.require("scavager")[0], ))
    args = vars(parser.parse_args())

    samples = ['S1', 'S2', 'S3', 'S4']
    labels = args['labels'] if args['labels'] else samples
    df_final = False

    allowed_prots = set()
    allowed_peptides = set()

    for sample_num in samples:
        if args[sample_num]:
            for z in args[sample_num]:
                df0 = pd.read_csv(z, sep='\t')
                allowed_prots.update(df0['dbname'])

    for sample_num in samples:
        if args[sample_num]:
            for z in args[sample_num]:
                df0 = pd.read_csv(z.replace('_proteins.tsv', '_peptides.tsv'), sep='\t')
                allowed_peptides.update(df0['peptide'])

    for sample_num, label in zip(samples, labels):
        if args[sample_num]:
            if not args['union']:
                for z in args[sample_num]:
                    df1 = read_table(z, allowed_peptides, allowed_prots)
                    if df_final is False:
                        df_final = df1
                    else:
                        df_final = df_final.reset_index().merge(
                            df1.reset_index(), on='peptide', how='outer')  # .set_index('peptide')
                        df_final.protein_x.fillna(value=df_final.protein_y, inplace=True)
                        df_final['protein'] = df_final['protein_x']
                        df_final = df_final.drop(
                            columns=['protein_x', 'protein_y', 'index_x', 'index_y'])
            else:
                if args['autolabel']:
                    label = os.path.commonprefix(args[sample_num]).rstrip('_')
                df1 = read_table(args[sample_num], allowed_peptides, allowed_prots, label=label)
                if df_final is False:
                    df_final = df1
                else:
                    df_final = df_final.reset_index().merge(
                        df1.reset_index(), on='peptide', how='outer')  # .set_index('peptide')
                    df_final.protein_x.fillna(value=df_final.protein_y, inplace=True)
                    df_final['protein'] = df_final['protein_x']
                    df_final = df_final.drop(
                        columns=['protein_x', 'protein_y', 'index_x', 'index_y'])

    df_final = df_final.set_index('peptide')
    df_final['proteins'] = df_final['protein']
    df_final = df_final.drop(columns=['protein'])
    cols = df_final.columns.tolist()
    cols.remove('proteins')
    cols.insert(0, 'proteins')
    df_final = df_final[cols]
    df_final = df_final.fillna(value='')
    cols = df_final.columns.difference(['proteins'])
    genres = df_final['proteins']  # .str.split(';')
    df_final = (df_final.loc[df_final.index.repeat(genres.str.len()), cols]
                .assign(dbname=list(chain.from_iterable(genres.tolist()))))
    df_final = df_final.groupby('dbname').sum()
    df_final.reset_index(level=0, inplace=True)

    protsL = {}
    for p in fasta.read(args['db']):
        dbn = p[0].split()[0]
        protsL[dbn] = len(p[1])

    df_final['Length'] = df_final['dbname'].apply(lambda z: protsL[z])
    for cc in df_final.columns:
        if cc not in ['dbname', 'Length']:
            df_final[cc] = df_final[cc] / df_final['Length']
    for cc in df_final.columns:
        if cc not in ['dbname', 'Length']:
            df_final[cc] = df_final[cc] / df_final[cc].sum()
            df_final[cc] = df_final[cc].replace(0, np.nan)
            min_val = np.nanmin(df_final[cc].values)
            df_final[cc] = df_final[cc].replace(np.nan, min_val)
    df_final.drop(columns=['Length'], inplace=True)
    df_final.to_csv(args['out'], sep='\t', index=False)
# file for getting peptides from other enzymes

## import libraries
import matplotlib.pyplot as plt
import numpy as np
from pyteomics import fasta, parser

print('hello world')

databasePath = 'C:/Users/chris/OneDrive/Documents/bccrc/projectsRepository/sorensenLab/relatedToYbx1/design20200718_multipleProteaseSp3/uniprotHumanJul2020.fasta'

print('Cleaving the proteins with Lys-C (cleavage after K)...')
uniquePeptides = set()
for description, sequence in fasta.read(databasePath):
    newPeptides = parser.cleave(sequence, 'K', min_length=6)
    uniquePeptides.update(newPeptides)
print('Done, {0} sequences obtained!'.format(len(uniquePeptides)))
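# For comparison (not in the original script): the same digest with the
# built-in trypsin rule, which cleaves after K/R with the proline exception.
tryptic = set()
for description, sequence in fasta.read(databasePath):
    tryptic.update(parser.cleave(sequence, parser.expasy_rules['trypsin'], min_length=6))
print('Done, {0} tryptic sequences obtained!'.format(len(tryptic)))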
import argparse
import csv
import sys

import matplotlib.pyplot as plt
import pandas as pd
from pyteomics.fasta import read
from scipy.stats import mannwhitneyu

plt.style.use('seaborn')

parser = argparse.ArgumentParser('Mann-Whitney test')
parser.add_argument('-stain', help='Path to organisms', required=True)
parser.add_argument('-coord', help='Path to coordinate', required=True)
parser.add_argument('-save_way', help='Path to save fold', required=True)
args = parser.parse_args()

print('Starting analysis ...')
fasta = read(args.stain)
coord = open(args.coord)
read_tsv = pd.DataFrame(csv.reader(coord, delimiter="\t"))
read_tsv = read_tsv.T.drop(0)

Methyla_coordinates = []
for ind in range(len(read_tsv)):
    if ind == 0:
        continue
    Methyla_coordinates.append([int(read_tsv[0][ind]), int(read_tsv[1][ind])])

SB_coordinates = []
SBs = read(args.stain)
SB_dict = {}
for line in SBs:
    if 'GCF_000005845.2_ASM584v2' in line.description:
# In python 3, use 'from urllib.request import urlretrieve' instead
from urllib import urlretrieve

import gzip
import os

import matplotlib.pyplot as plt
import numpy as np
from pyteomics import fasta, parser, mass, achrom, electrochem, auxiliary

if not os.path.isfile('yeast.fasta.gz'):
    print 'Downloading the FASTA file for Saccharomyces cerevisiae...'
    urlretrieve(
        'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/'
        'knowledgebase/proteomes/YEAST.fasta.gz',
        'yeast.fasta.gz')
    print 'Done!'

print 'Cleaving the proteins with trypsin...'
unique_peptides = set()
for description, sequence in fasta.read(gzip.open('yeast.fasta.gz')):
    new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
    unique_peptides.update(new_peptides)
print 'Done, {0} sequences obtained!'.format(len(unique_peptides))

peptides = [{'sequence': i} for i in unique_peptides]

print 'Parsing peptide sequences...'
for peptide in peptides:
    peptide['parsed_sequence'] = parser.parse(peptide['sequence'],
                                              show_unmodified_termini=True)
    peptide['length'] = parser.length(peptide['parsed_sequence'])
print 'Done!'

peptides = [peptide for peptide in peptides if peptide['length'] <= 100]
    res_df.columns = ["ID1", "ID2", "mass", "mz(z1)", "mz(z2)", "mz(z3)", "mz(z4)"]
    res_df['ID1'] = res_df['ID1'].astype(int)
    res_df['ID2'] = res_df['ID2'].astype(int)
    b = time.time()
    print(np.round((b - a) / 60., 2))
    return res_df


# set input params
idname = "test"
fasta_file = "test.fasta"
min_length = 6
protease = "trypsin"
crosslinker_mass = 138.06807961

# read database
target = fasta.read(fasta_file)
decoys = fasta.read(fasta_file)

# generate peptides
target_df = digest(target, "Target", min_length=min_length, protease=protease)
decoy_df = digest(decoys, "Decoy", min_length=min_length, protease=protease)
df = pd.concat([target_df, decoy_df])
df = df.reset_index()
df.to_csv("{}_PeptidesDigest.csv".format(idname))
pairs = generate_peptidepairs(df, crosslinker_mass, idname)
# df_masses = pd.read_csv("test_masses.csv")
arg_parser.add_argument('-m', '--min', type=int, default=6,
                        help='minimal length of the peptide to report')
arg_parser.add_argument('-M', '--missed', type=int, default=0,
                        help='number of missed cleavages')
arg_parser.add_argument('-e', '--enz', type=str, default='trypsin',
                        help='protease to use for digestion')
arg_parser.add_argument('-v', '--verbosity', action='count',
                        help='increase output verbosity')
args = arg_parser.parse_args()

# TODO: Do it the proper way - using os.path
out_file = args.fasta_file + '.peptides'

peptides = []
with fasta.read(args.fasta_file) as reader, open(out_file, 'w') as writer:
    # Build a set of peptides for each fasta sequence
    if args.verbosity >= 1:
        print 'Building digests...'
    for description, sequence in reader:
        peps = parser.cleave(sequence, parser.expasy_rules[args.enz], args.missed)
        peps = [x for x in peps if len(x) > args.min]
        writer.write('Peptides for {seq} ({enz} cleavage)\n'.format(
            seq=description, enz=args.enz))
        writer.write('...\t{n} missed cleavages\n'.format(n=args.missed))
        writer.write('\n'.join(peps) + '\n')
        peptides.append(set(peps))
        if args.verbosity >= 2:
            print '...\t{n} peptides for {prot}'.format(n=len(peps), prot=description)
organisms = os.listdir(args.orgs)
part = ''
condition = ''
coordinate_of_split = ''
check_list = {}

# Making spike
for org in organisms:
    if '.DS_Store' in org:
        continue
    strains = os.listdir('{}{}/All/'.format(args.orgs, org))
    for strain in strains:
        if '.DS_Store' in strain:
            continue
        check_list[strain[:-12]] = {}
        fna = read('{}{}/All/{}'.format(args.orgs, org, strain))
        counter = 0
        index = 0
        for line in fna:
            counter += len(line.sequence)
            if index != 0:
                check_list[strain[:-12]][line.description.split()[0]] = counter
            else:
                check_list[strain[:-12]][line.description.split()[0]] = 0
            index += 1

for organism in SBs:
    if '.DS_store' in organism:
        continue
    for an_org in coord_prot:
        if '.DS_Store' in an_org:
            continue
        if 'LCB' in miorg:
            print('Analyzing {} ...'.format(miorg[4:-6]))
            save_file = open(
                args.save_way + '{}_{}&{}.txt'.format(miorg[4:-6], k, k[::-1]), 'w')
        elif 'SB' in miorg:
            print('Analyzing {} ...'.format(miorg[3:-6]))
            save_file = open(
                args.save_way + '{}_{}&{}.txt'.format(miorg[3:-6], k, k[::-1]), 'w')
        else:
            print('Analyzing ...')
            save_file = open(args.save_way, 'w')
        fasta = read(args.genome + miorg)
        for SB in fasta:
            length = len(SB.sequence)
            if length > 1000:
                rich = ((SB.sequence.count(k[::-1]) + SB.sequence.count(k)) * 100) \
                    / (length - (len(k) - 1))
                save_file.write(split(split(SB.description)[0])[1] + '\t'
                                + str(len(SB.sequence)) + '\t' + str(rich) + '\n')
        save_file.close()
        print('Text file was saved!')
else:
    for k in mixer(kmer):
        if k in controler:
def prot_gen(args):
    db = args['d']

    with fasta.read(db) as f:
        for p in f:
            yield p
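# Usage sketch: `args` is a dict holding the FASTA path under 'd'
# ('example.fasta' is a hypothetical file name).
for header, seq in prot_gen({'d': 'example.fasta'}):
    print(header.split()[0], len(seq))
    break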
ref_path = os.listdir(args.reffa)
k = args.k
if args.data_type == 'fold' and k <= 3:
    while not k > 3:
        if k > 2:
            mpl.rcParams['font.size'] = 5
        if k > 1:
            mpl.rcParams['figure.figsize'] = [25, 8]
        else:
            mpl.rcParams['figure.figsize'] = [15, 8]
        for path in ref_path:
            if '.DS_Store' in path:
                continue
            print('Analyzing {}\n'.format(path))
            genome = read(args.reffa + '{}/{}_middle_SB.fasta'.format(path, path))
            syntheny_bloks = read(args.modifa + '{}/{}_ENDS_SB.fasta'.format(path, path))
            alphabet = ['A', 'T', 'G', 'C']
            mid_genome = ''
            for line in genome:
                mid_genome += line.sequence

            def create_lib(r):
                return set([''.join(i) for i in permutations(alphabet * r, r=r)])

            all_ends = ''
            for line in syntheny_bloks:
                all_ends += line.sequence
            print('Waiting...\n')
def get_cond_peps(file_in, fasta_file):
    cols = ['Sequence', 'Proteins', 'Experiment', 'Intensity (normalized)']
    raw, headers, inds = openfile(file_in, cols)
    unique_prots = []
    unique_conds = []
    unique_bio_reps = ['']
    count = 0
    prot_seqs = []
    no_fasta = []
    for a in raw:
        prots = a[inds[1]].split(';')
        count += 1
        for b in prots:
            prot = b.split('CON__')[-1]
            prot_seq = ''
            if prot not in unique_prots and [prot] not in no_fasta:
                fasta_test = False
                # print prot
                for description, sequence in fasta.read(fasta_file):
                    if '|' + prot + '|' in description:
                        fasta_test = True
                        print 'Fasta match found:', prot, count, '/', len(raw)
                        prot_seq = sequence
                if fasta_test == True:
                    unique_prots.append(prot)
                    prot_seqs.append([prot, prot_seq])
                else:
                    print 'No FASTA match:', prot
                    no_fasta.append([prot])
        if a[inds[2]] not in unique_conds:
            unique_conds.append(a[inds[2]])
        # if a[inds[4]] not in unique_bio_reps:
        #     unique_bio_reps.append(a[inds[4]])

    headers_out = ['Protein', 'Experiment', 'Passing Peptides', 'FASTA seq',
                   'Intensities (normalized)']
    out = []
    for prot in unique_prots:
        for cond in unique_conds:
            for bio_rep in unique_bio_reps:
                pep_list = []
                intens_list = []
                for a in raw:
                    if prot in a[inds[1]]:
                        if cond == a[inds[2]]:
                            if 1 == 1:  # placeholder for a biological-replicate check
                                pep_list.append(a[inds[0]])
                                intens_list.append(a[inds[3]])
                pep_str = ''
                for i in pep_list:
                    pep_str += i + ';'
                pep_str = pep_str[:-1]
                intens_str = ''
                for i in intens_list:
                    intens_str += i + ';'
                intens_str = intens_str[:-1]
                for b in prot_seqs:
                    if b[0] == prot:
                        prot_seq = b[1]
                entry = [prot, cond, pep_str, prot_seq, intens_str]
                out.append(entry)
                print entry
    file_out = file_in.split('.txt')[0] + '_passpeps.txt'
    savefile(file_out, out, headers_out)
    description was replaced by the gene name.
    '''
    descr, seq = e
    m = GN_PATTERN.search(descr)
    new_descr = m.group('gene') if m and m.group('gene') else descr
    return Protein(new_descr, seq)


parser = argparse.ArgumentParser()
parser.add_argument('files', type=str, nargs='+',
                    help='.fasta files containing plink output')
parser.add_argument('-o', '--output', type=str, default=None,
                    help='file to write output to.')
parser.add_argument('--gname', action='store_true',
                    help='only leave gene name (GN) in the output fasta header')
parser.add_argument('-v', '--verbosity', action='count',
                    help='increase output verbosity')
args = parser.parse_args()

unique_entries = set()
for filename in args.files:
    if args.verbosity > 0:
        print 'Processing {0}...'.format(filename)
    for entry in fasta.read(open(filename)):
        if args.gname:
            entry = get_gene_name(entry)
        unique_entries.add(entry)
    if args.verbosity > 1:
        print '\t... found {0} unique entries so far'.format(len(unique_entries))

fasta.write(unique_entries, output=args.output)