Example #1
 def test_read_and_write_fasta_short(self):
     with tempfile.TemporaryFile(mode='r+') as new_fasta_file:
         fasta.write(fasta.read(self.fasta_file, ignore_comments=True),
                     new_fasta_file)
         new_fasta_file.seek(0)
         new_entries = list(fasta.read(new_fasta_file,
                                       ignore_comments=True))
         self.assertEqual(new_entries, self.fasta_entries_short)
Example #2
def getAllProts(fastaDir):
    fastaData = fas.read(fastaDir)
    listOfProts = []
    for prot in fastaData:
        listOfProts.append(prot.sequence)

    return listOfProts
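Here `fas` is presumably pyteomics.fasta imported under an alias; a minimal usage sketch under that assumption (the path is a placeholder):

from pyteomics import fasta as fas

sequences = getAllProts('proteins.fasta')  # hypothetical FASTA path
print(len(sequences))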
Example #3
def process_fasta(df, path_to_fasta, decoy_prefix, decoy_infix=False):
    protsS = dict()
    decoy_check_flag = False
    for x in fasta.read(path_to_fasta):
        dbname = x[0].split(' ')[0]
        if not decoy_check_flag:
            if (not decoy_infix and dbname.startswith(decoy_prefix)) or (
                    decoy_infix and decoy_infix in dbname):
                decoy_check_flag = True
        protsS[dbname] = x[1]
    df['sequence'] = df['dbname'].apply(
        lambda x: protsS.get(x, protsS.get(x.split(' ')[0], '')))
    if not decoy_check_flag:
        # No decoy entries were found in the FASTA file: strip the decoy tag
        # from the name and look up the corresponding target sequence instead.
        tag = decoy_infix if decoy_infix else decoy_prefix
        df['sequence'] = df.apply(
            lambda x: x['sequence'] if x['sequence'] else protsS.get(
                x['dbname'].replace(tag, ''),
                protsS.get(x['dbname'].split(' ')[0].replace(tag, ''), '')),
            axis=1)

    return df
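A hypothetical call, assuming `df` carries a 'dbname' column and decoys use a 'DECOY_' prefix (path and names are placeholders):

import pandas as pd

df = pd.DataFrame({'dbname': ['sp|P12345|TEST', 'DECOY_sp|P12345|TEST']})
df = process_fasta(df, 'proteins.fasta', decoy_prefix='DECOY_')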
Example #4
def peptide_db_graph(peps, db, id_regex=None):
    ''' search a set of peptides against a FASTA database  '''
    g = nx.Graph()
    prot_dict = dict()
    for header, seq in fasta.read(db):
        seq = seq.replace('I', 'L').upper()  # convert DB sequence I -> L
        prot_id = header.split()[0]
        if id_regex is not None:
            find_id = re.findall(id_regex, header)
            if len(find_id) > 0:
                prot_id = find_id[0]
        prot_dict[prot_id] = seq

    def _map_seq(p):
        pairs = []
        for prot_id, seq in prot_dict.items():
            if p in seq:
                pairs.append([p, prot_id])
        return pairs

    for p in peps:
        ppps = _map_seq(p)
        if len(ppps):
            g.add_edges_from(ppps)
    return g
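A hypothetical call with placeholder peptides and database path; note that matching happens after the function's I -> L conversion, so query peptides should use L as well:

g = peptide_db_graph(['PEPTLDE', 'GLSDGEWQLVLNVWGK'], 'proteins.fasta')
print(list(g.edges()))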
Example #5
def ingest_fasta(input_filename):
    """Ingest an fasta file given its name and return a dataframe of the file
    """
    with fasta.read(input_filename) as reader:
        for entry in reader:
            prot_list = [[item.description.split("|")[0]+":"+item.description.split("|")[1],
                          item.description.split("|")[3],item.description.split("|")[4],item.sequence] for item in reader]
    df = pd.DataFrame(prot_list,columns=["GeneInfo ID","Accession","Description","Sequence"])
    return df
Example #6
def make_reverse_fasta(input_file, output_file):
    '''
    Takes a FASTA file as input, drops all existing _REVERSED proteins and creates new _REVERSED decoy proteins.
    '''
    prots = []
    for prot_desc, prot_seq in fasta.read(input_file):
        if not prot_desc.endswith('_REVERSED'):
            prots.append((prot_desc, prot_seq))
            prots.append((prot_desc + '_REVERSED', smart_reverse(prot_seq)))
    fasta.write(prots, output_file, file_mode='w')
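smart_reverse is not shown in this snippet; a minimal sketch, assuming it simply reverses the sequence (pyteomics also provides fasta.reverse and fasta.decoy_db for decoy generation):

def smart_reverse(seq):
    # Assumed behavior: plain reversal of the protein sequence.
    return seq[::-1]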
Example #7
def _load_fasta(db, id_regex):
    global prot_dict
    prot_dict = dict()
    for header, seq in fasta.read(db):
        seq = seq.replace('I', 'L').upper()  # convert DB sequence I -> L
        prot_id = header.split()[0]
        if id_regex is not None:
            find_id = re.findall(id_regex, header)
            if len(find_id) > 0:
                prot_id = find_id[0]
        prot_dict[prot_id] = seq
Example #8
def digestProteinFromFASTA():
    # `options` is expected to come from the enclosing script's argument parsing.
    sequenceIter = fasta.read(source=options.fasta)
    uniquePeptides = set()
    for s in sequenceIter:
        newPeptides = parser.cleave(s.sequence,
                                    'trypsin',
                                    missed_cleavages=options.missed,
                                    min_length=options.minLength)
        uniquePeptides.update(newPeptides)

    uniquePeptides = list(uniquePeptides)
    return [Peptide(x) for x in uniquePeptides]
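`options` and `Peptide` come from the enclosing script (fasta and parser are the pyteomics modules); a minimal stand-in sketch, with all names and values here being assumptions:

from argparse import Namespace

options = Namespace(fasta='proteins.fasta', missed=2, minLength=7)  # hypothetical settings

class Peptide:
    def __init__(self, sequence):
        self.sequence = sequence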
Example #9
def split_fasta_decoys(db, decoy_prefix, decoy_infix=None):
    decoy_dbnames = set()
    with fasta.read(db) as f:
        for protein in f:
            dbname = protein.description.split()[0]
            if (decoy_infix and decoy_infix in dbname) or dbname.startswith(decoy_prefix):
                decoy_dbnames.add(dbname)
    decoy_dbnames = sorted(decoy_dbnames)
    random.seed(SEED)
    all_decoys_2 = set(random.sample(decoy_dbnames, len(decoy_dbnames) // 2))
    logger.debug('Marking %s out of %s decoys as decoy2', len(all_decoys_2), len(decoy_dbnames))
    return all_decoys_2
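SEED and logger are module-level names in the original file; plausible stand-ins for a self-contained run (the seed value is an assumption):

import logging
import random

SEED = 42  # assumed value
logger = logging.getLogger(__name__)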
Example #10
 def test_write_decoy_db(self):
     with tempfile.TemporaryFile(mode='r+') as decdb:
         fasta.write_decoy_db(self.fasta_file,
                              decdb,
                              decoy_only=False,
                              prefix='PREFIX_')
         decdb.seek(0)
         all_entries = list(fasta.read(decdb, False))
     self.assertEqual(
         all_entries,
         self.fasta_entries_long + [('PREFIX_' + a, b[::-1])
                                    for a, b in self.fasta_entries_long])
Example #11
def generate_db(fasta_file, bins):
    peptides = set()
    with fasta.read(fasta_file) as db:
        for _, protein in db:
            peptides |= generate_peptides(protein)

    mzs = list()
    for peptide in peptides:
        mzs.append(compute_mass_spectrum(peptide))

    intensity_matrix = lil_matrix((len(mzs), bins), dtype=numpy.int8)
    for i, spectrum in enumerate(mzs):
        intensity_matrix[i, :] = bin_spectrum(spectrum, bins=bins)

    peptides_vector = numpy.array(list(peptides))
    mzs_vector = numpy.array(mzs)
    return peptides_vector, mzs_vector, intensity_matrix
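generate_peptides, compute_mass_spectrum and bin_spectrum are defined elsewhere; a plausible sketch of the first using pyteomics.parser (the tryptic rule is an assumption):

from pyteomics import parser

def generate_peptides(protein):
    # Hypothetical helper: tryptic digest of one protein sequence.
    return parser.cleave(protein, parser.expasy_rules['trypsin'])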
Example #12
def background_maker(args):
#    print('Making background DB')
    # we want to build the background from the identified proteins
    bg_fasta = dict()
    # bg = defaultdict()
    background = set()
    with fasta.read(args.fasta) as f:
        for name, sequence in f:
            name_id = name.split('|')[1]
            extended_seq = ''.join(['-' * args.interval_length, sequence, '-' * args.interval_length])
            bg_fasta[name_id] = extended_seq
            mod_aa_indexes = re.finditer(args.modification_site, extended_seq)
            bg_intervals = [extended_seq[i.span()[0] - args.interval_length: i.span()[0] + args.interval_length + 1] for i in mod_aa_indexes]
            # bg[name_id] = bg_intervals
            background.update(bg_intervals)

    logging.info(u'Created a set of %s background intervals', len(background))
    logging.debug(u'Background DB is ready')    
    with open('bg.csv', 'w') as f:
        f.write('\n'.join(background))
    return pd.DataFrame([list(i) for i in background], columns=range(-args.interval_length, args.interval_length + 1)), bg_fasta   
Example #13
def read_fasta_sequences(fasta_file):
    """ Read sequence records from a FASTA file. """
    sequence_records = []
    for description, sequence in fasta.read(fasta_file):
        # Initialize sequence record with sequence string.
        sequence_record = {'sequence': sequence}

        # Get sequence info.
        description_parts = description.split()
        sequence_record['id'] = description_parts[0]

        # Get the sequence's peptides.
        sequence_record['peptides'] = parser.cleave(
            sequence, 
            parser.expasy_rules['trypsin'],
            missed_cleavages=1 #max no. of missed cleavages.
        )

        # Save the sequence record, keyed by the id.
        sequence_records.append(sequence_record)

    return sequence_records
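A hypothetical call with a placeholder path:

records = read_fasta_sequences('proteins.fasta')
print(records[0]['id'], len(records[0]['peptides']))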
Example #14
def build(fasta_file: str) -> Database:
    '''Create a Database namedtuple from a fasta file

    :param fasta_file: the full path to a fasta database file 
    :type fasta_file: str

    :returns: a Database object with the fasta file and protein fields filled in
    :rtype: Database
    '''

    db = Database(fasta_file)

    prots = defaultdict(list)

    # pull the name out
    get_name = lambda x: x.split('|')[-1].split()[0]

    for entry in fasta.read(fasta_file):
        p_name = get_name(entry.description)
        prots[p_name].append(entry)

    db = db._replace(proteins=prots)
    return db
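Database is a namedtuple defined elsewhere; a minimal sketch consistent with the Database(fasta_file) call and the _replace(proteins=...) update (field names are assumptions; defaults requires Python 3.7+):

from collections import namedtuple

# 'proteins' defaults to None so Database(fasta_file) works with one argument.
Database = namedtuple('Database', ['fasta_file', 'proteins'], defaults=[None])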
Example #15
def describe(args):
    """Read database and produce a summary"""
    logger.debug('describe called with: %s', args)
    try:
        dlist = [d for d, seq in fasta.read(args.file)]
    except Exception as e:
        logger.info('Not a valid FASTA file.')
        logger.debug('Exception: %s', e)
    else:
        logger.info('Found %s FASTA entries.', len(dlist))
        n = len(dlist)
        if n:
            logger.debug('First entry: %s', dlist[0])
            if n > 2:
                dlist.sort()
                prefix_1 = os.path.commonprefix(dlist[:n // 2])
                prefix_2 = os.path.commonprefix(dlist[n // 2 + 1:])
                if prefix_1 != prefix_2:
                    logger.info('Common prefixes: %s, %s', prefix_1, prefix_2)
                else:
                    logger.info('Common prefix: %s', prefix_1)
            formats = []
            for flavor in fasta.std_parsers:
                try:
                    fasta.parse(dlist[0], flavor=flavor)
                except Exception as e:
                    logger.debug('Header: %s; parsing exception: %s', dlist[0],
                                 e)
                else:
                    formats.append(flavor)
            k = len(formats)
            if not k:
                logger.info('Unknown header format.')
            elif k == 1:
                logger.info('Suggested header format: %s', formats[0])
            else:
                logger.info('Possible header formats: %s', ', '.join(formats))
Example #16
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 22 11:28:43 2013

@author: ilya
"""

from pyteomics import fasta, mgf, parser
import pylab

fasta_file = '/home/ilya/src/pyteomics/RhoEcoli.fasta'
mgf_file = '/home/ilya/src/pyteomics/MultiConsensus.mgf'

peptides = set()
with open(fasta_file) as fi:
    for description, sequence in fasta.read(fi):
        new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
        peptides.update(new_peptides)
        
print "UNIQUE PEPTIDES"
print peptides

with open(mgf_file) as fi:
    for spectrum in mgf.read(fi):
        pylab.figure()
        pylab.xlabel('m/z, Th')
        pylab.ylabel('Intensity, rel.units')
        pylab.bar(spectrum['m/z array'], spectrum['intensity array'], width=0.1, linewidth=2, edgecolor='black')
        pylab.show()
        inp = raw_input("Show more?")
        if inp != "yes":
            break
Example #17
def prepare_decoy_db(args):
    add_decoy = args['ad']
    if add_decoy:

        prefix = args['prefix']
        db = args['d']
        out1, out2 = os.path.splitext(db)
        out_db = out1 + '_shuffled' + out2
        logger.info('Creating decoy database: %s', out_db)

        extra_check = False
        if '{' in args['e']:
            extra_check = True
        if extra_check:
            banned_pairs = set()
            banned_aa = set()
            for enzyme_local in args['e'].split(','):
                if '{' in enzyme_local:
                    lpart, rpart = enzyme_local.split('|')
                    for aa_left, aa_right in itertools.product(
                            lpart[1:-1], rpart[1:-1]):
                        banned_aa.add(aa_left)
                        banned_aa.add(aa_right)
                        banned_pairs.add(aa_left + aa_right)

            logger.debug(banned_aa)
            logger.debug(banned_pairs)

        enzyme = get_enzyme(args['e'])
        cleave_rule_custom = enzyme + '|' + '([BXZUO])'
        # cleave_rule_custom = '([RKBXZUO])'
        logger.debug(cleave_rule_custom)

        shuf_map = dict()

        prots = []

        for p in fasta.read(db):
            if not p[0].startswith(prefix):
                target_peptides = [
                    x[1] for x in parser.icleave(p[1], cleave_rule_custom, 0)
                ]

                checked_peptides = set()
                sample_list = []
                for idx, pep in enumerate(target_peptides):

                    if len(pep) > 2:
                        pep_tmp = pep[1:-1]
                        if extra_check:
                            for bp in banned_pairs:
                                if bp in pep_tmp:
                                    pep_tmp = pep_tmp.replace(bp, '')
                                    checked_peptides.add(idx)

                        sample_list.extend(pep_tmp)
                random.shuffle(sample_list)
                idx_for_shuffle = 0

                decoy_peptides = []
                for idx, pep in enumerate(target_peptides):

                    if len(pep) > 2:

                        if pep in shuf_map:
                            tmp_seq = shuf_map[pep]
                        else:
                            if not extra_check or idx not in checked_peptides:
                                tmp_seq = pep[0]
                                for pep_aa in pep[1:-1]:
                                    tmp_seq += sample_list[idx_for_shuffle]
                                    idx_for_shuffle += 1
                                tmp_seq += pep[-1]
                            else:
                                max_l = len(pep)
                                tmp_seq = ''
                                ii = 0
                                while ii < max_l - 1:
                                    pair = pep[ii] + pep[ii + 1]
                                    if (pep[ii] in banned_aa
                                            and pep[ii + 1] in banned_aa
                                            and pair in banned_pairs):
                                        # Keep banned pairs intact instead of shuffling them.
                                        tmp_seq += pair
                                        ii += 1
                                    else:
                                        if ii == 0:
                                            tmp_seq += pep[ii]
                                        else:
                                            tmp_seq += sample_list[idx_for_shuffle]
                                            idx_for_shuffle += 1
                                    ii += 1
                                tmp_seq += pep[max_l - 1]

                            shuf_map[pep] = tmp_seq
                    else:
                        tmp_seq = pep

                    decoy_peptides.append(tmp_seq)

                assert len(target_peptides) == len(decoy_peptides)

                prots.append((p[0], ''.join(target_peptides)))
                prots.append(('DECOY_' + p[0], ''.join(decoy_peptides)))

        fasta.write(prots, open(out_db, 'w')).close()
        args['d'] = out_db
        args['ad'] = 0
    return args
Example #18
 coord = pd.read_csv(
     args.coord,  # hypothetical argument; the call head is truncated in this excerpt
     sep='\t',
     header=None,
     names=['sb1', 'sb2', 'genome', 'start', 'end', 'maxlen']).drop(0)
 dataset = os.listdir(args.orgs + org + '/All/')
 for refsta in strains:
     for strain in dataset:
         if '.DS_Store' in strain:
             continue
         if refsta in strain[:-12]:
             print('Analyzing {} ...'.format(refsta))
             edge_cake = ''
             cake = ''
             seq = ''
             SBs = ''
             coord_g = coord.loc[coord.genome == strain[:-12]]
             fasta = read(args.orgs + org + '/All/' + strain)
             for line in fasta:
                 seq += line.sequence
             for i in coord_g.index:
                 if i != 0:
                     if int(coord_g.start[i]) > int(coord_g.end[i]):
                         seq_i = seq[int(coord_g.start[i]
                                         ):int(coord_g.end[i]):-1]
                     else:
                         seq_i = seq[int(coord_g.start[i]):int(coord_g.
                                                               end[i])]
                     if args.cut[-1] == '%':
                         if len(seq_i) < 50:
                             continue
                         left_edge = seq_i[:int(
                             len(seq_i) * float(args.cut[:-1]) * 0.01)]
Example #19
def run():
    parser = argparse.ArgumentParser(
        description='run DirectMS1quant for ms1searchpy results',
        epilog='''

    Example usage
    -------------
    $ directms1quant -S1 sample1_1_proteins_full.tsv sample1_n_proteins_full.tsv -S2 sample2_1_proteins_full.tsv sample2_n_proteins_full.tsv
    -------------
    ''',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-S1',
                        nargs='+',
                        help='input files for S1 sample',
                        required=True)
    parser.add_argument('-S2',
                        nargs='+',
                        help='input files for S2 sample',
                        required=True)
    parser.add_argument('-out',
                        help='name of DirectMS1quant output file',
                        default='directms1quant_out')
    parser.add_argument(
        '-min_samples',
        help=
        'minimum number of samples for peptide usage. 0 means 50%% of input files',
        default=0)
    parser.add_argument('-fold_change',
                        help='FC threshold standard deviations',
                        default=3.0,
                        type=float)
    parser.add_argument(
        '-fold_change_abs',
        help=
        'Use absolute log2 scale FC threshold instead of standard deviations',
        action='store_true')
    parser.add_argument('-qval',
                        help='qvalue threshold',
                        default=0.05,
                        type=float)
    parser.add_argument('-intensity_norm',
                        help='Intensity normalization: 0-none, 1-median',
                        default=1,
                        type=int)
    parser.add_argument('-all_proteins',
                        help='use all proteins instead of FDR controlled',
                        action='store_true')
    parser.add_argument('-all_pfms',
                        help='use all PFMs instead of ML controlled',
                        action='store_true')
    parser.add_argument('-output_peptides',
                        help='Add output table with peptides',
                        action='store_true')
    parser.add_argument('-d',
                        '-db',
                        help='path to uniprot fasta file for gene annotation')
    args = vars(parser.parse_args())
    logging.basicConfig(format='%(levelname)9s: %(asctime)s %(message)s',
                        datefmt='[%H:%M:%S]',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    replace_label = '_proteins_full.tsv'

    fold_change = float(args['fold_change'])

    df_final = False

    all_s_lbls = {}

    allowed_prots = set()
    allowed_prots_all = set()
    allowed_peptides = set()

    for i in range(1, 3, 1):
        sample_num = 'S%d' % (i, )
        if args[sample_num]:
            for z in args[sample_num]:
                if not args['all_proteins']:
                    df0 = pd.read_table(
                        z.replace('_proteins_full.tsv', '_proteins.tsv'))
                    allowed_prots.update(df0['dbname'])
                    allowed_prots.update(
                        ['DECOY_' + z for z in df0['dbname'].values])
                else:
                    df0 = pd.read_table(z)
                    allowed_prots.update(df0['dbname'])

                df0 = pd.read_table(
                    z.replace('_proteins_full.tsv', '_PFMs_ML.tsv'))

                if not args['all_pfms']:

                    df0 = df0[df0['qpreds'] <= 10]
                allowed_peptides.update(df0['seqs'])

    logger.info('Total number of TARGET protein GROUPS: %d',
                len(allowed_prots) / 2)

    for i in range(1, 3, 1):
        sample_num = 'S%d' % (i, )
        if args.get(sample_num, 0):
            for z in args[sample_num]:
                df3 = pd.read_table(z.replace(replace_label, '_PFMs.tsv'))
                df3 = df3[df3['sequence'].apply(
                    lambda x: x in allowed_peptides)]

                df3_tmp = df3[df3['proteins'].apply(
                    lambda x: any(z in allowed_prots for z in x.split(';')))]
                for dbnames in set(df3_tmp['proteins'].values):
                    for dbname in dbnames.split(';'):
                        allowed_prots_all.add(dbname)

    for i in range(1, 3, 1):
        sample_num = 'S%d' % (i, )
        if args.get(sample_num, 0):
            all_s_lbls[sample_num] = []
            for z in args[sample_num]:
                label = z.replace(replace_label, '')
                all_s_lbls[sample_num].append(label)
                df3 = pd.read_table(z.replace(replace_label, '_PFMs.tsv'))

                df3 = df3[df3['sequence'].apply(
                    lambda x: x in allowed_peptides)]

                # allowed_prots2 = set()
                # df3_tmp = df3[df3['proteins'].apply(lambda x: any(z in allowed_prots for z in x.split(';')))]
                # for dbnames in set(df3_tmp['proteins'].values):
                #     for dbname in dbnames.split(';'):
                #         allowed_prots2.add(dbname)
                # print('!', len(allowed_prots), len(allowed_prots2))

                # df3 = df3[df3['proteins'].apply(lambda x: any(z in allowed_prots for z in x.split(';')))]

                df3 = df3[df3['proteins'].apply(lambda x: any(
                    z in allowed_prots_all for z in x.split(';')))]
                df3['proteins'] = df3['proteins'].apply(lambda x: ';'.join(
                    [z for z in x.split(';') if z in allowed_prots_all]))
                ### df3['proteins'] = df3['proteins'].apply(lambda x: ';'.join([z for z in x.split(';') if z in allowed_prots]))

                df3['origseq'] = df3['sequence']
                df3['sequence'] = df3['sequence'] + df3['charge'].astype(
                    int).astype(str) + df3['ion_mobility'].astype(str)

                df3 = df3.sort_values(by='Intensity', ascending=False)

                df3 = df3.drop_duplicates(subset='sequence')

                df3[label] = df3['Intensity']
                df3['protein'] = df3['proteins']
                df3['peptide'] = df3['sequence']
                df3 = df3[['origseq', 'peptide', 'protein', label]]

                if df_final is False:
                    df_final = df3.reset_index(drop=True)
                else:
                    df_final = df_final.reset_index(drop=True).merge(
                        df3.reset_index(drop=True), on='peptide', how='outer')
                    df_final.protein_x.fillna(value=df_final.protein_y,
                                              inplace=True)
                    df_final.origseq_x.fillna(value=df_final.origseq_y,
                                              inplace=True)
                    df_final['protein'] = df_final['protein_x']
                    df_final['origseq'] = df_final['origseq_x']

                    df_final = df_final.drop(
                        columns=['protein_x', 'protein_y'])
                    df_final = df_final.drop(
                        columns=['origseq_x', 'origseq_y'])

    logger.info('Total number of peptide sequences used in quantitation: %d',
                len(set(df_final['origseq'])))
    # print('Total number of proteins used in quantitation %d' % (len(allowed_prots_all), ))

    df_final = df_final.assign(protein=df_final['protein'].str.split(
        ';')).explode('protein').reset_index(drop=True)

    df_final = df_final.set_index('peptide')
    df_final['proteins'] = df_final['protein']
    df_final = df_final.drop(columns=['protein'])
    # cols = df_final.columns.tolist()
    cols = [z for z in df_final.columns.tolist() if not z.startswith('mz_')]
    cols.remove('proteins')
    cols.insert(0, 'proteins')
    df_final = df_final[cols]

    all_lbls = all_s_lbls['S1'] + all_s_lbls['S2']

    df_final_copy = df_final.copy()

    custom_min_samples = int(args['min_samples'])
    if custom_min_samples == 0:
        custom_min_samples = int(len(all_lbls) / 2)

    df_final = df_final_copy.copy()

    max_missing = len(all_lbls) - custom_min_samples

    logger.info('Allowed max number of missing values: %d', max_missing)

    df_final['nummissing'] = df_final.isna().sum(axis=1)
    df_final['nonmissing'] = df_final['nummissing'] <= max_missing

    df_final = df_final[df_final['nonmissing']]
    logger.info('Total number of PFMs passed missing values threshold: %d',
                len(df_final))

    df_final['S2_mean'] = df_final[all_s_lbls['S2']].mean(axis=1)
    df_final['S1_mean'] = df_final[all_s_lbls['S1']].mean(axis=1)
    df_final['FC_raw'] = np.log2(df_final['S2_mean'] / df_final['S1_mean'])

    FC_max = df_final['FC_raw'].max()
    FC_min = df_final['FC_raw'].min()

    df_final.loc[(pd.isna(df_final['S2_mean'])) &
                 (~pd.isna(df_final['S1_mean'])), 'FC_raw'] = FC_min
    df_final.loc[(~pd.isna(df_final['S2_mean'])) &
                 (pd.isna(df_final['S1_mean'])), 'FC_raw'] = FC_max

    if args['intensity_norm'] == 1:
        for cc in all_lbls:
            # print(cc, df_final[cc].median())
            df_final[cc] = df_final[cc] / df_final[cc].median()

    df_final['S2_mean'] = df_final[all_s_lbls['S2']].mean(axis=1)
    df_final['S1_mean'] = df_final[all_s_lbls['S1']].mean(axis=1)

    for cc in all_lbls:
        df_final[cc] = df_final[cc].fillna(df_final[cc].min())

    df_final['p-value'] = list(
        ttest_ind(df_final[all_s_lbls['S1']].values.astype(float),
                  df_final[all_s_lbls['S2']].values.astype(float),
                  axis=1,
                  nan_policy='omit',
                  equal_var=True)[1])
    df_final['p-value'] = df_final['p-value'].astype(float)
    df_final['p-value'] = df_final['p-value'].fillna(1.0)

    p_val_threshold = 0.05

    df_final['sign'] = df_final['p-value'] <= p_val_threshold

    df_final['intensity_median'] = df_final[all_s_lbls['S1'] +
                                            all_s_lbls['S2']].median(axis=1)

    df_final['FC'] = np.log2(df_final['S2_mean'] / df_final['S1_mean'])

    df_final_for_calib = df_final.copy()
    df_final_for_calib = df_final_for_calib[
        ~pd.isna(df_final_for_calib['S1_mean'])]
    df_final_for_calib = df_final_for_calib[
        ~pd.isna(df_final_for_calib['S2_mean'])]
    df_final_for_calib = df_final_for_calib[~df_final_for_calib['sign']]

    FC_max = df_final['FC'].max()
    FC_min = df_final['FC'].min()

    df_final.loc[(pd.isna(df_final['S2_mean'])) &
                 (~pd.isna(df_final['S1_mean'])), 'FC'] = FC_min
    df_final.loc[(~pd.isna(df_final['S2_mean'])) &
                 (pd.isna(df_final['S1_mean'])), 'FC'] = FC_max

    df_final['decoy'] = df_final['proteins'].apply(
        lambda x: all(z.startswith('DECOY_') for z in x.split(';')))

    from scipy.stats import scoreatpercentile
    from scipy.optimize import curve_fit

    def noisygaus(x, a, x0, sigma, b):
        # Gaussian peak on a constant baseline.
        return a * np.exp(-(x - x0)**2 / (2 * sigma**2)) + b

    def calibrate_mass(bwidth, mass_left, mass_right, true_md):

        bbins = np.arange(-mass_left, mass_right, bwidth)
        H1, b1 = np.histogram(true_md, bins=bbins)
        b1 = b1 + bwidth
        b1 = b1[:-1]

        popt, pcov = curve_fit(noisygaus,
                               b1,
                               H1,
                               p0=[1, np.median(true_md), 1, 1])
        mass_shift, mass_sigma = popt[1], abs(popt[2])
        return mass_shift, mass_sigma, pcov[0][0]

    try:
        FC_mean, FC_std, covvalue_cor = calibrate_mass(
            0.05, -df_final_for_calib['FC'].min(),
            df_final_for_calib['FC'].max(), df_final_for_calib['FC'])
    except:
        FC_mean, FC_std, covvalue_cor = calibrate_mass(
            0.1, -df_final_for_calib['FC'].min(),
            df_final_for_calib['FC'].max(), df_final_for_calib['FC'])
    # print('df_final_FC', FC_mean, FC_std)

    # FC_l = FC_mean-fold_change
    # FC_r = FC_mean+fold_change

    if not args['fold_change_abs']:
        fold_change = FC_std * fold_change
    logger.info('Absolute FC threshold = %.2f', fold_change)
    FC_l = -fold_change
    FC_r = fold_change

    df_final['up'] = df_final['sign'] * (df_final['FC'] >= FC_r)
    df_final['down'] = df_final['sign'] * (df_final['FC'] <= FC_l)

    df_final = df_final.sort_values(by=['nummissing', 'intensity_median'],
                                    ascending=(True, False))
    df_final = df_final.drop_duplicates(subset=('origseq', 'proteins'))

    up_dict = df_final.groupby('proteins')['up'].sum().to_dict()
    down_dict = df_final.groupby('proteins')['down'].sum().to_dict()

    ####### !!!!!!! #######
    df_final['up'] = df_final.apply(lambda x: x['up'] if up_dict.get(
        x['proteins'], 0) >= down_dict.get(x['proteins'], 0) else x['down'],
                                    axis=1)
    protsN = df_final.groupby('proteins')['up'].count().to_dict()

    prots_up = df_final.groupby('proteins')['up'].sum()

    N_decoy_total = df_final['decoy'].sum()

    upreg_decoy_total = df_final[df_final['decoy']]['up'].sum()

    p_up = upreg_decoy_total / N_decoy_total

    names_arr = np.array(list(protsN.keys()))

    logger.info('Total number of proteins used in quantitation: %d',
                sum(not z.startswith('DECOY_') for z in names_arr))
    logger.info('Total number of peptides: %d', len(df_final))
    logger.info('Total number of decoy peptides: %d', N_decoy_total)
    logger.info('Total number of significantly changed decoy peptides: %d',
                upreg_decoy_total)
    logger.info(
        'Probability of random peptide to be significantly changed: %.3f',
        p_up)
    # print(N_decoy_total, upreg_decoy_total, p_up)

    if args['output_peptides']:
        df_final.to_csv(path_or_buf=args['out'] + '_quant_peptides.tsv',
                        sep='\t',
                        index=False)

    v_arr = np.array(list(prots_up.get(k, 0) for k in names_arr))
    n_arr = np.array(list(protsN.get(k, 0) for k in names_arr))

    all_pvals = calc_sf_all(v_arr, n_arr, p_up)

    total_set = set()
    total_set_genes = set()

    FC_up_dict_basic = df_final.groupby('proteins')['FC'].median().to_dict()
    FC_up_dict_raw_basic = df_final.groupby(
        'proteins')['FC_raw'].median().to_dict()

    df_final = df_final[df_final['up'] > 0]

    df_final['bestmissing'] = df_final.groupby(
        'proteins')['nummissing'].transform('min')

    FC_up_dict = df_final[df_final['bestmissing'] ==
                          df_final['nummissing']].groupby(
                              'proteins')['FC'].median().to_dict()
    FC_up_dict_raw = df_final[df_final['bestmissing'] ==
                              df_final['nummissing']].groupby(
                                  'proteins')['FC_raw'].median().to_dict()

    # FC_up_dict = df_final.groupby('proteins')['FC'].median().to_dict()

    df_out = pd.DataFrame()
    df_out['score'] = all_pvals
    df_out['dbname'] = names_arr

    df_out['FC'] = df_out['dbname'].apply(lambda x: FC_up_dict.get(x))
    df_out['FC_raw'] = df_out['dbname'].apply(lambda x: FC_up_dict_raw.get(x))

    df_out.loc[pd.isna(df_out['FC']),
               'FC'] = df_out.loc[pd.isna(df_out['FC']), 'dbname'].apply(
                   lambda x: FC_up_dict_basic.get(x))
    df_out.loc[pd.isna(df_out['FC_raw']), 'FC_raw'] = df_out.loc[
        pd.isna(df_out['FC_raw']),
        'dbname'].apply(lambda x: FC_up_dict_raw_basic.get(x))

    df_out['v_arr'] = v_arr
    df_out['n_arr'] = n_arr

    df_out['decoy'] = df_out['dbname'].str.startswith('DECOY_')

    df_out = df_out[~df_out['decoy']]

    df_out['FC_pass'] = (df_out['FC'].abs() >= fold_change) & (df_out['v_arr'] > 0)

    df_out_BH_multiplier = df_out['FC_pass'].sum()

    qval_threshold = args['qval']

    df_out['p-value'] = 1.0
    df_out['BH_pass'] = False

    df_out = df_out.sort_values(by='score', ascending=False)
    fc_mask = df_out['FC_pass']
    df_out.loc[fc_mask, 'BH_threshold'] = -np.log10(
        df_out.loc[fc_mask, 'score'].rank(ascending=False, method='max')
        * qval_threshold / df_out_BH_multiplier)
    df_out.loc[fc_mask, 'BH_pass'] = (
        df_out.loc[fc_mask, 'score'] > df_out.loc[fc_mask, 'BH_threshold'])
    df_out.loc[fc_mask, 'p-value'] = 10**(-df_out.loc[fc_mask, 'score'])
    score_threshold = df_out[df_out['BH_pass']]['score'].min()
    df_out.loc[fc_mask, 'BH_pass'] = df_out.loc[fc_mask, 'score'] >= score_threshold

    df_out.to_csv(path_or_buf=args['out'] + '_quant_full.tsv',
                  sep='\t',
                  index=False)

    df_out_f = df_out[(df_out['BH_pass']) & (df_out['FC_pass'])]

    df_out_f.to_csv(path_or_buf=args['out'] + '.tsv', sep='\t', index=False)

    genes_map = {}
    if args['d']:
        for prot, protseq in fasta.read(args['d']):
            try:
                prot_name = prot.split('|')[1]
            except:
                prot_name = prot
            try:
                gene_name = prot.split('GN=')[1].split(' ')[0]
            except:
                gene_name = prot
            genes_map[prot_name] = gene_name

    for z in set(df_out_f['dbname']):
        try:
            prot_name = z.split('|')[1]
        except:
            prot_name = z

        gene_name = genes_map.get(prot_name, prot_name)

        total_set.add(prot_name)
        total_set_genes.add(gene_name)

    logger.info('Total number of significantly changed proteins: %d',
                len(total_set))
    logger.info('Total number of significantly changed genes: %d',
                len(total_set_genes))

    f1 = open(args['out'] + '_proteins_for_stringdb.txt', 'w')
    for z in total_set:
        f1.write(z + '\n')
    f1.close()

    f1 = open(args['out'] + '_genes_for_stringdb.txt', 'w')
    for z in total_set_genes:
        f1.write(z + '\n')
    f1.close()
Example #20
import argparse
from itertools import permutations

import matplotlib as mpl
import matplotlib.pyplot as plt
from pyteomics.fasta import read

plt.rcParams["font.family"] = "Times New Roman"

parser = argparse.ArgumentParser(description='Use to find kmers')
parser.add_argument('-modifa', type=str, help='modifying_fasta', required=True)
parser.add_argument('-reffa', type=str, help='reference_fasta', required=True)
parser.add_argument('-save_way', type=str, help='path_of_save', required=True)
parser.add_argument('-image_save',
                    type=str,
                    help='image_save_path',
                    required=True)
args = parser.parse_args()
ref_path = args.reffa

mpl.rcParams['figure.figsize'] = [12, 8]
print('Analyzing motif abundance of E.coli ...\n')
genome = read(args.reffa)
syntheny_bloks = read(args.modifa)
alphablet = ['A', 'T', 'G', 'C']

new_genome = ''
for line in genome:
    new_genome += line.sequence


def create_lib(r):
    return set([''.join(i) for i in permutations(alphablet * r, r=r)])


all_ends = ''
for line in syntheny_bloks:
    all_ends += line.sequence
Example #21
        check_list = {}
        checker = []
        for i in range(len(p)):
            if str(p['describtion'][i]) != 'nan':
                if str(p['describtion'][i]) != 'Description':
                    check_list[str(p['describtion'][i])] = {}
                    checker.append(str(p['describtion'][i]))
            else:
                break

        strains = args.refpa

        for sta in strains:
            if '.DS_Store' in sta:
                continue
            op_pa = read(args.refpa + '/All/{}'.format(sta))
            for line in op_pa:
                for num in check_list:
                    if num in line.description:
                        check_list[num] = [sta[:-12]]
        last = []
        for ind in check_list:
            last.append(check_list[ind])

    print('Write graph ...')
    for seq_id in graph:
        graph[seq_id]['blocks'].sort()
        blocks = graph[seq_id]['blocks']
        for i in range(len(blocks) - 1):
            if args.refpa is not None:
                out.write('sb{}\tsb{}\t{}\t{}\t{}\t{}\n'.format(
Example #22
ratio_all_kmers = []
ratio_all_GATC = []
all_p_values_kmers = []
all_p_values_GATC = []

for kmer in create_lib(4):
    if kmer != 'GATC':
        for org in organisms:
            cake = ''
            end_genome = ''
            mid_genome = ''
            if '.DS_Store' in org:
                continue

            print('Analyzing {} ...'.format(org))
            mid_genome = read('{}{}/{}_middle_SB.fasta'.format(
                args.orgs, org, org))
            for line in mid_genome:
                mid_genome = line.sequence

            end_genome = read('{}{}/{}_ENDS_SB.fasta'.format(
                args.orgs, org, org))
            for line in end_genome:
                end_genome = line.sequence

            print('Counting {} ...'.format(kmer))
            lable_k.append(kmer)
            mid = mid_genome.count(kmer) * 100 / (len(mid_genome) - 5)
            end = end_genome.count(kmer) * 100 / (len(end_genome) - 5)
            ratio = end / mid
            ratio_all_kmers.append(ratio)
            cake += 'Frequency in the ends for every 4-mers\t{}\n'.format(end)
Example #23
 def test_read_and_write_long(self):
     with tempfile.TemporaryFile(mode='r+') as new_fasta_file:
         fasta.write(fasta.read(self.fasta_file), new_fasta_file)
         new_fasta_file.seek(0)
         new_entries = list(fasta.read(new_fasta_file))
         self.assertEqual(new_entries, self.fasta_entries_long)
Example #24
def run():
    parser = argparse.ArgumentParser(
        description='calculate NSAF for scavager results',
        epilog='''

    Example usage
    -------------
    $ scav2nsaf -S1 sample1_1_proteins.tsv sample1_n_proteins.tsv -S2 sample2_1_proteins.tsv sample2_n_proteins.tsv
    -------------
    ''',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-S1',
                        nargs='+',
                        metavar='FILE',
                        help='input files for S1 sample',
                        required=True)
    parser.add_argument('-S2',
                        nargs='+',
                        metavar='FILE',
                        help='input files for S2 sample')
    parser.add_argument('-S3',
                        nargs='+',
                        metavar='FILE',
                        help='input files for S3 sample')
    parser.add_argument('-S4',
                        nargs='+',
                        metavar='FILE',
                        help='input files for S4 sample')
    parser.add_argument('-u',
                        '--union',
                        help='pool the files together for the samples',
                        action='store_true')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '-a',
        '--autolabel',
        help='in union mode, derive sample labels from common name prefixes',
        action='store_true')
    group.add_argument(
        '--labels',
        nargs='+',
        metavar='LABEL',
        help='labels for samples in union mode (same number as samples)')
    parser.add_argument('-db',
                        metavar='FILE',
                        help='path to fasta file',
                        required=True)
    parser.add_argument('-out',
                        metavar='FILE',
                        help='name of nsaf output file',
                        default='nsaf_out.txt')
    parser.add_argument('-version',
                        action='version',
                        version='%s' %
                        (pkg_resources.require("scavager")[0], ))
    args = vars(parser.parse_args())

    samples = ['S1', 'S2', 'S3', 'S4']
    labels = args['labels'] if args['labels'] else samples

    df_final = False

    allowed_prots = set()
    allowed_peptides = set()

    for sample_num in samples:
        if args[sample_num]:
            for z in args[sample_num]:
                df0 = pd.read_csv(z, sep='\t')
                allowed_prots.update(df0['dbname'])

    for sample_num in samples:
        if args[sample_num]:
            for z in args[sample_num]:
                df0 = pd.read_csv(z.replace('_proteins.tsv', '_peptides.tsv'),
                                  sep='\t')
                allowed_peptides.update(df0['peptide'])

    for sample_num, label in zip(samples, labels):
        if args[sample_num]:
            if not args['union']:
                for z in args[sample_num]:
                    df1 = read_table(z, allowed_peptides, allowed_prots)
                    if df_final is False:
                        df_final = df1
                    else:
                        df_final = df_final.reset_index().merge(
                            df1.reset_index(), on='peptide',
                            how='outer')  #.set_index('peptide')
                        df_final.protein_x.fillna(value=df_final.protein_y,
                                                  inplace=True)
                        df_final['protein'] = df_final['protein_x']
                        df_final = df_final.drop(columns=[
                            'protein_x', 'protein_y', 'index_x', 'index_y'
                        ])
            else:
                if args['autolabel']:
                    label = os.path.commonprefix(args[sample_num]).rstrip('_')
                df1 = read_table(args[sample_num],
                                 allowed_peptides,
                                 allowed_prots,
                                 label=label)
                if df_final is False:
                    df_final = df1
                else:
                    df_final = df_final.reset_index().merge(
                        df1.reset_index(), on='peptide',
                        how='outer')  #.set_index('peptide')
                    df_final.protein_x.fillna(value=df_final.protein_y,
                                              inplace=True)
                    df_final['protein'] = df_final['protein_x']
                    df_final = df_final.drop(columns=[
                        'protein_x', 'protein_y', 'index_x', 'index_y'
                    ])

    df_final = df_final.set_index('peptide')
    df_final['proteins'] = df_final['protein']
    df_final = df_final.drop(columns=['protein'])
    cols = df_final.columns.tolist()
    cols.remove('proteins')
    cols.insert(0, 'proteins')
    df_final = df_final[cols]
    df_final = df_final.fillna(value='')

    cols = df_final.columns.difference(['proteins'])
    genres = df_final['proteins']  #.str.split(';')
    df_final = (
        df_final.loc[df_final.index.repeat(genres.str.len()), cols].assign(
            dbname=list(chain.from_iterable(genres.tolist()))))
    df_final = df_final.groupby('dbname').sum()
    df_final.reset_index(level=0, inplace=True)

    protsL = {}
    for p in fasta.read(args['db']):
        dbn = p[0].split()[0]
        protsL[dbn] = len(p[1])

    df_final['Length'] = df_final['dbname'].apply(lambda z: protsL[z])
    for cc in df_final.columns:
        if cc not in ['dbname', 'Length']:
            df_final[cc] = df_final[cc] / df_final['Length']
    for cc in df_final.columns:
        if cc not in ['dbname', 'Length']:
            df_final[cc] = df_final[cc] / df_final[cc].sum()
            df_final[cc] = df_final[cc].replace(0, np.nan)
            min_val = np.nanmin(df_final[cc].values)
            df_final[cc] = df_final[cc].replace(np.nan, min_val)
    df_final.drop(columns=[
        'Length',
    ], inplace=True)
    df_final.to_csv(args['out'], sep='\t', index=False)
Example #25
# file for getting peptides from other enzymes

##import libraries
import matplotlib.pyplot as plt
import numpy as np
from pyteomics import fasta, parser

print('hello world')

databasePath = 'C:/Users/chris/OneDrive/Documents/bccrc/projectsRepository/sorensenLab/relatedToYbx1/design20200718_multipleProteaseSp3/uniprotHumanJul2020.fasta'

print('Cleaving the proteins with trypsin...')
uniquePeptides = set()
for description, sequence in fasta.read(databasePath):
    newPeptides = parser.cleave(sequence, 'K', min_length=6)
    uniquePeptides.update(newPeptides)
print('Done, {0} sequences obtained!'.format(len(uniquePeptides)))
Example #26
import argparse
import csv
import sys

import matplotlib.pyplot as plt
import pandas as pd
from pyteomics.fasta import read
from scipy.stats import mannwhitneyu

plt.style.use('seaborn')

parser = argparse.ArgumentParser('Mann-whitney test')
parser.add_argument('-stain', help='Path to organisms', required=True)
parser.add_argument('-coord', help='Path to coordinate', required=True)
parser.add_argument('-save_way', help='Path to save fold', required=True)
args = parser.parse_args()
print('Starting analysis ...')

fasta = read(args.stain)
coord = open(args.coord)

read_tsv = pd.DataFrame(csv.reader(coord, delimiter="\t"))
read_tsv = read_tsv.T.drop(0)
Methyla_coordinates = []
for ind in range(len(read_tsv)):
    if ind == 0:
        continue
    Methyla_coordinates.append([int(read_tsv[0][ind]), int(read_tsv[1][ind])])

SB_coordinates = []
SBs = read(args.stain)
SB_dict = {}
for line in SBs:
    if 'GCF_000005845.2_ASM584v2' in line.description:
Example #27
# In Python 3, use 'from urllib.request import urlretrieve' instead
import gzip
import os
from urllib import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
from pyteomics import fasta, parser, mass, achrom, electrochem, auxiliary

if not os.path.isfile('yeast.fasta.gz'):
    print 'Downloading the FASTA file for Saccharomyces cerevisiae...'
    urlretrieve(
        'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/'
        'knowledgebase/proteomes/YEAST.fasta.gz', 'yeast.fasta.gz')
    print 'Done!'

print 'Cleaving the proteins with trypsin...'
unique_peptides = set()
for description, sequence in fasta.read(gzip.open('yeast.fasta.gz')):
    new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
    unique_peptides.update(new_peptides)
print 'Done, {0} sequences obtained!'.format(len(unique_peptides))

peptides = [{'sequence': i} for i in unique_peptides]

print 'Parsing peptide sequences...'
for peptide in peptides:
    peptide['parsed_sequence'] = parser.parse(peptide['sequence'],
                                              show_unmodified_termini=True)
    peptide['length'] = parser.length(peptide['parsed_sequence'])
print 'Done!'

peptides = [peptide for peptide in peptides if peptide['length'] <= 100]
Example #28
    res_df.columns = [
        "ID1", "ID2", "mass", "mz(z1)", "mz(z2)", "mz(z3)", "mz(z4)"
    ]
    res_df['ID1'] = res_df['ID1'].astype(int)
    res_df['ID2'] = res_df['ID2'].astype(int)
    b = time.time()
    print(np.round((b - a) / 60., 2))
    return res_df


#set input params
idname = "test"
fasta_file = "test.fasta"
min_length = 6
protease = "trypsin"
crosslinker_mass = 138.06807961

#read database
target = fasta.read(fasta_file)
decoys = fasta.read(fasta_file)

#generate peptides
target_df = digest(target, "Target", min_length=min_length, protease=protease)
decoy_df = digest(decoys, "Decoy", min_length=min_length, protease=protease)
df = pd.concat([target_df, decoy_df])
df = df.reset_index()
df.to_csv("{}_PeptidesDigest.csv".format(idname))
pairs = generate_peptidepairs(df, crosslinker_mass, idname)

#df_masses = pd.read_csv("test_masses.csv")
Example #29
import argparse

from pyteomics import fasta, parser

# The argparse setup is truncated in this snippet; a plausible reconstruction
# (the positional fasta_file argument is an assumption based on its later use).
arg_parser = argparse.ArgumentParser(description='digest a FASTA file into peptides')
arg_parser.add_argument('fasta_file', type=str, help='FASTA file to digest')
arg_parser.add_argument('-m', '--min', type=int, default=6,
                        help='minimal length of the peptide to report')
arg_parser.add_argument('-M', '--missed', type=int, default=0,
                        help='number of missed cleavages')
arg_parser.add_argument('-e', '--enz', type=str, default='trypsin',
                        help='protease to use for digestion')
arg_parser.add_argument('-v', '--verbosity', action='count',
                        help='increase output verbosity')

args = arg_parser.parse_args()

# TODO: Do it proper way - using os.path
out_file = args.fasta_file + '.peptides'                        
peptides = []

with fasta.read(args.fasta_file) as reader, open(out_file,'w') as writer:
    
    # Build a set of peptides for each fasta sequence 
    if args.verbosity >= 1:
        print 'Building digests...'
    for description, sequence in reader:
        peps = parser.cleave(sequence, parser.expasy_rules[args.enz], args.missed)
        peps = [x for x in peps if len(x) > args.min]
        writer.write('Peptides for {seq} ({enz} cleavage)\n'.format(
            seq=description, enz=args.enz))
        writer.write('...\t{n} missed cleavages\n'.format(n=args.missed))
        writer.write('\n'.join(peps)+'\n')
        peptides.append(set(peps))
        if args.verbosity >= 2:
            print '...\t{n} peptides for {prot}'.format(n=len(peps),prot=description)
        
Example #30
organisms = os.listdir(args.orgs)
part = ''
condition = ''
coordinate_of_split = ''
check_list = {}

# Making the spike
for org in organisms:
    if '.DS_Store' in org:
        continue
    strains = os.listdir('{}{}/All/'.format(args.orgs, org))
    for strain in strains:
        if '.DS_Store' in strain:
            continue
        check_list[strain[:-12]] = {}
        fna = read('{}{}/All/{}'.format(args.orgs, org, strain))
        counter = 0
        index = 0
        for line in fna:
            counter += len(line.sequence)
            if index != 0:
                check_list[strain[:-12]][line.description.split()[0]] = counter
            else:
                check_list[strain[:-12]][line.description.split()[0]] = 0
            index += 1

for organism in SBs:
    if '.DS_store' in organism:
        continue
    for an_org in coord_prot:
        if '.DS_Store' in an_org:
Example #31
                continue

            if 'LCB' in miorg:
                print('Analyzing {} ...'.format(miorg[4:-6]))
                save_file = open(
                    args.save_way +
                    '{}_{}&{}.txt'.format(miorg[4:-6], k, k[::-1]), 'w')
            elif 'SB' in miorg:
                print('Analyzing {} ...'.format(miorg[3:-6]))
                save_file = open(
                    args.save_way +
                    '{}_{}&{}.txt'.format(miorg[3:-6], k, k[::-1]), 'w')
            else:
                print('Analyzing ...')
                save_file = open(args.save_way, 'w')
            fasta = read(args.genome + miorg)
            for SB in fasta:
                length = len(SB.sequence)
                if length > 1000:
                    rich = (
                        (SB.sequence.count(k[::-1]) + SB.sequence.count(k)) *
                        100) / (length - (len(k) - 1))
                    save_file.write(
                        split(split(SB.description)[0])[1] + '\t' +
                        str(len(SB.sequence)) + '\t' + str(rich) + '\n')

            save_file.close()
            print('Text file was saved!')
else:
    for k in mixer(kmer):
        if k in controler:
Example #32
def prot_gen(args):
    db = args['d']

    with fasta.read(db) as f:
        for p in f:
            yield p
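A hypothetical use of the generator, assuming args maps 'd' to a FASTA path:

for description, sequence in prot_gen({'d': 'proteins.fasta'}):
    print(description)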
Example #33
ref_path = os.listdir(args.reffa)
k = args.k

if args.data_type == 'fold' and k <= 3:
    while not k > 3:
        if k > 2:
            mpl.rcParams['font.size'] = 5
        if k > 1:
            mpl.rcParams['figure.figsize'] = [25, 8]
        else:
            mpl.rcParams['figure.figsize'] = [15, 8]
        for path in ref_path:
            if '.DS_Store'  in path:
                continue
            print('Analyzing {}\n'.format(path))
            genome = read(args.reffa + '{}/{}_middle_SB.fasta'.format(path, path))
            syntheny_bloks = read(args.modifa + '{}/{}_ENDS_SB.fasta'.format(path, path))
            alphablet = ['A', 'T', 'G' ,'C']

            mid_genome = '' 
            for line in genome:
                mid_genome += line.sequence

            def create_lib(r):
                return set([''.join(i) for i in permutations(alphablet * r, r=r)])

            all_ends = ''
            for line in syntheny_bloks:
                all_ends += line.sequence

            print('Waiting...\n')
Example #34
def get_cond_peps(file_in, fasta_file):
    cols = ['Sequence', 'Proteins', 'Experiment', 'Intensity (normalized)']
    raw, headers, inds = openfile(file_in, cols)

    unique_prots = []
    unique_conds = []
    unique_bio_reps = ['']
    count = 0
    prot_seqs = []
    no_fasta = []

    for a in raw:
        prots = a[inds[1]].split(';')
        count += 1
        for b in prots:
            prot = b.split('CON__')[-1]
            prot_seq = ''
            if prot not in unique_prots and [prot] not in no_fasta:
                fasta_test = False
                #								print prot
                for description, sequence in fasta.read(fasta_file):
                    if '|' + prot + '|' in description:
                        fasta_test = True
                        print 'Fasta match found:', prot, count, '/', len(raw)
                        prot_seq = sequence
                if fasta_test == True:
                    unique_prots.append(prot)
                    prot_seqs.append([prot, prot_seq])
                else:
                    print 'No FASTA match:', prot
                    no_fasta.append([prot])

        if a[inds[2]] not in unique_conds:
            unique_conds.append(a[inds[2]])

        #if a[inds[4]] not in unique_bio_reps:
        #		unique_bio_reps.append(a[inds[4]])

    headers_out = [
        'Protein', 'Experiment', 'Passing Peptides', 'FASTA seq',
        'Intensities (normalized)'
    ]
    out = []

    for prot in unique_prots:
        for cond in unique_conds:
            for bio_rep in unique_bio_reps:

                pep_list = []
                intens_list = []
                for a in raw:
                    if prot in a[inds[1]]:
                        if cond == a[inds[2]]:
                            pep_list.append(a[inds[0]])
                            intens_list.append(a[inds[3]])

                pep_str = ''
                for i in pep_list:
                    pep_str += i + ';'
                pep_str = pep_str[:-1]

                intens_str = ''
                for i in intens_list:
                    intens_str += i + ';'
                intens_str = intens_str[:-1]

                for b in prot_seqs:
                    if b[0] == prot:
                        prot_seq = b[1]

                entry = [prot, cond, pep_str, prot_seq, intens_str]
                out.append(entry)
                print entry

    file_out = file_in.split('.txt')[0] + '_passpeps.txt'
    savefile(file_out, out, headers_out)
Example #35
import argparse
import re
from collections import namedtuple

from pyteomics import fasta

# Protein and GN_PATTERN are defined earlier in the original file; these are
# reconstructions (the exact regex for the UniProt GN= field is an assumption).
Protein = namedtuple('Protein', ['description', 'sequence'])
GN_PATTERN = re.compile(r'GN=(?P<gene>\S+)')

def get_gene_name(e):
    '''Return a Protein entry whose
    description was replaced by the gene name.
    '''
    descr, seq = e
    m = GN_PATTERN.search(descr)
    new_descr = m.group('gene') if m else descr
    return Protein(new_descr, seq)
    
parser = argparse.ArgumentParser()
parser.add_argument('files', type=str, nargs='+',
                        help='.fasta files containing plink output')
parser.add_argument('-o', '--output', type=str, default=None,
                        help='file to write output to.')
parser.add_argument('--gname', action='store_true',
                        help='only leave gene name (GN) in the output fasta header')                        
parser.add_argument('-v', '--verbosity', action='count',
                        help='increase output verbosity')

args = parser.parse_args()
unique_entries = set()
for filename in args.files:
    if args.verbosity > 0:
        print 'Processing {0}...'.format(filename)
    for entry in fasta.read(open(filename)):
        if args.gname:
            entry = get_gene_name(entry)
        unique_entries.update({entry,})
    if args.verbosity > 1:
        print '\t... found {0} unique entries so far'.format(len(unique_entries))

fasta.write(unique_entries, output=args.output)