Пример #1
0
def msgf2seq_file(filepath, fasta_file, msb_psms):
    """
    Convert an MSGF results file into sequest format, keeping only the PSMs
    whose spectrum/peptide key appears in msb_psms.

    filepath: MSGF results file; the first two lines are headers, and the
        third line's first field supplies the sample name for the output.
    fasta_file: fasta file used to build protein<->gene id mappings.
    msb_psms: set of spectid_peptidesequence keys to keep.
    Returns the path of the written sequest-format file.
    """
    def parse_spec_pep_row(r):
        # get spec_pep from _best file format:
        # first two '.'-fields of column 0 joined with column 4
        parsed = '_'.join(r[0].split('.')[:2] + [r[4]])
        #print parsed
        return parsed
    usedir,fin = os.path.split(filepath)
    # Get the sample filename from the first item of the third line
    fout = next(it.islice(ut.load_tab_file(filepath),2,3))[0].split('.')[0]
    # Re-open for the real pass and skip the two header lines.
    in_gen = ut.load_tab_file(filepath)
    in_gen.next(); in_gen.next() # skip 2 lines
    # protein-id <-> gene-id maps derived from the fasta file
    p2g = seqs.prots2genes(fasta_file)
    g2p = ut.dict_inverse(p2g)
    # Output name: <sample>.<input extension>.sequestformat, same directory.
    fout = os.path.join(usedir, '.'.join([fout, fin.split('.')[-1] ,
        'sequestformat']))
    # The search engine/config is keyed by the input file's extension.
    search = searches[filepath.split('.')[-1]]
    print "Converting/filtering; Search:", search
    # Lazily convert each retained row to a sequest-format line.
    output = (msgfbest2sequest_line(r,p2g, g2p, search) for r in in_gen 
            if parse_spec_pep_row(r) in msb_psms)
    print "Writing", fout
    ut.write_tab_file(output, fout)
    return fout
Пример #2
0
def process_raw_wan(
    f_source, f_dest=None, first_col_element=1, first_data_col=1, end_description_col=True, first_data_row=1
):
    """
    Clean one of Cuihong's tab-separated raw files and write the result.

    Specific to Cuihong's files; tries to handle the format differences seen
    among them.  The first column is always kept as the variable name; it is
    split on '|' and element first_col_element is kept (pass None to skip
    that step).  For the data, columns [first_data_col:] are kept; when
    end_description_col is True the last column is treated as a description
    and moved to just after the name column.  Rows between the header and
    first_data_row are dropped by the name-manipulation step.

    f_source: path of the tab-separated input file.
    f_dest: output path; defaults to <f_source root>_proc<extension>.
    Returns None; the cleaned rows are written via ut.write_tab_file.
    """
    # NOTE: excel sometimes saves with '\r\n'-only breaks; universal-newline
    # reading handles that, so no special-casing is needed here.
    # 'with' ensures the file handle is closed (the original leaked it).
    with open(f_source) as fh:
        lines = [line.strip().split("\t") for line in fh if line.strip() != ""]
    # Column manipulation first.
    if end_description_col:
        lines = [[l[0]] + [l[-1]] + l[first_data_col:-1] for l in lines]
    else:
        lines = [[l[0]] + l[first_data_col:] for l in lines]
    # Variable-name manipulation: all but the header row; anything between
    # the header and first_data_row is skipped.
    if first_col_element is not None:
        lines = [lines[0]] + [[l[0].split("|")[first_col_element]] + l[1:] for l in lines[first_data_row:]]
    # Default output name: source name with _proc inserted before the extension.
    if f_dest is None:
        split = os.path.splitext(f_source)
        f_dest = split[0] + "_proc" + split[1]
    ut.write_tab_file(lines, f_dest)
Пример #3
0
def process_raw_wan(f_source, f_dest=None, first_col_element=1,
                    first_data_col=1, end_description_col=True,
                    first_data_row=1):
    """
    Clean one of cuihong's tab-separated raw files and write the result.

    Specific to cuihong's files; tries to handle the differences seen in
    them.  Always keeps the first column as the variable name, splitting it
    on '|' and keeping element first_col_element (None skips this step).
    For the data array, keeps columns [first_data_col:]; when
    end_description_col is True the last column is treated as a description
    and moved next to the name column.  Rows between the header and
    first_data_row are dropped by the name step.
    f_dest defaults to <f_source root>_proc<extension>.
    """
    # 'with' closes the file handle (previously leaked by a bare open()).
    # Universal-newline reading copes with excel's '\r\n'-only line breaks.
    with open(f_source) as fh:
        lines = [line.strip().split('\t') for line in fh if line.strip()!='']
    # column manipulation first.
    if end_description_col:
        lines = [[l[0]] + [l[-1]] + l[first_data_col:-1] for l in lines]
    else:
        lines = [[l[0]] + l[first_data_col:] for l in lines]
    # variable name manipulation: all but header row; skip anything btw
    # header and first_data_row.
    if first_col_element is not None:
        lines = [lines[0]] + [[l[0].split('|')[first_col_element]] +
                    l[1:] for l in lines[first_data_row:]]
    # default output name: insert _proc before the extension
    if f_dest is None:
        split = os.path.splitext(f_source)
        f_dest = split[0] + '_proc' + split[1]
    ut.write_tab_file(lines, f_dest)
Пример #4
0
def elut_p2g(fname, p2g, suffix='_fix'):
    """
    Rewrite an elution file, mapping the protein id in the first column of
    every non-comment row through p2g.  Rows whose first field starts with
    '#' pass through unchanged.  Output is written to fname+suffix.
    """
    rows = ut.load_tab_file(fname)
    # Lazily translate each row; comment rows are forwarded untouched.
    converted = (row if row[0][0] == '#'
                 else [p2g[row[0]]] + list(row[1:])
                 for row in rows)
    ut.write_tab_file(converted, fname + suffix)
Пример #5
0
def cuihong_fasta_to_clean(fname, outname):
    """
    Get rid of all the reverse or shuffleds ('rm' instead of 'sp') and
    anything else whose header doesn't start with '>sp' or '>tr'.  Keep only
    the uniprot identifier from each retained header, then write the
    flattened lines to outname.
    """
    entries = _load_prots_to_lol(fname)
    flat = []
    for entry in entries:
        header = entry[0]
        if header[:3] == '>sp' or header[:3] == '>tr':
            # keep just the uniprot id (second '|'-delimited field)
            flat.append('>' + header.split('|')[1])
            flat.extend(entry[1:])
    ut.write_tab_file(flat, outname, islist=True)
Пример #6
0
def multi_identities(input_fname, out_dir):
    """
    Run all_identities for each row of a config file and write one result
    file per row into out_dir.

    input_fname: tab-separated file whose rows are
        (description, proteins-list filename, source fasta, odict, target).
    Output files are named <target shortname>_<description>.txt.
    """
    input_list = ut.load_lol(input_fname)
    for desc, prots_fname, source_fasta, odict, target in input_list:
        print "%s, proteins: %s\n source: %s\n odict: %s\ntarget: %s" % (desc,
                prots_fname, source_fasta, odict, target)
        prots = ut.load_list(prots_fname)
        # identity results for these proteins against the target -- see
        # all_identities for the row format
        sims = all_identities(prots, odict, source_fasta, target)
        out_fname = os.path.join(out_dir,
                ut.shortname(target).split('.')[0] + "_" + desc + ".txt")
        ut.write_tab_file(sims, out_fname, islist=True)
Пример #7
0
def mq2elut(fname, quant='iBAQ'):
    """
    Convert a MaxQuant-style table to an elution-style table, keeping only
    the per-sample columns for the given quantitation (eg 'iBAQ WAN...' but
    not 'iBAQ L WAN...').  The protein id comes from column 1 (the
    "majority protein" column).  Output goes to <fname> with _mq_<quant>
    inserted before the extension.
    """
    rows = list(ut.load_tab_file(fname))
    header = rows[0]
    # want eg 'iBAQ WAN...', not 'iBAQ L WAN...'
    pattern = '^%s\s\w{2}' % quant
    keep = [i for i, col in enumerate(header)
            if re.match(pattern, col) is not None]
    # For now just use the "majority protein": first whitespace token of
    # column 1 with its first character dropped.
    ids = [row[1].split()[0][1:] for row in rows[1:]]
    out = [[header[0]] + [header[i] for i in keep]]
    for pid, row in zip(ids, rows[1:]):
        out.append([pid] + [row[i] for i in keep])
    ut.write_tab_file(out, ut.pre_ext(fname, '_mq_%s' % quant))
Пример #8
0
def orth_pid2geneid(fname, p2g):
    """
    Rewrite an orthology file, mapping protein ids to gene ids via p2g
    inside the space-separated lists in columns 2 and 3.  Writes the result
    to fname+'_fix'.
    """
    lines = ut.load_tab_file(fname)
    def process(lines):
        def replistp2g(pclist):
            # The list alternates id, score, id, score...; map only the ids
            # (even indexes) through p2g, leave the scores as-is.
            return ' '.join([el if i%2 else p2g[el] 
                            for i,el in enumerate(pclist)])
        for n,items in enumerate(lines):
            # NOTE(review): row 1 (not row 0) is passed through untouched --
            # presumably a header skip; confirm the header isn't row 0.
            if n==1:
                yield items
            else:
                newitems = list(items[:2])
                for i in 2,3:
                    newitems.append(replistp2g(items[i].split()))
                yield newitems
    ut.write_tab_file(process(lines), fname+'_fix')
Пример #9
0
def write_elution(elut, fname, forR=False):
    """
    Write out an elution in the spcount format
    $ProtID\tTotalCount\tCol1....
    """
    # Drop proteins whose row is entirely zero.
    keep = np.sum(np.array(elut.mat), axis=1) > 0
    mat = np.array(elut.mat[keep, :])
    kept_prots = list(np.array(elut.prots)[keep])
    if forR:
        # R layout: transposed, and no header over the first column.
        header = kept_prots
        data = []
        for col, frac in enumerate(elut.fractions):
            data.append([frac] + mat[:, col].tolist())
    else:
        header = "#ProtID TotalCount".split() + elut.fractions
        data = []
        for row, prot in enumerate(kept_prots):
            data.append([prot, np.sum(mat[row, :])] + mat[row, :].tolist())
    ut.write_tab_file([header] + data, fname)
Пример #10
0
def export_idconvert(ppis, dict_cxlabels, fname):
    """
    Write a node-id conversion table for exported ppis.

    Each ppi endpoint id has the form <complexid>_<geneid>; the gene id may
    itself contain '_' (eg for Sp), so only the first '_' splits the two.
    One row is written per endpoint, and each complex's label is emitted
    only on its first appearance (blank afterwards).
    Output columns: nodeid, ENSGID, complexid, ComplexLabel; written to
    fname with 'pfx_convert' inserted before the extension.
    """
    rows = [['nodeid', 'ENSGID', 'complexid', 'ComplexLabel']]
    seen_cxs = set()
    for ppi in ppis:
        for endpoint in (ppi[0], ppi[1]):
            parts = endpoint.split('_')
            cxid = parts[0]
            pid = '_'.join(parts[1:]) #in case '_' in id, eg for Sp
            if cxid in seen_cxs:
                label = ''
            else:
                label = dict_cxlabels[cxid]
                seen_cxs.add(cxid)
            rows.append([endpoint, pid, cxid, label])
    ut.write_tab_file(rows, ut.pre_ext(fname,'pfx_convert'))
Пример #11
0
def write_elution(elut, fname, forR=False):
    """
    Write out an elution in the spcount format
    $ProtID\tTotalCount\tCol1....
    """
    # Mask of protein rows that have at least one nonzero count.
    mask = np.sum(np.array(elut.mat), axis=1) > 0
    counts = np.array(elut.mat[mask, :])
    kept = list(np.array(elut.prots)[mask])
    if forR:
        # Transposed layout for R; the first column gets no header.
        header = kept
        data = [[frac] + counts[:, j].tolist()
                for j, frac in enumerate(elut.fractions)]
    else:
        header = "#ProtID TotalCount".split() + elut.fractions
        data = [[prot, np.sum(counts[j, :])] + counts[j, :].tolist()
                for j, prot in enumerate(kept)]
    ut.write_tab_file([header] + data, fname)
Пример #12
0
def munge_original(fdata, column_inds, fnames, fout, first_names=1):
    """
    Keep selected columns, replace 'NA' with '?', remove empty rows.
    Ids (first 2 columns) are kept automatically.
    For column inds, start with 0 for scores.
    Keep the same columns from the fnames file so I have a record of it.
    """
    empty_row = ['?'] * len(column_inds)
    kept_rows = []
    for row in ut.load_tab_file(fdata):
        ids = list(row[:2])
        data = []
        # column_inds index the score columns, which start 2 fields in
        for i in range(len(row)):
            if i in column_inds:
                val = row[i+2]
                data.append(val if val != 'NA' else '?')
        # drop rows that are nothing but missing values
        if data != empty_row:
            kept_rows.append(ids + data)
    ut.write_tab_file(kept_rows, fout)
    # Record which columns were kept, using the names file.
    all_names = list(ut.load_tab_file(fnames))[first_names:]
    names = [name for i, name in enumerate(all_names) if i in column_inds]
    ut.write_tab_file(names, ut.pre_ext(fout, '_names'))
Пример #13
0
def transpose(d, fin, fout):
    """
    Transpose a tab-separated file (rows <-> columns), cleaning it up for R.

    d: directory whose parent contains the utils module (added to sys.path).
    fin/fout: input and output file paths.
    Drops a trailing '#'-comment line (eg spcount output), removes the
    column-1 header (R doesn't like it), and drops a second 'total'/'descr'
    column if present.
    """
    sys.path.append(d+'/..')
    import utils as ut
    lines = [l for l in ut.load_tab_file(fin)]
    if lines[-1][0].startswith('#'):
        #ignore comments, such as last line in spcount output
        lines = lines[:-1]
        print "skipping last line"
    # NOTE(review): item assignment and remove() below require cols to be a
    # list -- presumably ut.zip_exact returns a list of tuples; confirm.
    cols = ut.zip_exact(*lines) #zip messes up if these files aren't neat
    # _After_ zipping, get rid of the column 1 header--R doesn't like it.
    col0list = list(cols[0])
    print col0list[0][0] 
    assert (col0list[0][0] == '#' or col0list[0] == 'Locus') # make sure we're removing what we should be
    col0list.remove(col0list[0])
    cols[0] = tuple(col0list)
    col2title = cols[1][0].lower()
    # get rid of the total/descr column
    if col2title.find('total') > -1 or col2title.find('descr') > -1:
        cols.remove(cols[1])
        print "removing second column--extraneous"
    ut.write_tab_file(cols, fout)
Пример #14
0
def exported_diff(cy_basefile, cy_difffile, col_header, diff_ppis=None,
        justids=False):
    """
    Makes a new cy_ file labeling whether that interaction is also found in the
    cy_difffile (or the diff_ppis--pass None for cy_difffile in that case).
    """
    def to_pair(row):
        # NOTE(review): keeps only the second '_'-field of each node id; ids
        # that themselves contain '_' would be truncated here -- confirm.
        return (row[0].split('_')[1], row[1].split('_')[1])
    if cy_difffile is None:
        diff_pairs = pd.PairDict(diff_ppis)
    else:
        diff_pairs = pd.PairDict([to_pair(row)
            for row in ut.load_lot(cy_difffile)[1:]])
    base = ut.load_lol(cy_basefile)
    header, rows = base[0], base[1:]
    if justids:
        header = header[:2]
        rows = [row[:2] for row in rows]
    # Append the membership flag to every row and to the header.
    labeled = [row + [diff_pairs.contains(to_pair(row))] for row in rows]
    ut.write_tab_file(labeled, ut.pre_ext(cy_basefile, col_header),
        header=header + [col_header])
Пример #15
0
def ensembl_prots_to_genes(fname, bar_split=None, second_split=False, 
        only_geneid_on_line=False, pid_replace=False):
    """
    Take a protein sequence file and keep only the longest sequence for each
    gene.  Designed for ensembl fasta sequence downloads.  Purpose is to run
    inparanoid for orthology only on the longest sequence per gene so as to
    have gene-based orthology, which is cleaner to understand.
    bar_split: use 1 for Dd, Sp, leave out for standard ensembl
    for Sp, use second_split=True
    pid_replace only works the first time--don't try it again after replacement
    """
    genes_dict = _longest_seqs_dep(fname, bar_split, 
            only_geneid_on_line=only_geneid_on_line)
    if pid_replace:
        # Rewrite each fasta header in place: '>geneid ... protein:<protid>'.
        for geneid, lines in genes_dict.items():
            fields = lines[0].split(' ')
            old_protid = fields[0].strip().strip('>')
            fields[0] = '>' + geneid
            fields.append('protein:' + old_protid)
            lines[0] = ' '.join(fields)
    # Concatenate every gene's lines into one flat list of output lines.
    genes_list = reduce(operator.add, list(genes_dict.values()))
    ut.write_tab_file(genes_list, fname+'_longest', islist=True)
Пример #16
0
def write_combined(fnames):
    """
    Combine the given files via keep_unique_lines_by_column and write the
    result to a '.combined' file named after the first input (its last two
    extensions replaced).  Returns the output filename.
    """
    merged = keep_unique_lines_by_column(fnames)
    name_parts = fnames[0].split('.')
    fout = '.'.join(name_parts[:-2]) + '.combined'
    ut.write_tab_file(merged, fout)
    return fout
Пример #17
0
def export_cxs(tested, fname, negmult):
    """
    Write tested rows (id1, id2, score) to fname, with the score rescaled
    by negmult via ut.rescale.
    """
    rows = []
    for t in tested:
        rows.append((t[0], t[1], ut.rescale(float(t[2]), negmult)))
    ut.write_tab_file(rows, fname)
Пример #18
0
def export_ints(tested, fname, negmult, header):
    """
    Write a header row plus one row per tested interaction: the pair ids,
    the score rescaled by negmult via ut.rescale, then any trailing fields.
    """
    rows = [header]
    for t in tested:
        rescaled = ut.rescale(float(t[2]), negmult)
        rows.append([t[0], t[1], rescaled] + list(t[3:]))
    ut.write_tab_file(rows, fname)