Пример #1
0
def msgf2seq_file(filepath, fasta_file, msb_psms):
    """
    msb_psms: set of spectid_peptidesequence
    """
    def parse_spec_pep_row(r):
        # get spec_pep from _best file format
        parsed = '_'.join(r[0].split('.')[:2] + [r[4]])
        #print parsed
        return parsed
    usedir,fin = os.path.split(filepath)
    # Get the sample filename from the first item of the third line
    fout = next(it.islice(ut.load_tab_file(filepath),2,3))[0].split('.')[0]
    in_gen = ut.load_tab_file(filepath)
    in_gen.next(); in_gen.next() # skip 2 lines
    p2g = seqs.prots2genes(fasta_file)
    g2p = ut.dict_inverse(p2g)
    fout = os.path.join(usedir, '.'.join([fout, fin.split('.')[-1] ,
        'sequestformat']))
    search = searches[filepath.split('.')[-1]]
    print "Converting/filtering; Search:", search
    output = (msgfbest2sequest_line(r,p2g, g2p, search) for r in in_gen 
            if parse_spec_pep_row(r) in msb_psms)
    print "Writing", fout
    ut.write_tab_file(output, fout)
    return fout
Пример #2
0
def fnet_names(fnet_file):
    """
    Return the list of data-column names for a functional-network file, or
    None when the file holds only a single data column.

    Names come from a companion '_names' file; a name containing '=' is
    truncated to the part before the '='.
    """
    filename = ut.proj_path('fnet_path',fnet_file)
    # Builtin next() instead of .next(): works on py2.6+ and py3, and is
    # consistent with the rest of this codebase.
    first = next(ut.load_tab_file(filename))
    nfields = len(first)-2  # first two columns are ids, not data
    if nfields > 1:
        return [l[0].strip() if l[0].find('=')==-1 else
                l[0].split('=')[0].strip() for l in
                ut.load_tab_file(ut.pre_ext(filename,'_names'))]
    else:
        return None #means there is only one data column.
Пример #3
0
def load_elution(fname, getname=True):
    """
    Load a tab-separated elution file into a Struct with:
      mat: float32 matrix of elution profile data
      prots: gene/protein ids from the first column
      fractions: fraction (column) names from the header row
      column2vals: second-column values, only when that column's header is
        'Total'/'TotalCount'/'Description'
      name: basename of fname without extension, when getname is True
    """
    # expected file structure:
    # first col: gene id
    # second col: treat differently if 2nd col header is 'Total' or
    # 'Description'
    # remaining cols: elution profile data
    lines = [l for l in ut.load_tab_file(fname)]
    # final row: total count in msblender output; don't skip in cuihong's data
    # startswith() is robust to an empty first cell, where indexing [0][0]
    # would raise IndexError.
    skip_final_row = lines[-1][0].startswith("#")
    rows = lines[1:-1] if skip_final_row else lines[1:]
    fractions = [f for f in lines[0][1:]]
    if fractions[0].lower() in ["total", "totalcount", "description"]:
        start_data_col = 2
        fractions.remove(fractions[0])
    else:
        start_data_col = 1
    mat = np.matrix([row[start_data_col:] for row in rows], dtype="float32")
    prots = [row[0] for row in rows]
    elut = Struct(mat=mat, prots=prots, fractions=fractions, filename=fname,
                  filename_original=fname)
    if start_data_col == 2:
        col2name_vals = [row[1] for row in rows]
        elut.column2vals = col2name_vals
    if getname:
        elut.name = os.path.basename(fname).split(".")[0]
    return elut
Пример #4
0
def load_elution(fname, getname=True):
    """
    Read a tab-separated elution profile file and return a Struct holding
    the data matrix, protein ids, fraction names, and source filename.
    """
    # File layout: header row of fraction names; first column gene ids;
    # optional second column ('Total'/'Description'); remaining columns data.
    lines = list(ut.load_tab_file(fname))
    # msblender output ends with a '#'-prefixed totals row; cuihong's doesn't.
    has_comment_tail = (lines[-1][0][0] == '#')
    if has_comment_tail:
        rows = lines[1:-1]
    else:
        rows = lines[1:]
    fractions = list(lines[0][1:])
    if fractions[0].lower() in ('total', 'totalcount', 'description'):
        data_start = 2
        del fractions[0]
    else:
        data_start = 1
    data = [row[data_start:] for row in rows]
    mat = np.matrix(data, dtype='float32')
    prots = [row[0] for row in rows]
    elut = Struct(mat=mat, prots=prots, fractions=fractions, filename=fname,
                  filename_original=fname)
    if data_start == 2:
        elut.column2vals = [row[1] for row in rows]
    if getname:
        elut.name = os.path.basename(fname).split('.')[0]
    return elut
Пример #5
0
def parse_msb_psms(fname):
    """
    Return the set of sample_spectrumid_peptide keys from an msblender psm
    file.  Column 0 looks like (sample.spectrum.charge.peptide):
    WAN110811_HCW_HEK293NE_P1D08.01387.2.SGNLTEDDKHNNAK
    The charge field is dropped from the key.
    """
    item1s = (line[0] for line in ut.load_tab_file(fname))
    # Builtin next() instead of generator.next(): py2.6+/py3 compatible.
    next(item1s) # skip the header line
    spect_pep = ('_'.join([sample,spect,pep]) for sample,spect,_,pep in 
            (i1.split('.') for i1 in item1s))
    return set(spect_pep)
Пример #6
0
def munge_malov(fdata):
    # load from proper columns
    cxs = {}
    for line in ut.load_tab_file(fdata):
        g,c = line[:2]
        cxs.setdefault(c,set([])).add(g)
    # remove (many) singletons
    for c,gset in cxs.items():
        if len(gset) < 2: 
            del cxs[c]
    ints = pd.PairDict([])
    # interpret "approved"/"provisional"/"temporary"
    def scorec(c):
        if c[0] == 'A':
            return 10
        elif c[0] == 'P':
            return 3
        elif c[0] == 'T':
            return 1
        else:
            print c[0]
            return 1
    for c,gset in cxs.items():
        score = scorec(c)
        for pair in it.combinations(gset,2):
            assert not ints.contains(pair), "ints contains %s" % pair[0]+pair[1]
            ints.append(pair, score)
    return ints
Пример #7
0
def load_weka_filtered_tpairs(fname, min_score=None):
    """
    Load weka-filtered tested pairs as ('','',score,label01) tuples sorted
    by descending score; optionally drop pairs scoring below min_score.
    """
    tested_pairs = [('','',r[0],true_to_1(r[1])) for r in
        ut.load_tab_file(fname)]
    # Sort numerically: the scores are strings, and lexicographic order is
    # wrong once magnitudes differ (e.g. '10' sorts before '9' as strings).
    tested_pairs.sort(key=lambda x: float(x[2]), reverse=True)
    if min_score is not None:
        tested_pairs = [t for t in tested_pairs if float(t[2])>=min_score]
    return tested_pairs
Пример #8
0
def load_pep2prots(filename, sep='|'):
    """
    Separator is '|' for most of andrew's files, but '&' for Nv and Xl.
    """
    pep2prots = dict(((line[0], set([p.split()[0] for p in
        line[1].split(sep)])) for line in ut.load_tab_file(filename)))
    print "First 10 peptide mappings:", pep2prots.items()[:10]
    return pep2prots
Пример #9
0
def load_corum(fname, filter_methods, do_dedupe):
    """
    Returns a list of tuples: (name, set(uniprotIDs), species)
    """
    lines = [l[:7] for l in ut.load_tab_file(fname, sep=';')][1:]
    cxs = [(name, set(prots.split(',')), species, method) 
            for _,name,_,species,prots,_,method in lines]
    if filter_methods:
        print "Filtering corum methods."
        keep_methods = set([x[0] for x in
            (ut.load_tab_file(ut.proj_path('corum_methods'))) if int(x[3])==1])
        cxs = [(n,p,s) for n,p,s,methods in cxs 
                if (len([m for m in methods.split('|') 
                    if m.split('-')[0].strip() in keep_methods]) > 0)]
    else:
        cxs = [(n,p,s) for n,p,s,m in cxs]
    return cxs
Пример #10
0
def elut_p2g(fname, p2g, suffix='_fix'):
    """
    Rewrite an elution file, mapping each id in column 0 through p2g;
    comment rows (first cell starts with '#') pass through untouched.
    Output is written to fname + suffix.
    """
    def mapped(rows):
        for row in rows:
            first = row[0]
            if first[0] == '#':
                yield row
            else:
                yield [p2g[first]] + list(row[1:])
    ut.write_tab_file(mapped(ut.load_tab_file(fname)), fname + suffix)
Пример #11
0
def munge_original(fdata, column_inds, fnames, fout, first_names=1):
    """
    Keep selected columns, replace 'NA' with '?', remove empty rows.
    Ids (first 2 columns) are kept automatically.
    For column inds, start with 0 for scores.
    Keep the same columns from the fnames file so I have a record of it.
    """
    empty_row = ['?'] * len(column_inds)
    kept = []
    for row in ut.load_tab_file(fdata):
        ids = list(row[:2])
        vals = []
        # data columns are offset by the 2 leading id columns
        for i in range(len(row)):
            if i in column_inds:
                v = row[i+2]
                vals.append('?' if v == 'NA' else v)
        if vals != empty_row:
            kept.append(ids + vals)
    ut.write_tab_file(kept, fout)
    # record the matching column names alongside the output
    all_names = list(ut.load_tab_file(fnames))[first_names:]
    names = [l for i, l in enumerate(all_names) if i in column_inds]
    ut.write_tab_file(names, ut.pre_ext(fout, '_names'))
Пример #12
0
def load_complexes_multiline(filename):
    """
    (Usually for 'clean' complexes).
    Load complexes in a file in the style of supp table 3: complexid,
    complexname, singlemember.
    Returns {complexname: set(members)}.
    """
    path = os.path.expanduser(filename)
    complexes = {}
    for row in ut.load_tab_file(path):
        name, member = row[1], row[2]
        if name not in complexes:
            complexes[name] = set([])
        complexes[name].add(member)
    return complexes
Пример #13
0
def mq2elut(fname, quant='iBAQ'):
    """
    Convert a maxquant output file to elution-style, keeping the per-sample
    quant columns and using the "majority protein" id (column 1, leading
    character dropped).  Writes to fname with a '_mq_<quant>' pre-extension.
    """
    lines = list(ut.load_tab_file(fname))
    header = lines[0]
    # want eg 'iBAQ WAN...', not 'iBAQ L WAN...'
    pattern = '^%s\s\w{2}' % quant
    inds = []
    for i, val in enumerate(header):
        if re.match(pattern, val) is not None:
            inds.append(i)
    # for now just using the "majority protein"
    prots = [l[1].split()[0][1:] for l in lines[1:]]
    out_header = [header[0]] + [header[i] for i in inds]
    out_rows = [[p] + [l[i] for i in inds] for p, l in zip(prots, lines[1:])]
    ut.write_tab_file([out_header] + out_rows,
            ut.pre_ext(fname, '_mq_%s' % quant))
Пример #14
0
def orth_pid2geneid(fname, p2g):
    # Rewrite an inparanoid-style orthology file, mapping protein ids to
    # gene ids via p2g in the space-separated ortholog columns (2 and 3).
    # Writes the result to fname + '_fix'.
    lines = ut.load_tab_file(fname)
    def process(lines):
        def replistp2g(pclist):
            # Even positions are protein ids (mapped through p2g); odd
            # positions are the interleaved confidence scores (kept as-is).
            return ' '.join([el if i%2 else p2g[el] 
                            for i,el in enumerate(pclist)])
        for n,items in enumerate(lines):
            # NOTE(review): n==1 passes the SECOND row through unmapped.
            # If the intent was to skip a header row, this should probably
            # be n==0 (enumerate starts at 0); elsewhere in this codebase
            # load_ogroups drops row 0 as the header.  Confirm against the
            # actual input format before changing.
            if n==1:
                yield items
            else:
                newitems = list(items[:2])
                # columns 2 and 3 hold the two species' ortholog lists
                for i in 2,3:
                    newitems.append(replistp2g(items[i].split()))
                yield newitems
    ut.write_tab_file(process(lines), fname+'_fix')
Пример #15
0
def load_complexes_singleline(filename, startindex=1):
    """
    (Usually for 'ppi' overlapping complexes)
    Load a corum-type file into {complexid: set([protein1, protein2,...])};
    column 0 is the complex id, members start at startindex.
    """
    path = os.path.expanduser(filename)
    complexes = {}
    # PPI complex set from traver has duplicate complex names with different
    # members.  Using this approach means all the members from any lines with
    # that complex's name get added.
    for row in ut.load_tab_file(path):
        members = row[startindex:]
        if members:
            complexes.setdefault(row[0], set([])).update(members)
    return complexes
Пример #16
0
def score_arr_ext(arr, species, ext_key):
    """
    Key_or_data: either a string matching one of the keys for ext data in
    config.py, or a tuple of (name,data) where data is a sequence of (id1, id2,
    score), and the sequence can be a generator.
    fnet_cols: list of columns or first 2 letters to include, eg ['HS','CE']
    """
    ext_file = ut.config()[ext_key]
    conv_dict = convdict_from_fname(species, ext_file)
    filename = ut.proj_path('fnet_path', ext_file)
    stored_names = fnet_names(ext_file) # None if only one data column.
    names = stored_names if stored_names else [ext_key]
    data_dict = load_net(ut.load_tab_file(filename))
    print 'External data file: %s; size: %s; cols: %s' % (ext_file,
            len(data_dict), len(names))
    score_arr(arr, species, names, data_dict, conv_dict)
Пример #17
0
def load_ogroups(from_sp, to_sp, fname=None):
    """
    Load an inparanoid table.Sp1-Sp2 file into a list of orthogroups, where
    each orthogroup is a tuple containing 1) a list of proteins in sp1 and 2) a
    list of proteins in sp2.
    Eg: [([HsProtA, HsProtB,..],[CeProtA,CeProtB,..]), ([..],[..]), ...]
    """
    # Protein ids alternate with meaningless conf scores in columns 2 and 3,
    # in the order given by the filename; take every other token.
    if fname is None:
        fname, swap_order = orth_fname(from_sp, to_sp)
    else:
        swap_order = False
    if swap_order:
        from_ind, to_ind = 3, 2
    else:
        from_ind, to_ind = 2, 3
    all_groups = [(row[from_ind].split()[::2], row[to_ind].split()[::2])
                  for row in ut.load_tab_file(fname)]
    return all_groups[1:]  # drop the header row
Пример #18
0
def load_ogroups(from_sp, to_sp, fname=None):
    """
    Load an inparanoid table.Sp1-Sp2 file into a list of orthogroups, where
    each orthogroup is a tuple containing 1) a list of proteins in sp1 and 2) a
    list of proteins in sp2.
    Eg: [([HsProtA, HsProtB,..],[CeProtA,CeProtB,..]), ([..],[..]), ...]
    """
    if fname is None:
        fname, swap_order = orth_fname(from_sp, to_sp)
    else:
        swap_order = False
    from_ind, to_ind = (3, 2) if swap_order else (2, 3)
    def every_other(field):
        # protein ids alternate with meaningless conf scores
        return field.split()[::2]
    parsed = [(every_other(row[from_ind]), every_other(row[to_ind]))
              for row in ut.load_tab_file(fname)]
    return parsed[1:]  # skip the header row
Пример #19
0
def transpose(d, fin, fout):
    sys.path.append(d+'/..')
    import utils as ut
    lines = [l for l in ut.load_tab_file(fin)]
    if lines[-1][0].startswith('#'):
        #ignore comments, such as last line in spcount output
        lines = lines[:-1]
        print "skipping last line"
    cols = ut.zip_exact(*lines) #zip messes up if these files aren't neat
    # _After_ zipping, get rid of the column 1 header--R doesn't like it.
    col0list = list(cols[0])
    print col0list[0][0] 
    assert (col0list[0][0] == '#' or col0list[0] == 'Locus') # make sure we're removing what we should be
    col0list.remove(col0list[0])
    cols[0] = tuple(col0list)
    col2title = cols[1][0].lower()
    # get rid of the total/descr column
    if col2title.find('total') > -1 or col2title.find('descr') > -1:
        cols.remove(cols[1])
        print "removing second column--extraneous"
    ut.write_tab_file(cols, fout)
Пример #20
0
def pairs(fname):
    """Return every row of the tab file, each converted to a list."""
    rows = []
    for row in ut.load_tab_file(fname):
        rows.append(list(row))
    return rows
Пример #21
0
from __future__ import division
import os
from os.path import abspath
import sys
sys.path.append(os.path.dirname(abspath(__file__))+'/../')
import utils as ut

def move(fname, fmap):
    """
    For renaming a file based on a mapping old_fname to new_fname.
    NOT for moving a file to mapped folder.  That's the other script.
    """
    basename = ut.shortname(fname)
    fext = os.path.splitext(fname)[1]
    fdir = os.path.split(fname)[0]
    if basename in fmap:
        newname = os.path.join(fdir,fmap[basename] + fext)
        print "moving", fname, newname
        os.rename(fname, newname)
    else:
        print "not found", fname

if __name__ == '__main__':
    # Script entry: rename each given file according to a two-column tab
    # file mapping old basename -> new basename.
    if len(sys.argv) < 2:
        sys.exit("usage: python blah.py mapfile.txt filename(s)") 
    fname_map = sys.argv[1]
    filenames = sys.argv[2:]
    # two-column tab file -> {old_basename: new_basename}
    fmap = dict(ut.load_tab_file(fname_map))
    for f in filenames:
        move(f, fmap)
Пример #22
0
def keep_unique_lines_by_column(fnames, column=0):
    """
    Group all lines across fnames via collect_dict and yield one
    randomly-chosen line per group.

    NOTE(review): the `column` parameter is never used here -- presumably
    collect_dict was meant to key on it; confirm against collect_dict's
    definition.
    """
    all_lines = (line for f in fnames for line in ut.load_tab_file(f))
    grouped = collect_dict(all_lines)
    return (random.choice(list(group)) for key, group in grouped.items())
        print "File not found:", fpath
        return
    basename = ut.shortname(fpath)
    if remove_final_underscore:
        basename = ('_'.join(basename.split('_')[:3]) 
                if len(basename.split('_'))>2 else basename)
    if not basename in file2folder:
        print "No mapping for file:", fpath, basename
        return 
    folder = file2folder[basename]
    if not os.path.exists(folder):
        print "Creating directory", folder
        os.mkdir(folder)
    newpath = os.path.join(folder, os.path.split(fpath)[1])
    if os.path.exists(newpath):
        print "File exists:", newpath
    else:
        print "Moving to", newpath
        os.rename(fpath, newpath)

if __name__ == '__main__':
    # Script entry: move each given file into its mapped folder, per a
    # two-column tab file mapping file basename -> destination folder.
    if len(sys.argv) < 2:
        sys.exit("usage: python blah.py files2folders.txt remove_final_underscore{0,1} filename(s)") 
    fname_map = sys.argv[1]
    # second arg: 1 to trim the basename at its underscore-delimited parts
    # before looking it up in the mapping
    remove_final_underscore = int(sys.argv[2])
    print "Remove final underscore", "yes" if remove_final_underscore else "no"
    filenames = sys.argv[3:]
    # two-column tab file -> {file_basename: destination_folder}
    files2folders = dict(ut.load_tab_file(fname_map))
    for f in filenames:
        maybe_move(f, files2folders, remove_final_underscore)