def msgf2seq_file(filepath, fasta_file, msb_psms): """ msb_psms: set of spectid_peptidesequence """ def parse_spec_pep_row(r): # get spec_pep from _best file format parsed = '_'.join(r[0].split('.')[:2] + [r[4]]) #print parsed return parsed usedir,fin = os.path.split(filepath) # Get the sample filename from the first item of the third line fout = next(it.islice(ut.load_tab_file(filepath),2,3))[0].split('.')[0] in_gen = ut.load_tab_file(filepath) in_gen.next(); in_gen.next() # skip 2 lines p2g = seqs.prots2genes(fasta_file) g2p = ut.dict_inverse(p2g) fout = os.path.join(usedir, '.'.join([fout, fin.split('.')[-1] , 'sequestformat'])) search = searches[filepath.split('.')[-1]] print "Converting/filtering; Search:", search output = (msgfbest2sequest_line(r,p2g, g2p, search) for r in in_gen if parse_spec_pep_row(r) in msb_psms) print "Writing", fout ut.write_tab_file(output, fout) return fout
def fnet_names(fnet_file):
    """Return the list of data-column names for a fnet file.

    Returns None when the file has only a single data column (no separate
    names file is consulted in that case).
    """
    path = ut.proj_path('fnet_path', fnet_file)
    header = ut.load_tab_file(path).next()
    if len(header) - 2 <= 1:
        return None #means there is only one data column.
    names = []
    for row in ut.load_tab_file(ut.pre_ext(path, '_names')):
        label = row[0]
        # names may carry an '=value' suffix; keep only the name part
        if label.find('=') != -1:
            label = label.split('=')[0]
        names.append(label.strip())
    return names
def load_elution(fname, getname=True):
    """Load a tab-separated elution profile file into a Struct.

    Returns a Struct with: mat (float32 matrix of elution data), prots
    (first-column ids), fractions (data column headers), filename /
    filename_original, optionally column2vals (second-column values when
    the second column is metadata) and name (file basename sans extension).
    """
    # expected file structure:
    # first col: gene id
    # second col: treat differently if 2nd col header is 'Total' or
    # 'Description'
    # remaining cols: elution profile data
    lines = [l for l in ut.load_tab_file(fname)]
    # final row: total count in msblender output; don't skip in cuihong's data
    skip_final_row = lines[-1][0][0] == "#"
    rows = lines[1:-1] if skip_final_row else lines[1:]
    fractions = [f for f in lines[0][1:]]
    if fractions[0].lower() in ["total", "totalcount", "description"]:
        # second column is metadata, not elution data; drop its header
        start_data_col = 2
        fractions.remove(fractions[0])
    else:
        start_data_col = 1
    mat = np.matrix([row[start_data_col:] for row in rows], dtype="float32")
    prots = [row[0] for row in rows]
    elut = Struct(mat=mat, prots=prots, fractions=fractions, filename=fname,
            filename_original=fname)
    if start_data_col == 2:
        # preserve the metadata column's values alongside the matrix
        col2name_vals = [row[1] for row in rows]
        elut.column2vals = col2name_vals
    if getname:
        elut.name = os.path.basename(fname).split(".")[0]
    return elut
def load_elution(fname, getname=True):
    """Read a tab-separated elution profile file into a Struct.

    Expected structure: first column gene id; second column treated as
    metadata when its header is 'Total'/'TotalCount'/'Description';
    remaining columns are elution profile data.
    """
    all_lines = list(ut.load_tab_file(fname))
    header = all_lines[0]
    # msblender output ends in a '#'-prefixed totals row; cuihong's data
    # does not, so only drop the last row when the marker is present.
    if all_lines[-1][0][0] == '#':
        data_rows = all_lines[1:-1]
    else:
        data_rows = all_lines[1:]
    fractions = list(header[1:])
    if fractions[0].lower() in ['total', 'totalcount', 'description']:
        first_data = 2
        fractions.remove(fractions[0])
    else:
        first_data = 1
    mat = np.matrix([row[first_data:] for row in data_rows], dtype='float32')
    ids = [row[0] for row in data_rows]
    elut = Struct(mat=mat, prots=ids, fractions=fractions, filename=fname,
            filename_original=fname)
    if first_data == 2:
        # keep the metadata column's values too
        elut.column2vals = [row[1] for row in data_rows]
    if getname:
        elut.name = os.path.basename(fname).split('.')[0]
    return elut
def parse_msb_psms(fname):
    """Return the set of sample_spectrum_peptide identifiers in an msblender
    file (first column, header line skipped)."""
    # first column, ex: WAN110811_HCW_HEK293NE_P1D08.01387.2.SGNLTEDDKHNNAK
    col0 = (row[0] for row in ut.load_tab_file(fname))
    col0.next() # skip 1 header line
    keys = set()
    for entry in col0:
        # fields: sample.spectrum.charge.peptide; charge is dropped
        sample, spect, _, pep = entry.split('.')
        keys.add('_'.join([sample, spect, pep]))
    return keys
def munge_malov(fdata): # load from proper columns cxs = {} for line in ut.load_tab_file(fdata): g,c = line[:2] cxs.setdefault(c,set([])).add(g) # remove (many) singletons for c,gset in cxs.items(): if len(gset) < 2: del cxs[c] ints = pd.PairDict([]) # interpret "approved"/"provisional"/"temporary" def scorec(c): if c[0] == 'A': return 10 elif c[0] == 'P': return 3 elif c[0] == 'T': return 1 else: print c[0] return 1 for c,gset in cxs.items(): score = scorec(c) for pair in it.combinations(gset,2): assert not ints.contains(pair), "ints contains %s" % pair[0]+pair[1] ints.append(pair, score) return ints
def load_weka_filtered_tpairs(fname, min_score=None):
    """Load weka-filtered tested pairs as ('','',score,label) tuples,
    sorted by score descending, optionally dropping scores below min_score.

    Fix: scores are strings read from the file; the previous sort compared
    them lexicographically ('10' sorts before '9'). Sort numerically.
    """
    tested_pairs = [('','',r[0],true_to_1(r[1]))
            for r in ut.load_tab_file(fname)]
    # scores are strings -- sort by numeric value, not text
    tested_pairs.sort(key=lambda x: float(x[2]), reverse=True)
    if min_score is not None:
        tested_pairs = [t for t in tested_pairs if float(t[2]) >= min_score]
    return tested_pairs
def load_pep2prots(filename, sep='|'): """ Separator is '|' for most of andrew's files, but '&' for Nv and Xl. """ pep2prots = dict(((line[0], set([p.split()[0] for p in line[1].split(sep)])) for line in ut.load_tab_file(filename))) print "First 10 peptide mappings:", pep2prots.items()[:10] return pep2prots
def load_corum(fname, filter_methods, do_dedupe): """ Returns a list of tuples: (name, set(uniprotIDs), species) """ lines = [l[:7] for l in ut.load_tab_file(fname, sep=';')][1:] cxs = [(name, set(prots.split(',')), species, method) for _,name,_,species,prots,_,method in lines] if filter_methods: print "Filtering corum methods." keep_methods = set([x[0] for x in (ut.load_tab_file(ut.proj_path('corum_methods'))) if int(x[3])==1]) cxs = [(n,p,s) for n,p,s,methods in cxs if (len([m for m in methods.split('|') if m.split('-')[0].strip() in keep_methods]) > 0)] else: cxs = [(n,p,s) for n,p,s,m in cxs] return cxs
def elut_p2g(fname, p2g, suffix='_fix'):
    """Rewrite an elution file mapping column-0 protein ids to gene ids via
    p2g; '#'-prefixed comment rows pass through unchanged. Output goes to
    fname+suffix."""
    def converted():
        for items in ut.load_tab_file(fname):
            if items[0][0] == '#':
                yield items
            else:
                yield [p2g[items[0]]] + list(items[1:])
    ut.write_tab_file(converted(), fname+suffix)
def munge_original(fdata, column_inds, fnames, fout, first_names=1):
    """
    Keep selected columns, replace 'NA' with '?', remove empty rows. Ids
    (first 2 columns) are kept automatically. For column inds, start with 0
    for scores. Keep the same columns from the fnames file so I have a
    record of it.

    Fix: the column loop previously ranged over len(l) while indexing
    l[i+2], so a column index near the row end raised IndexError; the
    range is now bounded to the data columns that actually exist.
    """
    column_inds = set(column_inds) # O(1) membership tests below
    out = []
    default = ['?'] * len(column_inds)
    for l in ut.load_tab_file(fdata):
        ids = list(l[:2])
        # data columns start at offset 2, so only len(l)-2 of them exist
        newdata = [l[i+2] if l[i+2]!='NA' else '?'
                for i in range(len(l)-2) if i in column_inds]
        if newdata != default:
            out.append(ids + newdata)
    ut.write_tab_file(out, fout)
    names = [l for i,l in enumerate(
        list(ut.load_tab_file(fnames))[first_names:]) if i in column_inds]
    ut.write_tab_file(names, ut.pre_ext(fout, '_names'))
def load_complexes_multiline(filename):
    """
    (Usually for 'clean' complexes). Load complexes in a file in the style
    of supp table 3: complexid, complexname, singlemember.
    Returns {complexname: set(members)}.
    """
    complexes = {}
    for row in ut.load_tab_file(os.path.expanduser(filename)):
        members = complexes.setdefault(row[1], set([]))
        members.add(row[2])
    return complexes
def mq2elut(fname, quant='iBAQ'):
    """Extract the per-sample quant columns from a maxquant output file and
    write them in elution format to fname pre-extended with '_mq_<quant>'."""
    lines = list(ut.load_tab_file(fname))
    header = lines[0]
    # want eg 'iBAQ WAN...', not 'iBAQ L WAN...'
    pattern = '^%s\s\w{2}' % quant
    inds = [i for i,col in enumerate(header)
            if re.match(pattern, col) is not None]
    # for now just using the "majority protein" (column 1, id after the
    # leading character)
    prots = [row[1].split()[0][1:] for row in lines[1:]]
    out_rows = [[header[0]] + [header[i] for i in inds]]
    for p, row in zip(prots, lines[1:]):
        out_rows.append([p] + [row[i] for i in inds])
    ut.write_tab_file(out_rows, ut.pre_ext(fname, '_mq_%s' % quant))
def orth_pid2geneid(fname, p2g):
    """Rewrite an inparanoid-style orthology table, mapping protein ids in
    columns 2 and 3 to gene ids via p2g; writes to fname+'_fix'."""
    lines = ut.load_tab_file(fname)
    def process(lines):
        def replistp2g(pclist):
            # columns 2/3 alternate protein id and confidence score;
            # map the ids (even positions), leave the scores as-is
            return ' '.join([el if i%2 else p2g[el]
                for i,el in enumerate(pclist)])
        for n,items in enumerate(lines):
            # NOTE(review): n==1 passes the SECOND row through unconverted.
            # If this was meant to skip a header row it should probably be
            # n==0 -- confirm against the input file format.
            if n==1:
                yield items
            else:
                newitems = list(items[:2])
                for i in 2,3:
                    newitems.append(replistp2g(items[i].split()))
                yield newitems
    ut.write_tab_file(process(lines), fname+'_fix')
def load_complexes_singleline(filename, startindex=1):
    """ (Usually for 'ppi' overlapping complexes) """
    # corum-type file: first column is the complex id, members follow from
    # startindex onward. The PPI complex set from traver has duplicate
    # complex names with different members; setdefault unions the members
    # from every line carrying that complex's name.
    filename = os.path.expanduser(filename)
    complexes = {}
    for row in ut.load_tab_file(filename):
        members = complexes.setdefault(row[0], set([]))
        for prot in row[startindex:]:
            members.add(prot)
    return complexes
def score_arr_ext(arr, species, ext_key): """ Key_or_data: either a string matching one of the keys for ext data in config.py, or a tuple of (name,data) where data is a sequence of (id1, id2, score), and the sequence can be a generator. fnet_cols: list of columns or first 2 letters to include, eg ['HS','CE'] """ ext_file = ut.config()[ext_key] conv_dict = convdict_from_fname(species, ext_file) filename = ut.proj_path('fnet_path', ext_file) stored_names = fnet_names(ext_file) # None if only one data column. names = stored_names if stored_names else [ext_key] data_dict = load_net(ut.load_tab_file(filename)) print 'External data file: %s; size: %s; cols: %s' % (ext_file, len(data_dict), len(names)) score_arr(arr, species, names, data_dict, conv_dict)
def load_ogroups(from_sp, to_sp, fname=None):
    """ Load an inparanoid table.Sp1-Sp2 file into a list of orthogroups,
    where each orthogroup is a tuple containing 1) a list of proteins in sp1
    and 2) a list of proteins in sp2.
    Eg: [([HsProtA, HsProtB,..],[CeProtA,CeProtB,..]), ([..],[..]), ...]
    """
    if fname is None:
        fname, swap_order = orth_fname(from_sp, to_sp)
    else:
        swap_order = False
    # protein ids alternate with meaningless conf scores in columns 2 and 3
    # in the order of the filename; [::2] keeps just the ids
    from_ind, to_ind = (3, 2) if swap_order else (2, 3)
    groups = []
    for row in ut.load_tab_file(fname):
        groups.append((row[from_ind].split()[::2],
                       row[to_ind].split()[::2]))
    # first entry came from the header row; drop it
    return groups[1:]
def load_ogroups(from_sp, to_sp, fname=None): """ Load an inparanoid table.Sp1-Sp2 file into a list of orthogroups, where each orthogroup is a tuple containing 1) a list of proteins in sp1 and 2) a list of proteins in sp2. Eg: [([HsProtA, HsProtB,..],[CeProtA,CeProtB,..]), ([..],[..]), ...] """ # Skip header row; protein ids alternate with meaningless conf scores in # columns 2 and 3 in the order of the filename if fname is None: fname, swap_order = orth_fname(from_sp, to_sp) else: fname, swap_order = fname, False (from_ind, to_ind) = (2, 3) if not swap_order else (3, 2) ogroups = [([p for p in row[from_ind].split()[::2]], [p for p in row[to_ind].split()[::2]]) for row in ut.load_tab_file(fname)][1:] return ogroups
def transpose(d, fin, fout): sys.path.append(d+'/..') import utils as ut lines = [l for l in ut.load_tab_file(fin)] if lines[-1][0].startswith('#'): #ignore comments, such as last line in spcount output lines = lines[:-1] print "skipping last line" cols = ut.zip_exact(*lines) #zip messes up if these files aren't neat # _After_ zipping, get rid of the column 1 header--R doesn't like it. col0list = list(cols[0]) print col0list[0][0] assert (col0list[0][0] == '#' or col0list[0] == 'Locus') # make sure we're removing what we should be col0list.remove(col0list[0]) cols[0] = tuple(col0list) col2title = cols[1][0].lower() # get rid of the total/descr column if col2title.find('total') > -1 or col2title.find('descr') > -1: cols.remove(cols[1]) print "removing second column--extraneous" ut.write_tab_file(cols, fout)
def pairs(fname):
    """Return each line of a tab file as a list."""
    out = []
    for row in ut.load_tab_file(fname):
        out.append(list(row))
    return out
from __future__ import division import os from os.path import abspath import sys sys.path.append(os.path.dirname(abspath(__file__))+'/../') import utils as ut def move(fname, fmap): """ For renaming a file based on a mapping old_fname to new_fname. NOT for moving a file to mapped folder. That's the other script. """ basename = ut.shortname(fname) fext = os.path.splitext(fname)[1] fdir = os.path.split(fname)[0] if basename in fmap: newname = os.path.join(fdir,fmap[basename] + fext) print "moving", fname, newname os.rename(fname, newname) else: print "not found", fname if __name__ == '__main__': if len(sys.argv) < 2: sys.exit("usage: python blah.py mapfile.txt filename(s)") fname_map = sys.argv[1] filenames = sys.argv[2:] fmap = dict(ut.load_tab_file(fname_map)) for f in filenames: move(f, fmap)
def keep_unique_lines_by_column(fnames, column=0):
    # Pool the lines of all input files, group them via collect_dict, and
    # yield one randomly-chosen line per group (non-deterministic).
    # NOTE(review): the `column` parameter is never used in this body --
    # presumably it was meant to be passed to collect_dict to pick the
    # grouping column; confirm against collect_dict's signature.
    lines_dict = collect_dict((line for f in fnames
        for line in ut.load_tab_file(f)))
    return (random.choice(list(lines))
            for key,lines in lines_dict.items())
print "File not found:", fpath return basename = ut.shortname(fpath) if remove_final_underscore: basename = ('_'.join(basename.split('_')[:3]) if len(basename.split('_'))>2 else basename) if not basename in file2folder: print "No mapping for file:", fpath, basename return folder = file2folder[basename] if not os.path.exists(folder): print "Creating directory", folder os.mkdir(folder) newpath = os.path.join(folder, os.path.split(fpath)[1]) if os.path.exists(newpath): print "File exists:", newpath else: print "Moving to", newpath os.rename(fpath, newpath) if __name__ == '__main__': if len(sys.argv) < 2: sys.exit("usage: python blah.py files2folders.txt remove_final_underscore{0,1} filename(s)") fname_map = sys.argv[1] remove_final_underscore = int(sys.argv[2]) print "Remove final underscore", "yes" if remove_final_underscore else "no" filenames = sys.argv[3:] files2folders = dict(ut.load_tab_file(fname_map)) for f in filenames: maybe_move(f, files2folders, remove_final_underscore)