def prepare_run(self, log, info):
    """Build NetMHCcons invocations for every unique peptide in the iProphet CSV.

    Reads the peptide table named by info['PEPCSV'], keeps one row per
    distinct 'search_hit' sequence, and asks pso.prepare_for_NetMhcCons
    to generate the per-allele command lines in the working directory.

    :param log:  logger (unused in this variant)
    :param info: pipeline state dict; reads Keys.WORKDIR, 'PEPCSV',
                 'ALLELE_LIST' and 'NETMHCCONS'
    :return: (info, commands) where commands is the list of NetMHCcons
             command lines to execute
    """
    work_dir = info[Keys.WORKDIR]
    pep_csv_path = info['PEPCSV']
    allele_list = pso.fixallele(info['ALLELE_LIST'])
    netmhc_exe = info['NETMHCCONS']

    # Full table is kept on self; only distinct peptide sequences are scored.
    self.df = pd.read_csv(pep_csv_path, sep="\t", header=0)
    unique_hits = self.df.drop_duplicates(subset=['search_hit'])
    peptides = unique_hits['search_hit'].tolist()

    self.outfiles, run_commands = pso.prepare_for_NetMhcCons(
        work_dir, peptides, allele_list, netmhc_exe)
    return info, run_commands
def prepare_run(self, log, info):
    """Build per-sample NetMHCcons invocations by joining iProphet hits to a sample DB.

    Loads the iProphet peptide CSV (info['PEPCSV']) and a sample metadata CSV
    (info['DB_PATH']), derives a common 'fileid' (file name without extension)
    on both tables, inner-joins them, and for each SampleID prepares
    NetMHCcons commands for that sample's unique peptides and alleles in a
    per-sample subdirectory of the work dir.

    :param log:  logger; warnings on ambiguous allele sets, debug for paths
    :param info: pipeline state dict; reads Keys.WORKDIR, 'PEPCSV',
                 'NETMHCCONS' and 'DB_PATH'
    :return: (info, self.commands) — the accumulated NetMHCcons command lines
    """
    workdir = info[Keys.WORKDIR]
    iprophoutput = info['PEPCSV']
    exe = info['NETMHCCONS']
    dbpath = info['DB_PATH']

    self.db = pd.read_csv(dbpath, sep=",", header=0)
    self.iprophet = pd.read_csv(iprophoutput, sep="\t", header=0)

    # Derive a join key by stripping extensions. NOTE: the original code
    # assigned a bare map() here, which only works on Python 2 (where map
    # returns a list); list comprehensions are correct on both 2 and 3.
    # 'spectrum' keys are truncated at the FIRST dot (spectrum ids carry
    # scan-number suffixes), while FileName only loses its final extension.
    self.iprophet['fileid'] = [w.split(".")[0] for w in self.iprophet['spectrum']]
    self.db['fileid'] = [os.path.splitext(w)[0] for w in self.db['FileName']]

    self.mergefiles = self.iprophet.merge(self.db, on='fileid', how='inner')
    sampleIDs = self.mergefiles['SampleID'].unique()

    self.outfiles = []
    self.commands = []
    for sampleID in sampleIDs:
        allSet = self.mergefiles[self.mergefiles['SampleID'] == sampleID]
        alleles = (allSet['MHCAllele'].unique()).tolist()
        if len(alleles) != 1:
            # A sample is expected to map to exactly one allele string;
            # anything else is suspicious but not fatal — first one wins.
            log.warning("there is more than one allele : {}".format('-'.join(alleles)))
        pepseq = (allSet['search_hit'].unique()).tolist()
        alleles = alleles[0].split(',')

        # One scratch directory per sample for the NetMHCcons input/output.
        tmpdir = os.path.join(workdir, sampleID)
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)
        # Was a bare Python-2 `print` statement (a syntax error on Python 3
        # and an untracked stdout write); route it through the logger.
        log.debug(tmpdir)

        outfiles, commands = pso.prepare_for_NetMhcCons(tmpdir, pepseq, alleles, exe)
        self.outfiles += outfiles
        self.commands += commands
    return info, self.commands