def run_spaan(inputFasta, outputDir, rawFlag):
    """Run SPAAN on inputFasta and write <stem>.spaan.tsv under outputDir/SPAAN.

    Sequences absent from the SPAAN output get a default score of 0.0.
    The raw SPAAN output file is kept only when rawFlag is truthy.
    """
    command = SPAAN_PATH
    rawOutput = os.path.join(outputDir, "SPAAN")
    if not os.path.exists(rawOutput):
        os.mkdir(rawOutput)
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    # Build the raw-output path once instead of re-joining it three times.
    rawFile = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
    # NOTE(review): command/paths are not shell-quoted; inputs are expected to
    # be trusted local paths.  subprocess.run with a list would be safer if
    # untrusted paths can ever reach here.
    os.system("%s %s %s" % (command, inputFasta, rawFile))
    values = {}
    with open(rawFile) as fh:
        for row in fh.read().split('\n')[1:]:  # first line is a header
            if row == '':
                continue
            tokens = row.split('\t')
            # Column 2 carries the FASTA description (with a leading '>'),
            # column 1 the SPAAN adhesin-probability score.
            values[tokens[2].strip('>')] = tokens[1]
    output = ['ID\tSPAAN_Score']
    for fastaID in sequenceIDs:
        output.append('\t'.join([fastaID, values.get(fastaID, '0.0')]))
    with open(os.path.join(rawOutput, "%s.spaan.tsv" % Path(inputFasta).stem),
              'w') as fh:
        fh.write('\n'.join(output))
    if not rawFlag:
        os.remove(rawFile)
def run_psortb(inputFasta, outputDir, organism, multiFlag, process, rawFlag):
    """Run PSORTb and write <stem>.psortb.tsv under outputDir/PSORTB.

    organism selects gram-positive ("gram+"/"g+") or gram-negative
    ("gram-"/"g-") mode; any other value raises ValueError (previously this
    left `rawFile` unbound and crashed with a NameError further down).
    Localization scores are rescaled from PSORTb's 0-10 range to 0-1.
    """
    command = PSORTB_PATH
    rawOutput = os.path.join(outputDir, "PSORTB")
    if not os.path.exists(rawOutput):
        os.mkdir(rawOutput)
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    # Purge stale result files from any previous run.  The original only
    # removed grampos files, so a gram- run could pick up a stale
    # psortb_gramneg.txt left behind by an earlier invocation.
    for file in os.listdir(rawOutput):
        if 'psortb_grampos.txt' in file or 'psortb_gramneg.txt' in file:
            os.remove(os.path.join(rawOutput, file))
    if organism.lower() in ["gram+", "g+"]:
        os.system("%s -p -i %s" % (command, inputFasta))
        pattern = 'psortb_grampos.txt'
    elif organism.lower() in ["gram-", "g-"]:
        os.system("%s -n -i %s" % (command, inputFasta))
        pattern = 'psortb_gramneg.txt'
    else:
        raise ValueError("Unsupported organism type: %s" % organism)
    # PSORTb is expected to drop its result file into rawOutput; locate it.
    rawFile = None
    for file in os.listdir(rawOutput):
        if pattern in file:
            rawFile = file
            break
    if rawFile is None:
        raise RuntimeError(
            "PSORTb produced no %s output in %s" % (pattern, rawOutput))
    output = [
        '\t'.join([
            "ID", "SubcellularLocation", "Extracellular_Probability",
            "CytoplasmicMembrane_Probability", "Cytoplasmic_Probability",
            "Cellwall_Probability", "Periplasmic_Probability",
            "OuterMembrane_Probability"
        ])
    ]
    locs = [
        "Extracellular", "CytoplasmicMembrane", "Cytoplasmic", "Cellwall",
        "Periplasmic", "OuterMembrane"
    ]
    values = {}
    with open(os.path.join(rawOutput, rawFile)) as fh:
        # Entries are separated by a 79-dash rule; the last split chunk is
        # trailing text, not an entry.
        for entry in re.split('[-]{79}', fh.read())[:-1]:
            # First line holds "SeqID: <description>"; strip the 7-char prefix.
            fastaID = entry.strip().split('\n')[0][7:].strip()
            value = [""] + ["0.0"] * len(locs)
            value[0] = re.split(
                '[ ]+',
                entry[entry.find("Final Prediction:"):].split('\n')[1])[1]
            scoreBlock = entry[entry.find("Localization Scores:"):
                               entry.find("Final Prediction:")]
            for row in scoreBlock.split('\n')[1:-1]:
                tokens = re.split('[ ]+', row)
                # Rescale 0-10 score to a 0-1 probability.
                value[locs.index(tokens[1]) + 1] = str(float(tokens[2]) / 10.0)
            values[fastaID] = value
    for fastaID in sequenceIDs:
        # Default row for sequences PSORTb skipped (previously a KeyError).
        output.append('\t'.join(
            [fastaID] + values.get(fastaID, [""] + ["0.0"] * len(locs))))
    with open(os.path.join(rawOutput, "%s.psortb.tsv" % Path(inputFasta).stem),
              'w') as fh:
        fh.write('\n'.join(output))
    if not rawFlag:
        os.remove(os.path.join(rawOutput, rawFile))
def split_files(inputFasta, tmpDir):
    """Split inputFasta into chunks of at most SPLIT_LIMIT sequences each.

    Writes tmpDir/input.fasta.<i> chunk files and returns
    (inFiles, outFiles) where outFiles are the matching tmpDir/output.raw.<i>
    paths for downstream tools (the output files are not created here).
    """
    fasta = readfasta.readFasta(inputFasta, key="full", strip=False)
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    # BUG FIX: the original computed math.ceil(len(sequenceIDs)) / SPLIT_LIMIT,
    # i.e. applied ceil() to the (already integral) count instead of to the
    # quotient.  Compute the chunk count as ceil(count / SPLIT_LIMIT), min 1.
    chunks = max(1, int(math.ceil(len(sequenceIDs) / float(SPLIT_LIMIT))))
    size = int(math.ceil(len(sequenceIDs) / float(chunks)))
    inFiles = []
    outFiles = []
    keys = list(fasta.keys())
    for i in range(chunks):
        # Keys include the '>' header line; values keep their trailing
        # newlines (strip=False), so plain concatenation is valid FASTA.
        textBuffer = ''.join(
            fastaID + fasta[fastaID] for fastaID in keys[i * size:(i + 1) * size])
        inFile = os.path.join(tmpDir, "input.fasta.%i" % i)
        with open(inFile, 'w') as fh:
            fh.write(textBuffer)
        inFiles.append(inFile)
        outFiles.append(os.path.join(tmpDir, "output.raw.%i" % i))
    return (inFiles, outFiles)
def run_immugen(inputFasta, outputDir, rawFlag):
    """Run the immunogenicity predictor and write <stem>.imgen.tsv.

    The tool is a Python 2.7 script that reads one bare sequence per line;
    its CSV output (sequence, ..., score) is mapped back to FASTA IDs.
    Sequences with no score default to 0.0.  Raw files are kept only when
    rawFlag is truthy.
    """
    command = IMGEN_PATH
    rawOutput = os.path.join(outputDir, "IMGEN")
    if not os.path.exists(rawOutput):
        os.mkdir(rawOutput)
    fasta = readfasta.readFasta(inputFasta, key="full")
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    tmpInput = os.path.join(rawOutput, "sequence.tmp")
    # Write all sequences in one pass (the original re-opened the file in
    # append mode once per sequence).
    with open(tmpInput, 'w') as fh:
        for seq in fasta.values():
            fh.write("%s\n" % seq)
    rawFile = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
    os.system("python2.7 %s %s > %s" % (command, tmpInput, rawFile))
    # Precompute sequence -> FASTA ID once instead of rebuilding two lists
    # and doing a linear .index() search per output row (was O(n^2)).
    # setdefault keeps the FIRST ID for duplicate sequences, matching the
    # original list.index() behavior.
    seqToID = {}
    for fastaID, seq in fasta.items():
        seqToID.setdefault(seq, fastaID)
    values = {}
    with open(rawFile) as fh:
        for row in fh.read().split('\n')[4:]:  # first 4 lines are a banner
            if len(row) == 0:
                continue
            tokens = row.split(',')
            # tokens[0] = sequence, tokens[2] = immunogenicity score
            values[seqToID[tokens[0]]] = tokens[2]
    output = ['ID\tImmunogenicity_Score']
    for fastaID in sequenceIDs:
        output.append('\t'.join([fastaID, values.get(fastaID, '0.0')]))
    with open(os.path.join(rawOutput, "%s.imgen.tsv" % Path(inputFasta).stem),
              'w') as fh:
        fh.write('\n'.join(output))
    if not rawFlag:
        os.remove(rawFile)
        os.remove(tmpInput)
def makeInput(self, inputFasta, outputDir, organism, incFeatures):
    """Merge per-method feature TSVs into one <stem>.input.tsv for prediction.

    For gram+ / gram- organisms a binary "Gram" column (1 / 0) is inserted
    after the ID; otherwise the table starts with the ID alone.  For the
    "psortb" method the first TWO columns of its TSV (ID and the categorical
    SubcellularLocation) are skipped; for every other method only the ID
    column is skipped.

    The original duplicated the whole merge loop three times (gram+, gram-,
    other); this version computes the Gram prefix once and merges once.
    """
    featureDir = os.path.join(outputDir, "_FEATURE")
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    org = organism.lower()
    if org in ["gram+", "g+"]:
        masterLabels = ["ID", "Gram"]
        masterData = {fastaID: [fastaID, "1"] for fastaID in sequenceIDs}
    elif org in ["gram-", "g-"]:
        masterLabels = ["ID", "Gram"]
        masterData = {fastaID: [fastaID, "0"] for fastaID in sequenceIDs}
    else:
        masterLabels = ["ID"]
        masterData = {fastaID: [fastaID] for fastaID in sequenceIDs}
    for method in incFeatures:
        tsvFile = os.path.join(
            featureDir, method.upper(),
            "%s.%s.tsv" % (Path(inputFasta).stem, method))
        # psortb's TSV carries a non-numeric location in column 2; drop it.
        start = 2 if method == "psortb" else 1
        with open(tsvFile) as fh:
            for (i, line) in enumerate(fh.read().splitlines()):
                tokens = line.split('\t')
                if i == 0:
                    masterLabels += tokens[start:]
                else:
                    masterData[tokens[0]] += tokens[start:]
    output = ["\t".join(masterLabels)]
    for fastaID in masterData.keys():
        output.append("\t".join(masterData[fastaID]))
    with open(os.path.join(outputDir, "%s.input.tsv" % Path(inputFasta).stem),
              'w') as fh:
        fh.write('\n'.join(output))
def _collect_descriptor(section, output, fastaID, features, firstSection):
    """Append one descriptor row to its per-descriptor table and to the
    combined output table.

    The first section (amino-acid composition) creates new rows in `output`;
    later sections extend the existing header (output[0]) and the row just
    added for this sequence (output[-1]).  Headers are emitted only once,
    when the section table is still empty.
    """
    keys = list(features.keys())
    row = [str(features[key]) for key in keys]
    if len(section) == 0:
        section.append('\t'.join(['ID'] + keys))
        if firstSection:
            output.append('\t'.join(['ID'] + keys))
        else:
            output[0] += '\t%s' % '\t'.join(keys)
    section.append('\t'.join([fastaID] + row))
    if firstSection:
        output.append('\t'.join([fastaID] + row))
    else:
        output[-1] += '\t%s' % '\t'.join(row)


def run_descriptor(inputFasta, outputDir, rawFlag):
    """Compute sequence descriptors (AA composition, CTD, quasi-sequence-order,
    autocorrelation) for every sequence and write <stem>.mdesc.tsv.

    When rawFlag is truthy, also writes the four per-descriptor TSVs
    (<stem>.aacomp/ctd/seqord/autocor.tsv) under outputDir/MDESC.

    Changes from the original: the fourfold-duplicated table-building logic
    is factored into _collect_descriptor; the `dict.update(d, ...)` calls are
    replaced with normal dict construction; a dead trailing loop that only
    rebound a local variable was removed; files are written via `with`.
    """
    rawOutput = os.path.join(outputDir, "MDESC")
    if not os.path.exists(rawOutput):
        os.mkdir(rawOutput)
    fasta = readfasta.readFasta(inputFasta, key="full")
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    aacomp = []
    ctd = []
    seqord = []
    autocor = []
    output = []
    for fastaID in sequenceIDs:
        Des = GetProDes(fasta[fastaID])
        _collect_descriptor(aacomp, output, fastaID, dict(Des.GetAAComp()),
                            True)
        _collect_descriptor(ctd, output, fastaID, Des.GetCTD(), False)
        _collect_descriptor(seqord, output, fastaID,
                            dict(Des.GetQSO(maxlag=MIN_PEPTIDE_LENGTH)), False)
        # Moreau-Broto and Geary autocorrelations are merged into one section.
        autocorFeatures = {}
        autocorFeatures.update(Des.GetMoreauBrotoAuto(maxlag=MIN_PEPTIDE_LENGTH))
        autocorFeatures.update(Des.GetGearyAuto(maxlag=MIN_PEPTIDE_LENGTH))
        _collect_descriptor(autocor, output, fastaID, autocorFeatures, False)
    if rawFlag:
        for (name, table) in [("aacomp", aacomp), ("ctd", ctd),
                              ("seqord", seqord), ("autocor", autocor)]:
            with open(
                    os.path.join(rawOutput, "%s.%s.tsv" %
                                 (Path(inputFasta).stem, name)), 'w') as fh:
                fh.write('\n'.join(table))
    with open(os.path.join(rawOutput, "%s.mdesc.tsv" % Path(inputFasta).stem),
              'w') as fh:
        fh.write('\n'.join(output))
def run_tmhmm(inputFasta, outputDir, multiFlag, process, rawFlag):
    """Run TMHMM and write <stem>.tmhmm.tsv under outputDir/TMHMM.

    When multiFlag is true and the input exceeds SPLIT_LIMIT sequences, the
    FASTA is split and the chunks are run in parallel, then recombined.
    Statistics are parsed from TMHMM's '#'-prefixed comment lines, keyed by
    the short sequence ID (token after '#').  Sequences with no TMHMM output
    get four empty fields.
    """
    command = TMHMM_PATH
    rawOutput = os.path.join(outputDir, "TMHMM")
    if not os.path.exists(rawOutput):
        os.mkdir(rawOutput)
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    rawFile = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
    # Evaluate the split condition once instead of twice.
    splitRun = (multiFlag.lower() in ['t', 'true']
                and len(sequenceIDs) > SPLIT_LIMIT)
    if splitRun:
        (inFiles, outFiles) = Feature.split_files(inputFasta, rawOutput)
    else:
        inFiles = [inputFasta]
        outFiles = [rawFile]
    clines = [
        "%s %s > %s" % (command, inFile, outFiles[f])
        for (f, inFile) in enumerate(inFiles)
    ]
    Feature.mp_run(clines, process)
    if splitRun:
        Feature.combine_and_cleanup_files(inFiles, outFiles, rawFile)
    # Map short IDs (first token of the description) to full descriptions.
    fasta = readfasta.readFasta(inputFasta, key="full", strip=False)
    sequenceIDs = {}
    for fastaID in fasta.keys():
        shortID = fastaID.strip().strip('>').split(' ')[0]
        sequenceIDs[shortID] = fastaID.strip().strip('>')
    values = {}
    with open(rawFile) as fh:
        for row in fh.read().split('\n'):
            if not row.startswith('#'):
                continue
            tokens = row.split(' ')
            if tokens[1] not in values:
                # [TMH count, exp AAs in TMHs, exp AAs in first 60, N-in prob]
                values[tokens[1]] = [""] * 4
            if "Number of predicted TMHs:" in row:
                values[tokens[1]][0] = tokens[-1].strip()
            if "Exp number of AAs in TMHs:" in row:
                values[tokens[1]][1] = tokens[-1].strip()
            if "Exp number, first 60 AAs:" in row:
                values[tokens[1]][2] = tokens[-1].strip()
            if "Total prob of N-in:" in row:
                values[tokens[1]][3] = tokens[-1].strip()
    output = [
        "ID\tPredicted_TMH#\tExp_AAs#\tExp_first_60_AAs#\tTotal_N-in_prob"
    ]
    for shortID in sequenceIDs:
        output.append('\t'.join([sequenceIDs[shortID]] +
                                values.get(shortID, [""] * 4)))
    with open(os.path.join(rawOutput, "%s.tmhmm.tsv" % Path(inputFasta).stem),
              'w') as fh:
        fh.write('\n'.join(output))
    if not rawFlag:
        os.remove(rawFile)
        # TMHMM leaves TMHMM_* working directories in the cwd; purge them
        # together with the raw output.  NOTE(review): the collapsed source
        # is ambiguous about whether this cleanup sat inside the rawFlag
        # guard — confirm against the original layout.
        for tmpFile in glob.glob(os.path.join(os.getcwd(), "TMHMM_*")):
            shutil.rmtree(tmpFile)
def run_signalp(inputFasta, outputDir, organism, multiFlag, process, rawFlag):
    """Run SignalP and write <stem>.signalp.tsv under outputDir/SIGNALP.

    organism selects gram+ or gram- mode; any other value raises ValueError
    (previously an unrecognized organism either left `cline` unbound or
    silently re-queued the previous chunk's command).  When multiFlag is
    true and the input exceeds SPLIT_LIMIT sequences, the FASTA is split and
    run in parallel.  Sequences without a prediction get a D-score of 0.000.
    """
    command = SIGNALP_PATH
    rawOutput = os.path.join(outputDir, "SIGNALP")
    if not os.path.exists(rawOutput):
        os.mkdir(rawOutput)
    sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
    rawFile = os.path.join(rawOutput, "%s.output" % Path(inputFasta).stem)
    if organism.lower() in ["gram+", "g+"]:
        gramOpt = "gram+"
    elif organism.lower() in ["gram-", "g-"]:
        gramOpt = "gram-"
    else:
        raise ValueError("Unsupported organism type: %s" % organism)
    splitRun = (multiFlag.lower() in ['t', 'true']
                and len(sequenceIDs) > SPLIT_LIMIT)
    if splitRun:
        (inFiles, outFiles) = Feature.split_files(inputFasta, rawOutput)
    else:
        inFiles = [inputFasta]
        outFiles = [rawFile]
    clines = [
        "%s -t %s %s > %s" % (command, gramOpt, inFile, outFiles[f])
        for (f, inFile) in enumerate(inFiles)
    ]
    Feature.mp_run(clines, process)
    if splitRun:
        Feature.combine_and_cleanup_files(inFiles, outFiles, rawFile)
    # Map short IDs (first token of the description) to full descriptions.
    fasta = readfasta.readFasta(inputFasta, key="full", strip=False)
    sequenceIDs = {}
    for fastaID in fasta.keys():
        shortID = fastaID.strip().strip('>').split(' ')[0]
        sequenceIDs[shortID] = fastaID.strip().strip('>')
    values = {}
    with open(rawFile) as fh:
        for row in fh.readlines():
            if row == '' or row.startswith('#'):
                continue
            tokens = re.split('[ ]+', row)
            if tokens[0] in values:
                # Typo "SingalP" fixed in the diagnostic message.
                print("Duplicate SignalP Result: %s" % tokens[0])
            values[tokens[0]] = tokens[8]  # column 9 holds the D-score
    output = ["ID\tSignalP_DScore"]
    for shortID in sequenceIDs:
        if shortID in values:
            output.append('\t'.join([sequenceIDs[shortID], values[shortID]]))
        else:
            output.append('\t'.join([sequenceIDs[shortID], '0.000']))
    with open(
            os.path.join(rawOutput, "%s.signalp.tsv" % Path(inputFasta).stem),
            'w') as fh:
        fh.write('\n'.join(output))
    if not rawFlag:
        os.remove(rawFile)
def train(self, positiveFasta, negativeFasta, outputDir, organism,
          incFeatures, multiFlag, process):
    """Train the XGBoost model on merged feature tables and persist artifacts.

    Builds one row per sequence from the per-method TSVs under
    outputDir/_FEATURE (label 0 = negativeFasta, 1 = positiveFasta; Gram
    column 1 for gram+, 0 for gram-), min-max scales the matrix, and runs a
    grid search over an mRMR-feature-selection + XGBClassifier pipeline.
    Saves Scaler.sav, VaxignML.sav and VaxignML.scores to outputDir.

    The original duplicated the whole feature-merge loop for the gram+ and
    gram- branches; this version computes the Gram value once and merges once.
    """
    featureDir = os.path.join(outputDir, "_FEATURE")
    masterLabels = ["ID", "Label", "Gram"]
    masterData = {}
    org = organism.lower()
    if org in ["gram+", "g+"]:
        gram = "1"
    elif org in ["gram-", "g-"]:
        gram = "0"
    else:
        # Preserved from the original: an unrecognized organism silently
        # loads no features (training then fails on the empty matrix).
        gram = None
    for (g, inputFasta) in enumerate([negativeFasta, positiveFasta]):
        sequenceIDs = readfasta.readFastaDesc(inputFasta, key="full")
        if gram is None:
            continue
        for fastaID in sequenceIDs:
            masterData[fastaID] = [fastaID, str(g), gram]
        for method in incFeatures:
            tsvFile = os.path.join(
                featureDir, method.upper(),
                "%s.%s.tsv" % (Path(inputFasta).stem, method))
            # psortb's TSV carries a non-numeric location in column 2; skip it.
            start = 2 if method == "psortb" else 1
            with open(tsvFile) as fh:
                for (i, line) in enumerate(fh.read().splitlines()):
                    tokens = line.split('\t')
                    if i == 0:
                        # NOTE(review): as in the original, header tokens are
                        # appended once per input FASTA, so labels end up
                        # duplicated — harmless here since masterLabels is
                        # never used after this point.
                        masterLabels += tokens[start:]
                    else:
                        masterData[tokens[0]] += tokens[start:]
    rows = []
    samples = []
    groups = []
    for fastaID in masterData.keys():
        tokens = masterData[fastaID]
        samples.append(fastaID)
        groups.append(int(tokens[1]))
        rows.append(tokens[2:])
    X = np.array(rows).astype(float)
    y = np.array(groups)
    # Fit the scaler on the full matrix and persist it for prediction time.
    scaler = MinMaxScaler(copy=False)
    scaler.fit(X)
    joblib.dump(scaler, os.path.join(outputDir, "Scaler.sav"))
    X = scaler.transform(X)
    est = XGBClassifier(objective='binary:logistic',
                        silent=True,
                        nthread=1,
                        eval_metric='auc',
                        random_state=26)
    estPipe = Pipeline([('feature_selection', SelectKBest(mRMR)),
                        ('classification', est)])
    grid = [{
        # At most 10 candidate k values, stepping by 20 features.
        "feature_selection__k": list(range(20, X.shape[1], 20))[:10],
        'classification__learning_rate': [0.3, 0.1],
        'classification__n_estimators': [60, 80, 100, 120, 140, 160],
        'classification__max_depth': [3, 6, 9],
        'classification__min_child_weight': [1, 3],
        'classification__scale_pos_weight': [1, 6],
        'classification__max_delta_step': [0, 3],
    }]
    cv = StratifiedShuffleSplit(n_splits=3, random_state=6)
    # Parallelize the grid search only when multiFlag requests it.
    nJobs = process if multiFlag.lower() in ['t', 'true'] else 1
    xgb = GridSearchCV(estimator=estPipe,
                       param_grid=grid,
                       cv=cv,
                       iid=False,
                       verbose=1,
                       n_jobs=nJobs)
    xgb.fit(X, y)
    joblib.dump(xgb, os.path.join(outputDir, "VaxignML.sav"))
    y_prob = xgb.predict_proba(X)
    joblib.dump(y_prob, os.path.join(outputDir, "VaxignML.scores"))