import os
import re
import sys
import time
import asyncio
import urllib.request
import urllib.error
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor

import requests
import plotly as py
import plotly.graph_objs as go
from Bio import SeqIO
from Bio.Blast.NCBIWWW import qblast

# Project-local helpers (createLogger, setLoggerLevel, logDebug, logInfo,
# logError, validateUrl, loadMatrixFromCSV, normalizeTables, customFisher,
# scipyFisher, chunks) are assumed to be imported from elsewhere in this
# repository.


def getFile(path, logger=None, tmp=None):
    """Resolve `path` to a local file: return it directly if it already
    exists on disk, otherwise treat it as a URL and download it into `tmp`."""
    ensureTmpIsPresent(tmp, logger=logger)
    logDebug(logger, "Getting file {}".format(path))
    if os.path.isfile(path):
        logDebug(logger, "Path is a valid file.")
        return path
    logDebug(logger, "Path is not a valid file path")
    # Fall back to treating the path as a URL.
    isValidUrl = True
    try:
        validateUrl(path)
    except Exception:
        isValidUrl = False
    if isValidUrl:
        logDebug(logger,
                 "Path looks like a valid URL; trying to download the file")
        downloadPath = os.path.join(tmp, "download.fasta")
        logInfo(logger, "Downloading file {} from remote source".format(path))
        urllib.request.urlretrieve(path, downloadPath)
        return downloadPath
    logError(
        logger,
        "The given path {} is neither a valid URL nor a local file.".format(
            path))
    sys.exit(1)
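

# `validateUrl` above is a project-local helper that is only assumed here to
# raise on malformed input; its real implementation is not shown in this
# section. A minimal sketch of that contract, using only the standard library:
def validateUrlSketch(url):
    from urllib.parse import urlparse
    parsed = urlparse(url)
    # Require both a scheme urlretrieve can handle and a network location.
    if parsed.scheme not in ("http", "https", "ftp") or not parsed.netloc:
        raise ValueError("Not a valid URL: {}".format(url))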


def makeHMMScanRequest(
        seq,
        seqID=None,
        logger=None,
        hmmUrl="https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan",
        queryParams=None,
        queryHeaders=None):
    """POST a single sequence to the EBI HMMScan service and return the
    parsed XML response (wrapped in a dict together with `seqID` if one
    was given)."""
    # Avoid mutable default arguments.
    queryParams = queryParams or {}
    queryHeaders = queryHeaders or {}
    if seqID:
        logDebug(logger, "Doing HMMScan on sequence \"{}\"...".format(seqID))
    defaultParams = {
        "hmmdb": (None, "pfam"),
        "seq": (None, seq),
        "threshold": (None, "cut_ga"),
    }
    defaultHeaders = {
        "Accept": "text/xml",
        "User-Agent": "Mozilla/5.0",
    }
    s = requests.Session()
    r = requests.Request(
        'POST',
        hmmUrl,
        files={**defaultParams, **queryParams},
        headers={**defaultHeaders, **queryHeaders}).prepare()
    answer = s.send(r)
    answerContent = answer.content.decode("utf-8")
    if seqID:
        logDebug(logger, "Done HMMScan on sequence \"{}\"".format(seqID))
    # The service sometimes emits malformed tags such as <123 H=.../>;
    # strip them before parsing.
    if answerContent:
        answerContent = re.sub(r'<\d+ H=.*\/>', '', answerContent)
    try:
        xmlTree = ET.fromstring(answerContent)
    except ET.ParseError:
        logError(
            logger,
            "Failed HMMScan on sequence \"{}\" - invalid response".format(
                seqID))
        logError(logger, "xml [{}]".format(answerContent))
        raise Exception("XML parse failed")
    if seqID:
        return {"seqID": seqID, "result": xmlTree}
    return xmlTree
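

# Usage sketch for makeHMMScanRequest: the sequence below is a placeholder,
# and simply dumping every element is intentional -- inspect the XML actually
# returned by the EBI service before relying on specific tag names.
def exampleHMMScan():
    scan = makeHMMScanRequest("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
                              seqID="demo")
    # With a seqID the result is wrapped in a dict together with that ID.
    tree = scan["result"]
    for element in tree.iter():
        print(element.tag, element.attrib)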


def ensureTmpIsPresent(tmp, logger=None):
    """Create the temporary directory if it is missing; otherwise remove
    any regular files left inside it from a previous run."""
    if not os.path.exists(tmp):
        os.makedirs(tmp)
    else:
        logInfo(logger, "Clearing temporary download directory")
        for the_file in os.listdir(tmp):
            file_path = os.path.join(tmp, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                logError(logger, e)
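

# Usage sketch for ensureTmpIsPresent ("tmp_demo" is a placeholder path):
# the first call creates the directory, the second clears any regular files
# left inside it; subdirectories are not touched.
def exampleEnsureTmp():
    ensureTmpIsPresent("tmp_demo")  # creates tmp_demo/
    open(os.path.join("tmp_demo", "stale.fasta"), "w").close()
    ensureTmpIsPresent("tmp_demo")  # removes stale.fasta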


def main(logginglevel, input, tmp):
    logger = createLogger(__file__)
    logger = setLoggerLevel(logger, logginglevel)
    if len(input) != 2:
        logError(
            logger,
            "Invalid number of input files. Please provide exactly two input files."
        )
        sys.exit(1)
    inputA = input[0]
    inputB = input[1]
    logInfo(logger, "Fisher test of {} compared to {}".format(inputA, inputB))
    # Load each input into its corresponding table.
    tableA = loadMatrixFromCSV(inputA, logger=logger)
    tableB = loadMatrixFromCSV(inputB, logger=logger)
    (domainNames, tables) = normalizeTables([tableA, tableB], logger=logger)
    domainPsCustom = [
        round(v, 2)
        for v in customFisher(domainNames, tables[0], tables[1], logger=logger)
    ]
    domainPsFisher = [
        round(v, 2)
        for v in scipyFisher(domainNames, tables[0], tables[1], logger=logger)
    ]
    # One row per domain: [name, custom p-value, SciPy p-value], sorted by
    # the custom p-value.
    results = [[r[0], r[1], r[2]]
               for r in zip(domainNames, domainPsCustom, domainPsFisher)]
    results.sort(key=lambda x: x[1])
    # Transpose rows into columns, which is the layout go.Table expects.
    resultsT = [[row[i] for row in results] for i in range(3)]
    table = go.Table(
        header=dict(values=['Domain', 'Custom Fisher', 'SciPy Fisher']),
        cells=dict(values=resultsT))
    data = [table]
    py.offline.plot({"data": data}, auto_open=False)
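

# `customFisher` and `scipyFisher` are project-local and not shown in this
# section. A minimal sketch of what a per-domain Fisher exact test could look
# like, assuming each table maps a domain name to a (count, total) pair; the
# real helpers may use a different table layout:
def scipyFisherSketch(domainNames, tableA, tableB):
    from scipy.stats import fisher_exact
    pValues = []
    for name in domainNames:
        countA, totalA = tableA[name]
        countB, totalB = tableB[name]
        # 2x2 contingency table: domain present vs. absent in each dataset.
        contingency = [[countA, totalA - countA], [countB, totalB - countB]]
        _, p = fisher_exact(contingency)
        pValues.append(p)
    return pValues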


def entrezRetrieveSequence(
        accessionIDs,
        tmp='.',
        entrezUrl='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        dbName='protein',
        retries=5,
        logger=None):
    """Fetch FASTA records for a batch of accession IDs from NCBI Entrez,
    retrying failed HTTP requests, and return the parsed SeqIO records."""
    if not accessionIDs:
        return []
    downloadedFilePath = os.path.join(
        tmp, "entrez.{}.{}.fasta".format(
            accessionIDs[0] + "_batch__" + str(len(accessionIDs)), dbName))
    retryNo = 0
    while retryNo < retries:
        retryNo = retryNo + 1
        try:
            urllib.request.urlretrieve(
                '{}?db={}&id={}&rettype=fasta&retmode=text'.format(
                    entrezUrl, dbName, ",".join(accessionIDs)),
                downloadedFilePath)
            break
        except urllib.error.HTTPError as err:
            if retryNo < retries:
                logError(logger, "Request failed, retrying in 5 seconds...")
                time.sleep(5)
            else:
                # Out of retries: surface the last HTTP error.
                raise err
    ret = []
    with open(downloadedFilePath, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            ret.append(record)
    return ret
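

# `chunks` (used by main below) is a project-local helper not shown in this
# section. A minimal sketch of the behaviour its caller relies on -- yielding
# consecutive slices of at most `size` items:
def chunksSketch(items, size):
    for i in range(0, len(items), size):
        yield items[i:i + size]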


def main(logginglevel, input, tmp, idthreshold, evalue):
    logger = createLogger(__file__)
    logger = setLoggerLevel(logger, logginglevel)
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        inputFiles = [
            getFile(inputFile, logger=logger, tmp=tmp) for inputFile in input
        ]
        outputFiles = []
        for file in inputFiles:
            fileOut = file.replace(".fasta", "_ext.fasta")
            logInfo(logger, "Extending file {} to {}".format(file, fileOut))
            # Build a single multi-sequence FASTA query string.
            seqs = []
            with open(file, "r") as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    seqs.append(">{}\n{}".format(record.id, str(record.seq)))
            queryStr = "\n".join(seqs)
            logInfo(logger, "Requesting web BLASTP search")
            requestOut = qblast(
                "blastp",
                "nr_v5",
                queryStr,
                expect=float(evalue),
                perc_ident=float(idthreshold)).getvalue()
            root = ET.fromstring(requestOut)
            accessionIDs = []
            hits = root.findall(
                "./BlastOutput_iterations/Iteration/Iteration_hits/Hit")
            logInfo(logger,
                    "Got {} hits for the given query".format(len(hits)))
            if len(hits) <= 0:
                logInfo(logger, requestOut)
                raise Exception('No hits found')
            # Collect the hit's accession once per HSP of that hit.
            for hit in hits:
                hsps = hit.findall("./Hit_hsps/Hsp/Hsp_qseq")
                for _ in hsps:
                    accessionID = hit.find("./Hit_accession").text
                    accessionIDs.append(accessionID)
            # Split the accession list into a few batches and fetch them from
            # Entrez concurrently on the thread pool.
            gatherTasks = []
            requestsLimit = 3
            chunkSize = int(len(accessionIDs) / requestsLimit) + 1
            accessionChunks = list(chunks(accessionIDs, chunkSize))
            logDebug(
                logger,
                "Chunked request; will query Entrez with {} batches of size {}."
                .format(len(accessionChunks), chunkSize))
            for accessionChunk in accessionChunks:
                gatherTasks.append(
                    loop.run_in_executor(executor, entrezRetrieveSequence,
                                         accessionChunk, tmp))
            gatherFuture = asyncio.ensure_future(asyncio.gather(*gatherTasks))
            loop.run_until_complete(gatherFuture)
            recordsOut = []
            for records in gatherFuture.result():
                for record in records:
                    recordsOut.append(record.format("fasta"))
            recordCountOut = len(recordsOut)
            logDebug(logger, "Writing to file {}".format(fileOut))
            with open(fileOut, "w") as fileOutHandle:
                fileOutHandle.write("\n".join(recordsOut))
            # Sanity check: re-read the written file and compare record counts.
            with open(fileOut, "r") as handle:
                recordCount = 0
                for record in SeqIO.parse(handle, "fasta"):
                    recordCount = recordCount + 1
            if recordCount != recordCountOut:
                logError(
                    logger,
                    "Record count mismatch after writing output file: {} records "
                    "are present in {} but there should be {}.".format(
                        recordCount, fileOut, recordCountOut))
            outputFiles.append(fileOut)
    logInfo(
        logger,
        "Wrote {} files in total; printing their paths to stdout as a "
        "plaintext list".format(len(outputFiles)))
    print("\n".join(outputFiles))
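

# Usage sketch for the BLAST-extend entry point above; the input file name,
# thresholds, logging level, and tmp directory are all placeholder values:
def exampleBlastExtend():
    main(logginglevel="DEBUG",
         input=["proteins.fasta"],
         tmp="tmp",
         idthreshold="90",
         evalue="0.001")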