def indexing(self):
    """Build the HISAT2 index for the target named by ``self.titleStr``.

    Reads the ``command`` template from the ``binHISAT2-BUILD`` config and the
    ``checkFolder``, ``seqPath``, ``indexHeader`` and ``thread`` entries from
    the target config (both under ``config/``), stores them on ``self``,
    creates the check folder and hands the formatted command to a
    ``libPrint.timer`` whose log goes under ``log/``.
    """

    def loadStoreDict(queryStr):
        # Load one config entry in UPDATE mode and return its dictionary.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config.storeDict

    # ---- Parameter for Indexing ----
    self.commandStr = loadStoreDict("binHISAT2-BUILD")["command"]

    # ---- Initialization for Indexing ----
    targetDict = loadStoreDict(self.titleStr)
    self.folderStr = targetDict["checkFolder"]
    self.seqPathStr = targetDict["seqPath"]
    self.indexHeaderStr = targetDict["indexHeader"]
    self.threadStr = targetDict["thread"]
    # A missing "testing" entry counts as testing mode.
    self.testingBool = bool(targetDict.get("testing", True))

    # ---- Action ----
    pathlib.Path(self.folderStr).mkdir(parents=True, exist_ok=True)

    Log = libPrint.timer()
    Log.logFilenameStr = "02-hisat2-index-{title}".format(
        title=self.titleStr)
    Log.folderStr = "log/"
    Log.testingBool = self.testingBool
    Log.startLog()

    Log.phraseStr = self.commandStr.format(
        seqPath=self.seqPathStr,
        indexHeader=self.indexHeaderStr,
        thread=self.threadStr,
    )
    Log.runCommand()
    Log.stopLog()
# NOTE(review): this loop is a fragment -- the definition that binds
# branchList, methodStr, annotateStr, trimStr, omicStr, noteStr and the
# *PathStr templates is not visible here, and the loop body appears to
# continue past the last statement shown. TODO confirm against the full file.
for branchStr in branchList:
    # Values substituted into every path/log template for this branch.
    infoDict = {
        "branch": branchStr,
        "method": methodStr,
        "annotate": annotateStr,
        "trim": trimStr,
        "omic": omicStr,
        "note": noteStr
    }
    sourcePathStr = sourceFilePathStr.format(**infoDict)
    resultPathStr = resultFilePathStr.format(**infoDict)
    logFolderPath = logFolderPathStr.format(**infoDict)
    logPathStr = logFilePathStr.format(**infoDict)
    # Make sure the log folder exists before the timer starts logging.
    pathlib.Path(logFolderPath).mkdir(parents=True, exist_ok=True)
    Print = libPrint.timer()
    Print.logFilenameStr = logPathStr
    Print.folderStr = logFolderPath
    Print.testingBool = False
    Print.startLog()
    Print.printing(
        "branch = {branch}\nmethod = {method}\nannotate = {annotate}\ntrim = {trim}\ntype = {omic}"
        .format(**infoDict))
    Print.printing("[SQL-load] open expression database")
    # Open the expression database and select every row of Expression.
    # NOTE(review): no Connect.close() is visible in this fragment -- confirm
    # the connection is closed further down.
    Connect = sqlite3.connect(sourcePathStr)
    Cursor = Connect.cursor()
    selectStr = "SELECT * FROM Expression"
    expExc = Cursor.execute(selectStr)
    Print.printing("[Compare] adjust combination")
def trimming(self):
    """Run the Trimmomatic command for every trim condition of the experiment.

    The command template comes from the ``binTrimmomatic`` config and the
    experiment layout from the ``self.queryStr`` config. For each entry of
    ``conditionsList`` a log ``log/03-trim-<branch>-<cond>`` is opened and one
    command per group/replication is passed to ``Print.runCommand()``:
    paired-end runs list all inputs followed by the paired/unpaired outputs of
    each direction, single-end runs list one input and one output.
    """

    def loadStoreDict(queryStr):
        # Load one config entry in UPDATE mode and return its dictionary.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config.storeDict

    # ---- Parameter ----
    binDict = loadStoreDict("binTrimmomatic")
    expDict = loadStoreDict(self.queryStr)

    # ---- Initialization ----
    commandStr = binDict["command"]
    conditionList = expDict.get("conditionsList", [])
    groupList = expDict.get("group", [])
    replicationList = expDict.get("replication", [])
    directionList = expDict.get("direction", [])
    branchStr = expDict.get("branch", "")
    pairStr = expDict.get("pairPostfix", "")
    unpairStr = expDict.get("unpairPostfix", "")
    modeStr = expDict.get("mode", "")
    inputFileNameStr = expDict.get("[trim]inputFileName", "")
    outputFileNameStr = expDict.get("[trim]outputFileName", "")
    fileTypeStr = expDict.get("[trim]fileType", "")
    checkFolderList = expDict.get("checkFolder", [])
    # A missing "testing" entry counts as testing mode.
    testingBool = bool(expDict.get("testing", True))

    # ---- Action ----
    for checkFolderStr in checkFolderList:
        pathlib.Path(checkFolderStr).mkdir(parents=True, exist_ok=True)

    # Only a non-empty, genuine list of conditions is processed.
    if type(conditionList) is not list or not conditionList:
        return

    for conditionDict in conditionList:
        conditionStr = conditionDict['trim']

        Log = libPrint.timer()
        Log.logFilenameStr = "03-trim-{branch}-{cond}".format(
            branch=branchStr, cond=conditionStr)
        Log.folderStr = "log/"
        Log.testingBool = testingBool
        Log.startLog()

        trimDict = loadStoreDict(conditionStr)
        headerStr = trimDict.get('header', "")

        for groupStr in groupList:
            for replicationStr in replicationList:
                if modeStr == "pairEnd":
                    inputList = []
                    outputList = []
                    for directionStr in directionList:
                        inputList.append(
                            inputFileNameStr.format(
                                group=groupStr,
                                replication=replicationStr,
                                direction=directionStr,
                                fileType=fileTypeStr))
                        # Paired output first, then unpaired, per direction.
                        for pairTypeStr in (pairStr, unpairStr):
                            outputList.append(
                                outputFileNameStr.format(
                                    condition=headerStr,
                                    direction=directionStr,
                                    group=groupStr,
                                    replication=replicationStr,
                                    pairType=pairTypeStr,
                                    fileType=fileTypeStr,
                                ))
                    fileStr = " ".join(inputList + outputList)
                    modeFlagStr = "PE"
                elif modeStr == "singleEnd":
                    singleInputStr = inputFileNameStr.format(
                        group=groupStr,
                        replication=replicationStr,
                        fileType=fileTypeStr)
                    singleOutputStr = outputFileNameStr.format(
                        condition=headerStr,
                        group=groupStr,
                        replication=replicationStr,
                        fileType=fileTypeStr,
                    )
                    fileStr = "{} {}".format(singleInputStr, singleOutputStr)
                    modeFlagStr = "SE"
                else:
                    # Any other mode produces no command.
                    continue

                # Condition parameters first, then the per-run overrides.
                commandDict = dict(trimDict)
                commandDict['files'] = fileStr
                commandDict['mode'] = modeFlagStr
                Log.phraseStr = commandStr.format(**commandDict)
                Log.runCommand()

        Log.stopLog()
def aligning(self):
    """Align reads and produce sorted BAM files for every condition.

    Command templates are read from the ``binHISAT2-RUN``,
    ``binSAMtools-CONVERT``, ``binSAMtools-SORT`` and ``commandRM`` configs;
    the experiment layout comes from the ``self.queryStr`` config. For each
    group/replication the steps are skipped or run depending on which of the
    SAM, BAM and sorted-BAM files already exist on disk, and intermediate
    files are removed once their successor exists.
    """
    # ---- Command templates ----
    BinHISAT2 = libConfig.config()
    BinHISAT2.queryStr = "binHISAT2-RUN"
    BinHISAT2.folderStr = "config/"
    BinHISAT2.modeStr = "UPDATE"
    BinHISAT2.load()
    SAMconvert = libConfig.config()
    SAMconvert.queryStr = "binSAMtools-CONVERT"
    SAMconvert.folderStr = "config/"
    SAMconvert.modeStr = "UPDATE"
    SAMconvert.load()
    SAMsort = libConfig.config()
    SAMsort.queryStr = "binSAMtools-SORT"
    SAMsort.folderStr = "config/"
    SAMsort.modeStr = "UPDATE"
    SAMsort.load()
    Remove = libConfig.config()
    Remove.queryStr = "commandRM"
    Remove.folderStr = "config/"
    Remove.modeStr = "UPDATE"
    Remove.load()
    # ---- Experiment layout ----
    expRep = libConfig.config()
    expRep.queryStr = self.queryStr
    expRep.folderStr = "config/"
    expRep.modeStr = "UPDATE"
    expRep.load()
    branchStr = expRep.storeDict["branch"]
    pairPostfixStr = expRep.storeDict["pairPostfix"]
    unpairPostfixStr = expRep.storeDict["unpairPostfix"]
    groupList = expRep.storeDict["group"]
    modeStr = expRep.storeDict["mode"]
    replicationList = expRep.storeDict["replication"]
    conditionList = expRep.storeDict["conditionsList"]
    for conditionDict in conditionList:
        annotateConditionStr = conditionDict["genome"]
        trimConditionStr = conditionDict["trim"]
        hisat2ConditionStr = conditionDict["map"]
        directionDict = expRep.storeDict["[hisat2]direction"]
        fileTypeStr = expRep.storeDict["[trim]fileType"]
        inputFileNameStr = expRep.storeDict["[hisat2]inputFileName"]
        outputFolderStr = expRep.storeDict["[hisat2]outputFolder"]
        outputFileNameStr = expRep.storeDict["[hisat2]outputFileName"]
        # A missing "testing" entry counts as testing mode.
        if not expRep.storeDict.get("testing", True):
            self.testingBool = False
        else:
            self.testingBool = True
        finalOutputFolderStr = outputFolderStr.format(
            annotate=annotateConditionStr, trim=trimConditionStr)
        pathlib.Path(finalOutputFolderStr).mkdir(parents=True,
                                                 exist_ok=True)
        # One log per (branch, mapping, annotation, trim) combination.
        Print = libPrint.timer()
        Print.logFilenameStr = "04-hs1-hisat2-{branch}-{hisat2cond}-{annotateCon}-{trimCon}".format(
            branch=branchStr,
            hisat2cond=hisat2ConditionStr,
            annotateCon=annotateConditionStr,
            trimCon=trimConditionStr,
        )
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()
        for groupStr in groupList:
            for replicationStr in replicationList:
                # finalDict collects every placeholder the command
                # templates may use, starting from the mapping parameters.
                finalDict = dict()
                Para = libConfig.config()  #parameters
                Para.queryStr = hisat2ConditionStr
                Para.folderStr = "config/"
                Para.modeStr = "UPDATE"
                Para.load()
                finalDict.update(Para.storeDict)
                Spec = libConfig.config()  #parameters
                Spec.queryStr = annotateConditionStr
                Spec.folderStr = "config/"
                Spec.modeStr = "UPDATE"
                Spec.load()
                finalDict.update(
                    {"indexHeader": Spec.storeDict["indexHeader"]})
                # Placeholder sets for the input FASTQ file names; only
                # the set matching modeStr is defined.
                if modeStr == "pairEnd":
                    pairForwardDict = {
                        "trim": trimConditionStr,
                        "group": groupStr,
                        "replication": replicationStr,
                        "direction": directionDict['1'],
                        "pairType": pairPostfixStr,
                        "fileType": fileTypeStr,
                    }
                    pairReverseDict = {
                        "trim": trimConditionStr,
                        "group": groupStr,
                        "replication": replicationStr,
                        "direction": directionDict['2'],
                        "pairType": pairPostfixStr,
                        "fileType": fileTypeStr,
                    }
                    unpairForwardDict = {
                        "trim": trimConditionStr,
                        "group": groupStr,
                        "replication": replicationStr,
                        "direction": directionDict['1'],
                        "pairType": unpairPostfixStr,
                        "fileType": fileTypeStr,
                    }
                    unpairReverseDict = {
                        "trim": trimConditionStr,
                        "group": groupStr,
                        "replication": replicationStr,
                        "direction": directionDict['2'],
                        "pairType": unpairPostfixStr,
                        "fileType": fileTypeStr,
                    }
                elif modeStr == "singleEnd":
                    unpairDict = {
                        "trim": trimConditionStr,
                        "group": groupStr,
                        "replication": replicationStr,
                        "fileType": fileTypeStr,
                    }
                # The three output names differ only in their fileType.
                samDict = {
                    "annotate": annotateConditionStr,
                    "trim": trimConditionStr,
                    "hisat2Condition": hisat2ConditionStr,
                    "group": groupStr,
                    "replication": replicationStr,
                    "fileType": ".sam",
                }
                samFileStr = outputFileNameStr.format(**samDict)
                bamDict = {
                    "annotate": annotateConditionStr,
                    "trim": trimConditionStr,
                    "hisat2Condition": hisat2ConditionStr,
                    "group": groupStr,
                    "replication": replicationStr,
                    "fileType": ".bam",
                }
                bamFileStr = outputFileNameStr.format(**bamDict)
                sortedBAMDict = {
                    "annotate": annotateConditionStr,
                    "trim": trimConditionStr,
                    "hisat2Condition": hisat2ConditionStr,
                    "group": groupStr,
                    "replication": replicationStr,
                    "fileType": "-sorted.bam",
                }
                sortedBAMFileStr = outputFileNameStr.format(
                    **sortedBAMDict)
                # Step 1: align, unless a SAM, BAM or sorted BAM exists.
                if pathlib.Path(samFileStr).exists():
                    Print.phraseStr = "SAM File existed: " + samFileStr
                    Print.printTimeStamp()
                elif not pathlib.Path(samFileStr).exists(
                ) and not pathlib.Path(bamFileStr).exists(
                ) and not pathlib.Path(sortedBAMFileStr).exists():
                    if modeStr == "pairEnd":
                        commandStr = BinHISAT2.storeDict.get(
                            "command-PE", "")
                        finalDict.update({
                            "pairForwardFASTQ":
                            inputFileNameStr.format(**pairForwardDict),
                            "pairReverseFASTQ":
                            inputFileNameStr.format(**pairReverseDict),
                            "unpairForwardFASTQ":
                            inputFileNameStr.format(**unpairForwardDict),
                            "unpairReverseFASTQ":
                            inputFileNameStr.format(**unpairReverseDict),
                            "outputSAM":
                            samFileStr
                        })
                        finalCommandStr = commandStr.format(**finalDict)
                        Print.phraseStr = finalCommandStr
                        Print.runCommand()
                    elif modeStr == "singleEnd":
                        commandStr = BinHISAT2.storeDict.get(
                            "command-SE", "")
                        finalDict.update({
                            "unpairFASTQ":
                            inputFileNameStr.format(**unpairDict),
                            "outputSAM":
                            samFileStr
                        })
                        finalCommandStr = commandStr.format(**finalDict)
                        Print.phraseStr = finalCommandStr
                        Print.runCommand()
                # Step 2: convert SAM to BAM, unless a BAM or sorted BAM
                # already exists.
                if pathlib.Path(bamFileStr).exists():
                    Print.phraseStr = "BAM File existed: " + bamFileStr
                    Print.printTimeStamp()
                elif not pathlib.Path(bamFileStr).exists(
                ) and not pathlib.Path(sortedBAMFileStr).exists():
                    commandStr = SAMconvert.storeDict.get("command", "")
                    finalDict.update({
                        "outputBAM": bamFileStr,
                        "inputSAM": samFileStr,
                    })
                    finalCommandStr = commandStr.format(**finalDict)
                    Print.phraseStr = finalCommandStr
                    Print.runCommand()
                # The SAM file is removed only once the BAM file exists.
                if pathlib.Path(samFileStr).exists() and pathlib.Path(
                        bamFileStr).exists():
                    commandStr = Remove.storeDict.get("command", "")
                    finalCommandStr = commandStr.format(target=samFileStr)
                    Print.phraseStr = finalCommandStr
                    Print.runCommand()
                # Step 3: sort the BAM, unless the sorted BAM exists.
                if pathlib.Path(sortedBAMFileStr).exists():
                    Print.phraseStr = "Sorted BAM File existed: " + sortedBAMFileStr
                    Print.printTimeStamp()
                else:
                    commandStr = SAMsort.storeDict.get("command", "")
                    finalDict.update({
                        "outputBAM": sortedBAMFileStr,
                        "inputBAM": bamFileStr,
                    })
                    finalCommandStr = commandStr.format(**finalDict)
                    Print.phraseStr = finalCommandStr
                    Print.runCommand()
                # The unsorted BAM is removed only once the sorted one exists.
                if pathlib.Path(bamFileStr).exists() and pathlib.Path(
                        sortedBAMFileStr).exists():
                    commandStr = Remove.storeDict.get("command", "")
                    finalCommandStr = commandStr.format(target=bamFileStr)
                    Print.phraseStr = finalCommandStr
                    Print.runCommand()
        Print.stopLog()
def summaring(self):
    """Run the flagstat command on every existing sorted BAM file.

    The command template comes from the ``binSAMtools-FLAGSTAT`` config and
    the experiment layout from the ``self.branchStr`` config. For each
    trim/annotation/mapping combination a log
    ``log/04-hs2-hisat2-<map>-<annotate>-<trim>`` is opened and the command is
    issued for each group/replication whose ``-sorted.bam`` file exists.
    """

    def loadStoreDict(queryStr):
        # Load one config entry in UPDATE mode and return its dictionary.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config.storeDict

    flagstatDict = loadStoreDict("binSAMtools-FLAGSTAT")
    expDict = loadStoreDict(self.branchStr)

    trimConditionList = expDict.get("[trim]condition", [])
    hisat2ConditionList = expDict.get("[hisat2]Condition", [])
    annotateConditionList = expDict.get("conditionList", [])
    groupList = expDict.get("group", [])
    replicationList = expDict.get("replication", [])
    outputFolderStr = expDict.get("[hisat2]outputFolder", "")
    outputFileNameStr = expDict.get("[hisat2]outputFileName", "")
    # A missing "testing" entry counts as testing mode.
    self.testingBool = bool(expDict.get("testing", True))

    flagstatCommandStr = flagstatDict.get("command", "")

    for trimStr in trimConditionList:
        for conditionEntry in annotateConditionList:
            # The annotation name is the first item of each condition entry.
            annotateStr = conditionEntry[0]
            folderPathStr = outputFolderStr.format(
                annotateCondition=annotateStr, trimCondition=trimStr)
            pathlib.Path(folderPathStr).mkdir(parents=True, exist_ok=True)

            for hisat2Str in hisat2ConditionList:
                Log = libPrint.timer()
                Log.logFilenameStr = "04-hs2-hisat2-{hisat2cond}-{annotateCon}-{trimCon}".format(
                    hisat2cond=hisat2Str,
                    annotateCon=annotateStr,
                    trimCon=trimStr,
                )
                Log.folderStr = "log/"
                Log.testingBool = self.testingBool
                Log.startLog()

                for groupStr in groupList:
                    for replicationStr in replicationList:
                        sortedPathStr = outputFileNameStr.format(
                            annotateCondition=annotateStr,
                            trimCondition=trimStr,
                            hisat2Condition=hisat2Str,
                            group=groupStr,
                            replication=replicationStr,
                            fileType="-sorted.bam",
                        )
                        # Files that were never produced are skipped silently.
                        if not pathlib.Path(sortedPathStr).exists():
                            continue
                        Log.phraseStr = flagstatCommandStr.format(
                            BAMfile=sortedPathStr)
                        Log.runCommand()

                Log.stopLog()
def estimating(self):
    """Issue the StringTie estimation command for every sample and condition.

    The command template comes from the ``binStringTie-ESTIMATE`` config; file
    name templates are the ``[<self.headerStr>]...`` entries of the
    ``self.branchStr`` config. For each ``(annotation, trim)`` pair a log
    ``log/05-<header>-estimating-<branch>-<annotate>-<trim>`` is opened, the
    ballgown folder of each group/replication is created and the formatted
    command is passed to ``Print.runCommand()``. When
    ``self.directEstimating`` is true the annotation's ``antPath`` is used as
    ``mergePath``, otherwise the merged file name.
    """

    def loadStoreDict(queryStr):
        # Load one config entry in UPDATE mode and return its dictionary.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config.storeDict

    # ---- Parameter for Assembling ----
    commandStr = loadStoreDict("binStringTie-ESTIMATE")["command"]

    # ---- Initialization for Assembling ----
    targetDict = loadStoreDict(self.branchStr)

    def headerEntry(nameStr):
        # Entries are namespaced as "[<header>]<name>"; absent ones read "".
        return targetDict.get("[{}]{}".format(self.headerStr, nameStr), "")

    branchStr = targetDict.get("branch", "")
    groupList = targetDict.get("group", [])
    replicationList = targetDict.get("replication", [])
    hisat2ConditionStr = targetDict.get("[hisat2]Condition", "")
    conditionList = targetDict.get("conditionList", [])
    inputTemplateStr = headerEntry("inputFileName")
    mergedTemplateStr = headerEntry("mergedFileName")
    ballgownTemplateStr = headerEntry("ballgownFolder")
    gtfTemplateStr = headerEntry("gtfFileName")
    tsvTemplateStr = headerEntry("tsvFileName")
    # A missing "testing" entry counts as testing mode.
    self.testingBool = bool(targetDict.get("testing", True))

    for conditionTup in conditionList:
        antCondStr = conditionTup[0]
        trimCondStr = conditionTup[1]

        annotateDict = loadStoreDict(antCondStr)
        threadStr = annotateDict.get("thread", "")
        antPathStr = annotateDict.get("antPath", "")

        # ---- Action ----
        Log = libPrint.timer()
        Log.logFilenameStr = "05-{stringtie}-estimating-{branch}-{annotate}-{trim}".format(
            stringtie=self.headerStr,
            branch=branchStr,
            annotate=antCondStr,
            trim=trimCondStr,
        )
        Log.folderStr = "log/"
        Log.testingBool = self.testingBool
        Log.startLog()

        for groupStr in groupList:
            for repliStr in replicationList:
                # Placeholders shared by the per-sample templates.
                sampleDict = {
                    "annotateCondition": antCondStr,
                    "trimCondition": trimCondStr,
                    "group": groupStr,
                    "replication": repliStr,
                }
                ballgownPathStr = ballgownTemplateStr.format(**sampleDict)
                pathlib.Path(ballgownPathStr).mkdir(parents=True,
                                                    exist_ok=True)
                bamPathStr = inputTemplateStr.format(
                    hisat2Condition=hisat2ConditionStr, **sampleDict)
                mergedPathStr = mergedTemplateStr.format(
                    annotateCondition=antCondStr,
                    trimCondition=trimCondStr)
                gtfPathStr = gtfTemplateStr.format(**sampleDict)
                tsvPathStr = tsvTemplateStr.format(**sampleDict)

                if self.directEstimating:
                    mergePathStr = antPathStr
                else:
                    mergePathStr = mergedPathStr

                Log.phraseStr = commandStr.format(
                    thread=threadStr,
                    mergePath=mergePathStr,
                    bamfile=bamPathStr,
                    ballgownPath=ballgownPathStr,
                    gtffile=gtfPathStr,
                    tsvfile=tsvPathStr)
                Log.runCommand()

        Log.stopLog()
def importingStringtie(self):
    """Import per-sample StringTie expression tables into SQLite and summarise them.

    Layout and path templates come from the ``self.branchStr`` config. For
    every method in ``methodList`` and every ``(annotation, trim)`` pair in
    ``conditionList``:

    1. each sample's gene and transcript TSV is read with pandas and handed to
       ``self.exportingExpression`` through ``self.expressionInputDict``
       (one ``GeneExpression_<sample>`` / ``TranscriptExpression_<sample>``
       table per sample);
    2. ``GeneExpressionSummary`` (FPKM and TPM per sample) and
       ``TranscriptExpressionSummary`` (FPKM per sample) are built, keyed by
       the rows of the control sample's table.

    Raises:
        KeyError: if a sample table contains a UUID that the control sample's
            table does not, or a control row has no value for some sample.
    """
    # ---- Initialization for Converting ----
    Target = libConfig.config()
    Target.queryStr = self.branchStr
    Target.folderStr = "config/"
    Target.modeStr = "UPDATE"
    Target.load()
    storeDict = Target.storeDict

    branchStr = self.branchStr
    # Table and column names use "_" where sample names use "-".
    controlSafeStr = storeDict.get("controlSample", "").replace("-", "_")
    groupList = storeDict.get("group", [])
    replicationList = storeDict.get("replication", [])
    patternStr = storeDict.get("samplePattern", "")
    sampleList = [
        patternStr.format(group=groupStr, replication=replicationStr)
        for groupStr in groupList for replicationStr in replicationList
    ]
    sampleSafeList = [sampleStr.replace("-", "_") for sampleStr in sampleList]

    conditionList = storeDict.get("conditionList", [])
    methodList = storeDict.get("methodList", [])
    geneSourceDict = storeDict.get("[sqlite]geneSourceDict", dict())
    transcriptSourceDict = storeDict.get("[sqlite]transcriptSourceDict",
                                         dict())
    geneExpPathStr = storeDict.get("[sqlite]geneSourcePathStr", "")
    transcriptExpPathStr = storeDict.get("[sqlite]transcriptSourcePathStr",
                                         "")
    sqlFolderStr = storeDict.get("sqlFolderStr", "")
    sqlPathStr = storeDict.get("sqlPathStr", "")
    sqlLogStr = storeDict.get("[sqlite]logFilename", "")

    def checkRows(Print, sampleStr, rowList, columnInt, referenceSet):
        """Log whether the leading columns match the reference rows.

        Returns the reference to keep: the current rows when no reference
        has been recorded yet, otherwise the unchanged reference.
        """
        currentSet = set()
        for rowInt, sourceList in enumerate(rowList):
            keyList = ["UUID." + str(rowInt)]
            keyList.extend(sourceList[0:columnInt])
            currentSet.add("\t".join(str(x) for x in keyList))
        if not referenceSet:
            Print.printing(" " + sampleStr + ": Empty")
            return currentSet
        if referenceSet == currentSet:
            Print.printing(" " + sampleStr + ": Same")
        else:
            Print.printing(" " + sampleStr + ": Different")
        return referenceSet

    def summarise(Print, sqlPath, kindStr, idColumnList, targetList):
        """Create and fill ``<kindStr>ExpressionSummary`` in ``sqlPath``.

        ``idColumnList`` holds ``(name, sqlType)`` pairs copied from the
        control sample's table; ``targetList`` names the value columns
        collected from every sample's table.
        """
        idNameList = ["UUID"] + [nameStr for nameStr, _ in idColumnList]
        createColumnList = ["'UUID' TEXT PRIMARY KEY NOT NULL"]
        createColumnList.extend("'{}' {}".format(nameStr, typeStr)
                                for nameStr, typeStr in idColumnList)
        insertColumnList = list(idNameList)
        for targetStr in targetList:
            for sampleSafeStr in sampleSafeList:
                columnStr = "{target}_{sample}".format(target=targetStr,
                                                       sample=sampleSafeStr)
                createColumnList.append(columnStr + " REAL")
                insertColumnList.append(columnStr)

        Connect = sqlite3.connect(sqlPath)
        try:
            Cursor = Connect.cursor()
            Cursor.execute("CREATE TABLE {}ExpressionSummary ({})".format(
                kindStr, ",".join(createColumnList)))
            Connect.commit()
            Print.printing("[SQLite3:CreateTable] " + sqlPath)

            # The control sample defines which rows appear in the summary.
            resultDict = dict()
            controlExc = Cursor.execute(
                "SELECT {} from {}Expression_{}".format(
                    ", ".join(idNameList), kindStr, controlSafeStr))
            for rowTup in controlExc:
                resultDict[rowTup[0]] = dict(zip(idNameList, rowTup))

            for sampleSafeStr in sampleSafeList:
                sampleExc = Cursor.execute(
                    "SELECT UUID, {} from {}Expression_{}".format(
                        ", ".join(targetList), kindStr, sampleSafeStr))
                for rowTup in sampleExc:
                    subDict = resultDict[rowTup[0]]
                    for targetStr, valueObj in zip(targetList, rowTup[1:]):
                        subDict["{}_{}".format(targetStr,
                                               sampleSafeStr)] = valueObj

            insertCommand = "INSERT INTO {kind}ExpressionSummary ({column}) VALUES ({value})".format(
                kind=kindStr,
                column=",".join(insertColumnList),
                value=",".join("?" * len(insertColumnList)))
            Cursor.executemany(
                insertCommand,
                ([subDict[columnStr] for columnStr in insertColumnList]
                 for subDict in resultDict.values()))
            Connect.commit()
            Print.printing("[SQLite3:Insert] " + sqlPath)
        finally:
            # Release the database even when a statement fails.
            Connect.close()
        Print.printing("[SQLite3:Close]\n")

    for methodStr in methodList:
        pathlib.Path(sqlFolderStr.format(branch=branchStr,
                                         method=methodStr)).mkdir(
                                             parents=True, exist_ok=True)
        geneFolderStr = geneSourceDict.get(methodStr, "")
        transcriptFolderStr = transcriptSourceDict.get(methodStr, "")
        # Gene and transcript rows have different widths, so each kind is
        # compared against its own reference.
        geneReferenceSet = set()
        transcriptReferenceSet = set()

        for conditionTup in conditionList:
            antStr = conditionTup[0]
            trimStr = conditionTup[1]

            Print = libPrint.timer()
            Print.logFilenameStr = sqlLogStr.format(ant=antStr, trim=trimStr)
            Print.folderStr = sqlFolderStr.format(branch=branchStr,
                                                  method=methodStr)
            Print.testingBool = self.testingBool
            Print.startLog()

            # Depends only on method and condition; computed up front so the
            # summary step has a path even when there are no samples.
            sqlPath = sqlPathStr.format(branch=branchStr,
                                        method=methodStr,
                                        ant=antStr,
                                        trim=trimStr)

            for sampleStr, sampleSafeStr in zip(sampleList, sampleSafeList):
                # Gene
                Print.phraseStr = "-- Data format conversion for Gene Expression in {} --".format(
                    sampleStr)
                Print.printTimeStamp()
                geneSamplePath = geneExpPathStr.format(folder=geneFolderStr,
                                                       branch=branchStr,
                                                       ant=antStr,
                                                       trim=trimStr,
                                                       sample=sampleStr)
                sampleDF = pd.read_csv(geneSamplePath,
                                       delimiter="\t",
                                       header=0)
                Print.printing("[Pandas:Read]" + geneSamplePath)
                rowList = sampleDF.values.tolist()
                geneReferenceSet = checkRows(Print, sampleStr, rowList, 7,
                                             geneReferenceSet)
                self.expressionInputDict = {
                    "sqlPath": sqlPath,
                    "count": len(rowList),
                    "rowList": rowList,
                    "createCommand": (
                        "CREATE TABLE GeneExpression_{} "
                        "('UUID' TEXT PRIMARY KEY NOT NULL, "
                        "'GeneID' TEXT NOT NULL, "
                        "'GeneName' TEXT NOT NULL, "
                        "'Reference' TEXT NOT NULL, "
                        "'Strand' TEXT NOT NULL, "
                        "'Start' INTEGER NOT NULL, "
                        "'End' INTEGER NOT NULL, "
                        "'Coverage' REAL NOT NULL, "
                        "'FPKM' REAL NOT NULL, "
                        "'TPM' REAL NOT NULL);").format(sampleSafeStr),
                    "insertCommand": (
                        "INSERT INTO GeneExpression_{} "
                        "('UUID','GeneID','GeneName','Reference','Strand','Start','End','Coverage','FPKM','TPM') "
                        "VALUES (?,?,?,?,?,?,?,?,?,?)").format(sampleSafeStr),
                }
                self.exportingExpression(Print)

                # Transcript
                Print.phraseStr = "-- Data format conversion for Transcript Expression in {} --".format(
                    sampleStr)
                Print.printTimeStamp()
                transcriptSamplePath = transcriptExpPathStr.format(
                    folder=transcriptFolderStr,
                    branch=branchStr,
                    ant=antStr,
                    trim=trimStr,
                    sample=sampleStr)
                sampleDF = pd.read_csv(transcriptSamplePath,
                                       delimiter="\t",
                                       header=0)
                Print.printing("[Pandas:Read] " + transcriptSamplePath)
                rowList = sampleDF.values.tolist()
                transcriptReferenceSet = checkRows(Print, sampleStr, rowList,
                                                   10, transcriptReferenceSet)
                self.expressionInputDict = {
                    "sqlPath": sqlPath,
                    "count": len(rowList),
                    "rowList": rowList,
                    "createCommand": (
                        "CREATE TABLE TranscriptExpression_{} "
                        "('UUID' TEXT PRIMARY KEY NOT NULL, "
                        "'TranscriptID' INTEGER NOT NULL, "
                        "'Chromosome' TEXT, "
                        "'Strand' TEXT NOT NULL, "
                        "'Start' INTEGER NOT NULL, "
                        "'End' INTEGER NOT NULL, "
                        "'TranscriptName' TEXT NOT NULL, "
                        "'ExonCount' INTEGER NOT NULL, "
                        "'Length' INTEGER NOT NULL, "
                        "'GeneID' TEXT NOT NULL, "
                        "'GeneName' TEXT NOT NULL, "
                        "'Coverage' REAL NOT NULL, "
                        "'FPKM' REAL NOT NULL);").format(sampleSafeStr),
                    "insertCommand": (
                        "INSERT INTO TranscriptExpression_{} "
                        "('UUID','TranscriptID','Chromosome','Strand','Start','End','TranscriptName','ExonCount','Length','GeneID','GeneName','Coverage','FPKM') "
                        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"
                    ).format(sampleSafeStr),
                }
                self.exportingExpression(Print)

            Print.phraseStr = "-- Summarising for Gene Expression --"
            Print.printTimeStamp()
            summarise(Print, sqlPath, "Gene", [
                ("GeneID", "TEXT"),
                ("GeneName", "TEXT"),
            ], ["FPKM", "TPM"])

            # Transcript
            Print.phraseStr = "-- Summarising for Transcript Expression --"
            Print.printTimeStamp()
            summarise(Print, sqlPath, "Transcript", [
                ("TranscriptID", "INTEGER"),
                ("TranscriptName", "TEXT"),
                ("GeneID", "TEXT"),
                ("GeneName", "TEXT"),
            ], ["FPKM"])

            Print.stopLog()
def assembling(self):
    """Issue the StringTie assembly command for every sample and condition.

    The command template comes from ``binStringTie-RUN-withoutAnnotation``
    when ``self.withoutAnnotation`` is true and from ``binStringTie-RUN``
    otherwise. File name templates are the ``[<self.headerStr>]...`` entries
    of the ``self.branchStr`` config. For each ``(annotation, trim)`` pair a
    log ``log/05-<header>-assembling-<branch>-<annotate>-<trim>`` is opened,
    the condition's output folder is created and one command per
    group/replication is passed to ``Print.runCommand()``.
    """
    # ---- Parameter for Assembling ----
    BinMap = libConfig.config()
    if self.withoutAnnotation:
        BinMap.queryStr = "binStringTie-RUN-withoutAnnotation"
    else:
        BinMap.queryStr = "binStringTie-RUN"
    BinMap.folderStr = "config/"
    BinMap.modeStr = "UPDATE"
    BinMap.load()
    commandStr = BinMap.storeDict["command"]

    # ---- Initialization for Assembling ----
    Target = libConfig.config()
    Target.queryStr = self.branchStr
    Target.folderStr = "config/"
    Target.modeStr = "UPDATE"
    Target.load()

    branchStr = Target.storeDict.get("branch", "")
    groupList = Target.storeDict.get("group", [])
    replicationList = Target.storeDict.get("replication", [])
    hisat2ConditionStr = Target.storeDict.get("[hisat2]Condition", "")
    conditionList = Target.storeDict.get("conditionList", [])
    inputFileNameStr = Target.storeDict.get(
        "[{}]inputFileName".format(self.headerStr), "")
    outputFileNameStr = Target.storeDict.get(
        "[{}]outputFileName".format(self.headerStr), "")
    outputFolderStr = Target.storeDict.get(
        "[{}]outputFolder".format(self.headerStr), "")
    # A missing "testing" entry counts as testing mode.
    self.testingBool = bool(Target.storeDict.get("testing", True))

    for conditionTup in conditionList:
        antCondStr = conditionTup[0]
        trimCondStr = conditionTup[1]

        Annotate = libConfig.config()
        Annotate.queryStr = antCondStr
        Annotate.folderStr = "config/"
        Annotate.modeStr = "UPDATE"
        Annotate.load()
        threadStr = Annotate.storeDict.get("thread", "")
        antPathStr = Annotate.storeDict.get("antPath", "")

        # ---- Action ----
        Print = libPrint.timer()
        Print.logFilenameStr = "05-{stringtie}-assembling-{branch}-{annotate}-{trim}".format(
            stringtie=self.headerStr,
            branch=branchStr,
            annotate=antCondStr,
            trim=trimCondStr,
        )
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()

        # The formatted path goes into its own variable: writing it back into
        # outputFolderStr would destroy the template, and every later
        # condition would then reuse the first condition's folder.
        outputFolderPathStr = outputFolderStr.format(
            annotateCondition=antCondStr, trimCondition=trimCondStr)
        pathlib.Path(outputFolderPathStr).mkdir(parents=True, exist_ok=True)

        for groupStr in groupList:
            for repliStr in replicationList:
                outputPathStr = outputFileNameStr.format(
                    annotateCondition=antCondStr,
                    trimCondition=trimCondStr,
                    group=groupStr,
                    replication=repliStr)
                inputPathStr = inputFileNameStr.format(
                    annotateCondition=antCondStr,
                    hisat2Condition=hisat2ConditionStr,
                    trimCondition=trimCondStr,
                    group=groupStr,
                    replication=repliStr)

                Print.phraseStr = commandStr.format(bamfile=inputPathStr,
                                                    outputfile=outputPathStr,
                                                    thread=threadStr,
                                                    antPath=antPathStr)
                Print.runCommand()

        Print.stopLog()
def converting(self):
    """Convert annotations with gffread and copy the GTF next to each condition.

    Command templates come from the ``binCufflinks-gffread`` and ``commandCP``
    configs. Only entries of ``conditionsList`` (from the ``self.branchStr``
    config) whose ``transcriptome`` equals ``"gffRead"`` are processed: the
    gffread command is issued when the annotation's ``gtfPath`` does not exist
    yet, then the target folder is created and the copy command is issued.
    The log is written into the annotation's ``dbgaPath`` folder.
    """

    def loadStoreDict(queryStr):
        # Load one config entry in UPDATE mode and return its dictionary.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config.storeDict

    # ---- Parameter ----
    gffDict = loadStoreDict("binCufflinks-gffread")
    copyDict = loadStoreDict("commandCP")

    # ---- Initialization for Assembling ----
    targetDict = loadStoreDict(self.branchStr)
    # A missing "testing" entry counts as testing mode.
    self.testingBool = bool(targetDict.get("testing", True))

    gffreadStr = gffDict["command"]
    copyStr = copyDict["command"]
    branchStr = self.branchStr

    selectedList = []
    for candidateDict in targetDict["conditionsList"]:
        if candidateDict["transcriptome"] == "gffRead":
            selectedList.append(candidateDict)
    gtfDict = targetDict["gtfDict"]

    for conditionDict in selectedList:
        genomeStr = conditionDict["genome"]
        trimStr = conditionDict["trim"]
        transcriptomeStr = conditionDict["transcriptome"]
        infoDict = {
            "branch": self.branchStr,
            "annotate": genomeStr,
            "trim": trimStr,
            "folder": gtfDict[transcriptomeStr]['folder'],
        }
        folderTemplateStr = targetDict["transcriptomeFolder"]
        gtfTemplateStr = targetDict["transcriptomeGTF"]

        specDict = loadStoreDict(genomeStr)  # annotation parameters
        sourcePathStr = specDict["antPath"]
        gtfPathStr = specDict["gtfPath"]
        logFolderStr = specDict["dbgaPath"]

        Log = libPrint.timer()
        Log.logFilenameStr = "05-gffConversion-{branch}-{annotate}".format(
            branch=branchStr,
            annotate=genomeStr,
        )
        Log.folderStr = logFolderStr
        Log.testingBool = self.testingBool
        Log.startLog()

        copyTargetStr = gtfTemplateStr.format(**infoDict)

        # Convert only when the GTF has not been produced before.
        if not pathlib.Path(gtfPathStr).exists():
            Log.phraseStr = gffreadStr.format(inputFile=sourcePathStr,
                                              outputFile=gtfPathStr)
            Log.runCommand()

        pathlib.Path(folderTemplateStr.format(**infoDict)).mkdir(
            parents=True, exist_ok=True)
        Log.phraseStr = copyStr.format(output=gtfPathStr,
                                       target=copyTargetStr)
        Log.runCommand()

        Log.stopLog()
def diffing(self):
    """Run the ``binCuffDiff-RUN`` command once per configured condition.

    Reads the command template from the ``binCuffDiff-RUN`` configuration
    and the branch settings from the configuration named by
    ``self.branchStr``.  For each entry of ``conditionsList`` the result
    folder is created, the BAM file names are formatted for every
    group/replication pair (replicates joined by ``","``, groups joined by
    ``" "``), and the formatted command is handed to the logger's
    ``runCommand``.
    """
    # ---- Parameter for Assembling ----
    BinMap = libConfig.config()
    BinMap.queryStr = "binCuffDiff-RUN"
    BinMap.folderStr = "config/"
    BinMap.modeStr = "UPDATE"
    BinMap.load()
    commandStr = BinMap.storeDict["command"]

    # ---- Initialization for Assembling ----
    Target = libConfig.config()
    Target.queryStr = self.branchStr
    Target.folderStr = "config/"
    Target.modeStr = "UPDATE"
    Target.load()

    settingDict = Target.storeDict
    groupList = settingDict["group"]
    replicationList = settingDict["replication"]
    threadStr = settingDict["thread"]
    bamFileNameStr = settingDict["[hisat2]outputFileName"]
    gtfFileNameStr = settingDict["transcriptomeGTF"]
    gtfDict = settingDict["gtfDict"]
    resultFolderStr = settingDict["[CuffDiff]resultFolder"]

    for conditionDict in settingDict["conditionsList"]:
        genomeStr = conditionDict["genome"]
        trimStr = conditionDict["trim"]
        transcriptomeStr = conditionDict["transcriptome"]
        folderStr = gtfDict[transcriptomeStr]['folder']
        hisat2ConditionStr = conditionDict["map"]

        # A missing "testing" entry counts as testing mode.
        self.testingBool = bool(settingDict.get("testing", True))

        infoDict = dict()
        infoDict["branch"] = self.branchStr
        infoDict["method"] = transcriptomeStr
        infoDict["annotate"] = genomeStr
        infoDict["trim"] = trimStr
        infoDict["folder"] = folderStr
        infoDict["hisat2Condition"] = hisat2ConditionStr
        infoDict["fileType"] = "-sorted.bam"

        # ---- Action ----
        Print = libPrint.timer()
        logNameTemplateStr = "07-CuffDiff-{branch}-from({method})-{annotate}-{trim}"
        Print.logFilenameStr = logNameTemplateStr.format(**infoDict)
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()

        resultPathStr = resultFolderStr.format(**infoDict)
        pathlib.Path(resultPathStr).mkdir(parents=True, exist_ok=True)

        gtfFileStr = gtfFileNameStr.format(**infoDict)

        # One comma-joined entry per group, holding that group's replicates.
        bamGroupList = [
            ",".join(
                bamFileNameStr.format(
                    **dict(infoDict, group=groupStr, replication=repliStr))
                for repliStr in replicationList)
            for groupStr in groupList
        ]

        infoDict.update({
            "thread": threadStr,
            "outputFolder": resultPathStr,
            "labelList": ",".join(groupList),
            "mergedGTF": gtfFileStr,
            "bamFiles": " ".join(bamGroupList),
        })

        Print.phraseStr = commandStr.format(**infoDict)
        Print.runCommand()

        Print.stopLog()
def converting(self):
    """Convert delimited text tables into JSON lookup files.

    Settings are read from ``self.argumentDict``:

    * ``files`` -- paths of the tables to convert (default ``[]``).
    * ``refer_column`` -- header name of the column whose value is the
      record id; when empty, ids are ``prefix`` + running line number.
    * ``prefix`` -- prefix for generated ids (default ``""``).
    * ``header`` -- explicit column names (default ``[]``).
    * ``headless`` -- ``True`` when the files have no header row
      (default ``True``); column names ``Column_<n>`` are generated then.
    * ``delimiter`` -- column separator (default tab).

    For each source file the extension is replaced by ``json``; a file
    whose JSON result already exists is skipped.  Otherwise four files are
    written: ``<name>.json`` (``{id: {key: [values]}}``) plus
    ``-KeyValueIdDict.json``, ``-KeyIdValueDict.json`` and
    ``-KeyMetadata.json`` built by ``relationGeneration``.

    Fixes relative to the previous implementation:

    * generated column names no longer raise ``TypeError`` (``len()`` was
      applied to an ``int``);
    * blank lines no longer raise ``IndexError`` in the ``"#"`` checks;
    * a file without any data line is reported and skipped when the header
      has to be derived from it, instead of failing on an unbound or stale
      first line;
    * the header is derived per file, so the header row of a second file
      is no longer parsed as a data record;
    * the source file is closed after reading;
    * value lists are sorted so the JSON output is reproducible.
    """
    sourceFilesList = self.argumentDict.get("files", [])
    referColumnNameStr = self.argumentDict.get("refer_column", "")
    prefixStr = self.argumentDict.get("prefix", "")
    headerList = self.argumentDict.get("header", [])
    headlessBoo = self.argumentDict.get("headless", True)
    delimiterStr = self.argumentDict.get("delimiter", "\t")
    referColumnExistBoo = referColumnNameStr != ""

    Print = libPrint.timer()
    Print.logFilenameStr = self.logFilenameStr
    Print.folderStr = self.folderStr
    Print.testingBool = self.testingBool
    Print.startLog()

    Print.phraseStr = "Total files: " + pprint.pformat(sourceFilesList)
    Print.printTimeStamp()

    for sourceFilenameStr in sourceFilesList:
        Print.phraseStr = "{Now processing} " + sourceFilenameStr
        Print.printTimeStamp()

        tempFilenamelist = sourceFilenameStr.split(".")
        tempFilenamelist[-1] = "json"
        resultFilenameStr = ".".join(tempFilenamelist)
        if pathlib.Path(resultFilenameStr).exists():
            # Already converted in an earlier run.
            continue

        Print.phraseStr = "{Loading Files} " + sourceFilenameStr
        Print.printTimeStamp()

        with open(sourceFilenameStr) as source_file_handle:
            linesList = source_file_handle.read().splitlines()

        # Drop the leading block of "#" comment lines.  startswith() is
        # used so that an empty line cannot raise IndexError.
        firstDataInt = 0
        while (firstDataInt < len(linesList)
               and linesList[firstDataInt].startswith("#")):
            firstDataInt = firstDataInt + 1
        linesList = linesList[firstDataInt:]

        Print.phraseStr = "{Create Header/Key List} " + sourceFilenameStr
        Print.printTimeStamp()

        # Work on a per-file copy: neither the caller's list nor the header
        # of the next file may be affected by this file.
        fileHeaderList = list(headerList)
        if not fileHeaderList:
            if not linesList:
                # Nothing to derive the columns from.
                Print.phraseStr = "{No data lines} " + sourceFilenameStr
                Print.printTimeStamp()
                continue
            if headlessBoo:
                # Generate zero-padded names: Column_00, Column_01, ...
                columnCountInt = len(linesList[0].split(delimiterStr))
                maxDigitInt = len(str(columnCountInt))
                fileHeaderList = [
                    "Column_" + str(columnNumInt).zfill(maxDigitInt)
                    for columnNumInt in range(columnCountInt)
                ]
            else:
                # The first data line is the header row.
                fileHeaderList = linesList.pop(0).split(delimiterStr)

        Print.phraseStr = "{Assign Key's Position} " + sourceFilenameStr
        Print.printTimeStamp()

        # {column position: key}; a repeated header name keeps only its
        # first position.
        positionDict = dict()
        seenHeaderSet = set()
        for number, headerStr in enumerate(fileHeaderList):
            if headerStr not in seenHeaderSet:
                seenHeaderSet.add(headerStr)
                positionDict[number] = headerStr

        Print.phraseStr = "{Start Conversion} " + sourceFilenameStr
        Print.printTimeStamp()

        idKeyValueDict = dict()
        lineIdInt = 0
        TotalLineCountInt = len(linesList)
        for CurrentlineCountInt, lineStr in enumerate(linesList, start=1):
            if lineStr.startswith("#"):
                continue

            # Progress indicator, overwritten in place on the terminal.
            wordStr = "[" + str(CurrentlineCountInt) + "/" + str(
                TotalLineCountInt) + "]"
            print(wordStr, end="\r")

            tempValueList = lineStr.split(delimiterStr)
            lineIdInt = lineIdInt + 1
            idStr = ""
            if not referColumnExistBoo:
                idStr = prefixStr + str(lineIdInt)

            if len(tempValueList) != len(positionDict):
                # Column count does not match the header.
                print('line: ' + str(lineIdInt))
                continue

            tempKeyValueDict = dict()
            for number in range(len(positionDict)):
                keyStr = positionDict.get(number)
                tempKeyValueDict[keyStr] = tempValueList[number]
                if referColumnExistBoo and keyStr == referColumnNameStr:
                    idStr = tempValueList[number]

            if idStr == "":
                Print.phraseStr = "[{}/{}] Line without id".format(
                    str(CurrentlineCountInt), str(TotalLineCountInt))
                Print.printPhrase()
                continue

            # Merge this line into the record collected for the same id.
            targetValueDict = idKeyValueDict.get(idStr, dict())
            for keyStr, valueStr in tempKeyValueDict.items():
                tempList = targetValueDict.get(keyStr, list())
                tempList.append(valueStr)
                targetValueDict[keyStr] = sorted(set(tempList))
            idKeyValueDict[idStr] = targetValueDict

        Print.phraseStr = "{Rearrange Relation Dict.} " + sourceFilenameStr
        Print.printTimeStamp()

        MakingRelation = relationGeneration()
        MakingRelation.inputDict = dict()
        MakingRelation.inputDict.update(idKeyValueDict)
        MakingRelation.logFilenameStr = self.logFilenameStr + "-relation"
        MakingRelation.generating()
        relationDict = MakingRelation.outputDict

        with open(resultFilenameStr, "w") as result_file_handle:
            json.dump(idKeyValueDict,
                      result_file_handle,
                      indent=4,
                      sort_keys=True)

        # (file name suffix, key in the relation output)
        relationOutputList = [
            ("-KeyValueIdDict.json", "{key:{value:[id]}}"),
            ("-KeyIdValueDict.json", "{key:{id:[value]}}"),
            ("-KeyMetadata.json", "metadata"),
        ]
        for suffixStr, relationKeyStr in relationOutputList:
            filenameStr = resultFilenameStr.replace(".json", suffixStr)
            with open(filenameStr, "w") as relation_file_handle:
                json.dump(relationDict[relationKeyStr],
                          relation_file_handle,
                          indent=4,
                          sort_keys=True)

    Print.stopLog()
def generating(self):
    """Build the value->ids and id->values relations of ``self.inputDict``.

    ``self.inputDict`` is read as ``{id: {key: [value, ...]}}``.  The
    result is stored in ``self.outputDict`` with three entries:

    * ``"{key:{value:[id]}}"`` -- per key, each value mapped to the sorted
      unique ids carrying it;
    * ``"{key:{id:[value]}}"`` -- per key, each id mapped to its sorted
      unique values;
    * ``"metadata"`` -- per key, how many entries have a given number of
      members, under ``"count(id):valueAmount"`` and
      ``"count(value):idAmount"``.

    Entries whose only member is the empty string are left out of both the
    relation and the counts.
    """
    Print = libPrint.timer()
    Print.logFilenameStr = self.logFilenameStr
    Print.folderStr = self.folderStr
    Print.testingBool = self.testingBool
    Print.startLog()

    def summarise(nestedDict):
        # For {key: {item: [member, ...]}} return the de-duplicated, sorted
        # relation and, per key, {member count: number of items}.
        relationByKey = dict()
        tallyByKey = dict()
        for fieldName, membersByItem in nestedDict.items():
            keptDict = dict()
            sizeTally = dict()
            for itemStr, rawList in membersByItem.items():
                uniqueList = sorted(set(rawList))
                if uniqueList == [""] or rawList == []:
                    continue
                keptDict[itemStr] = uniqueList
                sizeInt = len(uniqueList)
                sizeTally[sizeInt] = sizeTally.get(sizeInt, 0) + 1
            relationByKey[fieldName] = keptDict
            tallyByKey[fieldName] = sizeTally
        return relationByKey, tallyByKey

    # Collect both directions in a single pass over the input.
    valueIdDict = dict()
    idValueDict = dict()
    for recordId, fieldMap in self.inputDict.items():
        for fieldName, cellList in fieldMap.items():
            for cellStr in cellList:
                idsOfField = valueIdDict.setdefault(fieldName, {})
                valuesOfField = idValueDict.setdefault(fieldName, {})
                valuesOfField.setdefault(recordId, []).append(cellStr)
                idsOfField.setdefault(cellStr, []).append(recordId)

    targetValueIdDict, keyValueCountDict = summarise(valueIdDict)
    targetIdValueDict, keyIdCountDict = summarise(idValueDict)

    metaDict = {
        "count(id):valueAmount": keyValueCountDict,
        "count(value):idAmount": keyIdCountDict,
    }

    self.outputDict = {
        "{key:{value:[id]}}": targetValueIdDict,
        "{key:{id:[value]}}": targetIdValueDict,
        "metadata": metaDict,
    }

    Print.stopLog()