print("#################################################################") print("# Welcome in extractColfromList (Version " + version + ") #") print("#################################################################") print('Start time: ', start_time, '\n') #get arguments tableFile = relativeToAbsolutePath(args.tableFile) tableFileOut = args.tableFileOut IDlist = args.IDlist if tableFileOut == "": tableFileOut = tableFile.split(".")[0] + "_extractedIDs.tab" #loading IDs to be kept in a list listNameKeep = loadInList(IDlist) if ".gz" in tableFile: fichier = gzip.open(tableFile, "rb") else: fichier = open(tableFile, "rb") #loading column IDs in a list header = fichier.readline().decode("utf-8").rstrip().split("\t") indice = 0 listIndiceKeep = [] for colName in header: if colName in listNameKeep: #print(colName, indice) #get position for each ID to be kept, +1 for cut command (sh col1 = python col0)
# Read the command-line arguments; the three file arguments go through
# relativeToAbsolutePath, the two path arguments are used as given.
gffFile = relativeToAbsolutePath(args.gffFile)
listKeepFile = relativeToAbsolutePath(args.listKeepFile)
tabFile = relativeToAbsolutePath(args.tabFile)
fastaPath = args.fastaPath
pathFileOut = args.pathOut
#fastaFile = relativeToAbsolutePath(args.fastaFile)

# Echo the inputs and outputs (one line each, emitted in a single call).
print(
    "\n".join(
        [
            "\t - Input GFF is: %s" % gffFile,
            "\t - Input listKeppFile is: %s" % listKeepFile,
            "\t - Input tabFile is: %s" % tabFile,
            "\t - Input fasta files is: %s" % fastaPath.pathDirectory,
            "\t - Output fasta files is: %s" % pathFileOut.pathDirectory,
        ]
    )
)

# IDs to keep, with every "Mycfi_gene" occurrence rewritten to "gene_".
listKeepID = [
    keptID.replace("Mycfi_gene", "gene_") for keptID in loadInList(listKeepFile)
]
print("\nTotal ID Keep: %d" % len(listKeepID))

# Accumulators filled while walking the GFF records.
dicoGenesKeepPosOnScaff, geneIDsens = {}, {}
keepValidList = []
recordCount = 0

objGFF = parseGFF(gffFile)
for record in objGFF.parseGFF3():
    if record.type == "mRNA":
        # The gene ID is the transcript ID prefixed with "gene_".
        transcriptID = record.attributes["transcriptId"]
        geneID = "gene_" + transcriptID
# Echo the run parameters and the output location (one line each).
print(
    "\n".join(
        [
            "\t - Chromosome file is: %s" % chromosomeFile,
            "\t - List of gene file is: %s" % listGeneFile,
            "\t - GFF file is : %s" % gffFile,
            "\t - UTRlen is : %s" % UTRlen,
            "\t - UTRchoice is : %s" % UTRchoice,
            "\t - TAG is : %s" % args.idTag,
            " - Output Info:",
            "\t - Output file with gene+UTR is: %s" % outFile,
        ]
    )
)

# Load the chromosome sequences from the FASTA file.
sequencesChrom = Fasta(chromosomeFile)
#print(sequencesChrom.keys())

# Load the list of genes.
keepGeneList = loadInList(listGeneFile)
print("\t - There is %s genes" % len(keepGeneList))

# Open the output file.
with open(outFile, "w") as outputFile:
    # Parse the GFF to get the positions.
    objGFF = parseGFF(gffFile)
    for record in objGFF.parseGFF3():
        chromosome = record.seqid
        if record.type == "gene":
            try:
                geneName = record.attributes[args.idTag]
                #print(geneName)
            except Exception:
                # NOTE(review): any failure is silently ignored here, so
                # geneName keeps its previous value (or stays undefined on
                # the first record) - confirm this is intended.
                pass
# Read the command-line arguments used by this part of the script.
outputfilePath = args.paramoutfile
mggFileKeep = relativeToAbsolutePath(args.mggFileKeep)

print("\t - Path with fasta is: %s" % fastaFile.pathDirectory)
print("\t - Path with corresponding Orthologues is : %s" % listFile.pathDirectory)
print("\t - MGG list keep are in file: %s\n" % mggFileKeep)
print("\t - Output Orthologues fasta is: %s\n\n" % outputfilePath)

# Retrieve the list of complete CDS files (by extension) and print it.
listCDSfiles = fastaFile.lsExtInDirToList(["fasta", "fas", "fa"])
print("\n".join(listCDSfiles))

# Load the list of MGG IDs to keep.
mggKeepall = loadInList(mggFileKeep)

# Filter the list so that only the first ID seen for each name (with its
# "T0"/"T1"/"T2" marker removed) is kept; later ones go to toRM.
# NOTE(review): replace() removes "T0"/"T1"/"T2" anywhere in the ID, not only
# as a suffix - confirm the IDs cannot contain these substrings elsewhere.
MGGWithoutT, mggKeep, toRM = [], [], []
# Companion set for O(1) membership tests; MGGWithoutT stays a list with the
# same content and order as before.
seenMggNoT = set()
for mgg in mggKeepall:
    mggNoT = mgg.replace("T0", "").replace("T1", "").replace("T2", "")
    if mggNoT not in seenMggNoT:
        seenMggNoT.add(mggNoT)
        MGGWithoutT.append(mggNoT)
        mggKeep.append(mgg)
    else:
        toRM.append(mgg)

# Report file for the removed IDs; its name embeds how many were removed.
with open("List" + str(len(toRM)) + "TranscriptAlternatifstoRM.txt", "w") as toRMFile:
    txt = "\n".join(toRM)