Пример #1
0
    # Print the start-up banner (tool name and version) and the start time.
    print("#################################################################")
    print("#            Welcome in extractColfromList (Version " + version +
          ")              #")
    print("#################################################################")
    print('Start time: ', start_time, '\n')

    # Read the command-line arguments.
    tableFile = relativeToAbsolutePath(args.tableFile)
    tableFileOut = args.tableFileOut
    IDlist = args.IDlist

    # No output name given: derive one from the input table path.
    # NOTE(review): split(".")[0] keeps only the text before the FIRST dot of
    # the whole path, so a dot in a directory name would truncate the path
    # there -- confirm whether this is intended.
    if tableFileOut == "":
        tableFileOut = tableFile.split(".")[0] + "_extractedIDs.tab"

    # Load the IDs to keep into a list.
    listNameKeep = loadInList(IDlist)

    # Open the table in binary mode, through gzip when the path contains ".gz".
    # NOTE(review): this is a substring test, not a suffix test, so ".gz"
    # anywhere in the path selects gzip.
    # NOTE(review): the handle is not opened with a `with` block; where it is
    # closed is not visible in this excerpt.
    if ".gz" in tableFile:
        fichier = gzip.open(tableFile, "rb")
    else:
        fichier = open(tableFile, "rb")

    # Read the first line, decode it as UTF-8, strip trailing whitespace and
    # split it on tabs to get the column names.
    header = fichier.readline().decode("utf-8").rstrip().split("\t")

    indice = 0
    listIndiceKeep = []
    for colName in header:
        if colName in listNameKeep:
            #print(colName, indice)
            # Record the position of each ID to keep; +1 because the shell
            # `cut` command numbers columns from 1 while Python starts at 0.
Пример #2
0
    # Resolve the input file arguments through relativeToAbsolutePath.
    gffFile = relativeToAbsolutePath(args.gffFile)
    listKeepFile = relativeToAbsolutePath(args.listKeepFile)
    tabFile = relativeToAbsolutePath(args.tabFile)
    # fastaPath and pathFileOut are used as objects exposing a
    # `.pathDirectory` attribute (see the prints below), not as plain strings.
    fastaPath = args.fastaPath
    pathFileOut = args.pathOut

    #fastaFile = relativeToAbsolutePath(args.fastaFile)

    # Echo the inputs and the output location.
    print("\t - Input GFF is: %s" % gffFile)
    print("\t - Input listKeppFile is: %s" % listKeepFile)
    print("\t - Input tabFile is: %s" % tabFile)
    print("\t - Input fasta files is: %s" % fastaPath.pathDirectory)
    print("\t - Output fasta files is: %s" % pathFileOut.pathDirectory)

    # Load the IDs to keep, rewriting the "Mycfi_gene" substring to "gene_"
    # so they match the geneID format built in the GFF loop below.
    listKeepID = [
        ID.replace("Mycfi_gene", "gene_") for ID in loadInList(listKeepFile)
    ]
    print("\nTotal ID Keep: %d" % len(listKeepID))

    # Parse the GFF and initialise the accumulators used while scanning it.
    objGFF = parseGFF(gffFile)
    recordCount = 0
    dicoGenesKeepPosOnScaff = {}
    keepValidList = []
    geneIDsens = {}

    for record in objGFF.parseGFF3():

        # Only mRNA records are handled in the visible part of this loop.
        if record.type == "mRNA":
            transcriptID = record.attributes["transcriptId"]
            # The gene ID is derived from the transcriptId attribute
            # ("gene_" + transcriptId), not read from a separate gene field.
            geneID = "gene_" + record.attributes["transcriptId"]
Пример #3
0
    # Echo the input parameters.
    print("\t - Chromosome file is: %s" % chromosomeFile)
    print("\t - List of gene file is: %s" % listGeneFile)
    print("\t - GFF file is : %s" % gffFile)
    print("\t - UTRlen is : %s" % UTRlen)
    print("\t - UTRchoice is : %s" % UTRchoice)
    print("\t - TAG is : %s" % args.idTag)

    print(" - Output Info:")
    print("\t - Output file with gene+UTR is:  %s" % outFile)

    # Load the chromosome FASTA (the original note says: into memory).
    sequencesChrom = Fasta(chromosomeFile)
    #print(sequencesChrom.keys())

    # Load the list of genes.
    keepGeneList = loadInList(listGeneFile)
    print("\t - There is %s genes" % len(keepGeneList))

    # Open the output file.
    with open(outFile, "w") as outputFile:
        # Parse the GFF to get the positions.
        objGFF = parseGFF(gffFile)
        for record in objGFF.parseGFF3():
            chromosome = record.seqid
            if record.type == "gene":
                # Read the gene name from the attribute selected by
                # args.idTag.
                # NOTE(review): any exception here is silently ignored and
                # `e` is unused; on failure geneName keeps its value from a
                # previous record, or stays unset if none was read in this
                # excerpt -- confirm this best-effort behaviour is intended.
                try:
                    geneName = record.attributes[args.idTag]
                #print(geneName)
                except Exception as e:
                    pass
    # Read the output path and resolve the MGG keep-list path.
    outputfilePath = args.paramoutfile
    mggFileKeep = relativeToAbsolutePath(args.mggFileKeep)

    # Echo the inputs and the output; fastaFile and listFile are used as
    # objects exposing a `.pathDirectory` attribute.
    print("\t - Path with fasta is: %s" % fastaFile.pathDirectory)
    print("\t - Path with corresponding Orthologues is  : %s" %
          listFile.pathDirectory)
    print("\t - MGG list keep are in file: %s\n" % mggFileKeep)

    print("\t - Output Orthologues fasta is: %s\n\n" % outputfilePath)

    # Get the list of complete CDS files (extensions fasta, fas, fa) and
    # print it, one entry per line.
    listCDSfiles = fastaFile.lsExtInDirToList(["fasta", "fas", "fa"])
    print("\n".join(listCDSfiles))

    # Load the list of MGG IDs to keep.
    mggKeepall = loadInList(mggFileKeep)

    # Filter the list so only one transcript (T0/T1/T2) per ID is kept.
    # MGGWithoutT holds the IDs with the transcript tag removed, mggKeep the
    # retained IDs and toRM the discarded alternative transcripts.
    MGGWithoutT, mggKeep, toRM = [], [], []

    for mgg in mggKeepall:
        # NOTE(review): replace() removes "T0"/"T1"/"T2" wherever they occur
        # in the ID, not only as a suffix.
        mggNoT = mgg.replace("T0", "").replace("T1", "").replace("T2", "")
        # The first ID seen for a given stripped name is kept and any later
        # one is queued for removal, so the result depends on input order.
        if mggNoT not in MGGWithoutT:
            MGGWithoutT.append(mggNoT)
            mggKeep.append(mgg)
        else:
            toRM.append(mgg)

    # Open a file for the discarded transcripts; its name embeds how many
    # there are and the path is relative to the current working directory.
    with open("List" + str(len(toRM)) + "TranscriptAlternatifstoRM.txt",
              "w") as toRMFile:
        txt = "\n".join(toRM)