Example #1
def printArgs(parsedArgs):
    allArgsStr = "All Args \n"
    allArgsStr += "#" * 25 + "\n"
    allArgsStr += f"Simulation Type {parsedArgs.simulation_type}" + "\n"
    allArgsStr += "Input Genomes Files :" + "\n"
    indLevel = "\t"
    for inFile in parsedArgs.genomes:
        allArgsStr += indLevel + "* " + os.path.abspath(inFile) + "\n"

    allArgsStr += "Species Prefix/Name and Id:" + "\n"
    for speciesPrefix in parsedArgs.speciesPrefix:
        allArgsStr += f"{indLevel} * {speciesPrefix}:{parsedArgs.speciesIds[speciesPrefix]}" + "\n"
    if parsedArgs.simulation_type == "RNA":
        allArgsStr += "Input Annotations Files :" + "\n"
        indLevel = "\t"
        for inFile in parsedArgs.annotations:
            allArgsStr += indLevel + "* " + os.path.abspath(inFile) + "\n"
        allArgsStr += "Generated Transciptome Files :" + "\n"
        indLevel = "\t"
        for inFile in parsedArgs.fasta_names:
            allArgsStr += indLevel + "* " + os.path.abspath(inFile) + "\n"
    allArgsStr += f"Read Length : {parsedArgs.input_rlen}" + "\n"
    allArgsStr += f"Read Layout : {parsedArgs.read_layout}" + "\n"
    allArgsStr += "#" * 25 + "\n"

    getLogger().debug(allArgsStr)
    ## TODO :: complete the rest here

    return
Example #2
def bwaIndex(parsedArgs):
    logger = getLogger()
    algo = ""
        #calcualte concat.fasta genome size
    genome_len=0
    for rec in SeqIO.parse(parsedArgs.genomeConcatFasta, 'fasta'):
        genome_len+=len(rec.seq)
        
    if genome_len > 3000000000:
        logger.info("Concatenated genome size is larged than 3GB. Using bwtsw algorithm for index generation" )
        algo = "-a bwtsw"
    
    logger.info("Starting genome indexing with BWA.")
    if os.path.isdir(f"{parsedArgs.out_dir}/BWA_index") == True:
        logger.info("BWA_index directory exists. Generating index files.")
    else:
        logger.info(f"Creating {parsedArgs.out_dir}/BWA_index directory. Writing index files to BWA_index.")
        os.makedirs(f"{parsedArgs.out_dir}/BWA_index")
        
    cmd_bwa_index = "bwa index " \
    f"-p {parsedArgs.out_dir}/BWA_index/concat_BWA " \
    f"{algo} " \
    f"{parsedArgs.genomeConcatFasta}"

    # print(cmd_bwa_index)
    res = crossmapper.externalExec.execute(cmd_bwa_index,"BWA_index", outDir = f"{parsedArgs.out_dir}")
    if not res.resCheck( stdoutRemove = True ):
        sys.exit("Execution Failed")
    logger.info("Genome index for BWA is generated.")
Example #3
def __init__(self, parsedArgs, mapperName="mapper"):
    self.parsedArgs = parsedArgs
    self.logger = getLogger()
    self.mapperName = mapperName
    self.sorted = False
    self.indexFolder = f"{parsedArgs.out_dir}/{mapperName}_index"
    self.mappingDir = f"{parsedArgs.out_dir}/{mapperName}_output"
Example #4
def starIndex(parsedArgs):
    logger = getLogger()
    # calculate concat.fasta genome size
    genome_len = 0
    for rec in SeqIO.parse(parsedArgs.genomeConcatFasta, 'fasta'):
        genome_len += len(rec.seq)
        
    if genome_len > 3000000000:
        logger.warning("Concatenated genome size is larged than 3 Gb! More than 30 GB of RAM will be required for STAR mapping." )
    
    SA_index_size = min(14, round(math.log(genome_len,2)/2) - 1)
    logger.debug("genomeSAindexNbases = %s"%(SA_index_size))
    logger.info("Starting genome indexing with STAR.")
    
    star_index = f"{parsedArgs.out_dir}/STAR_index"
    
    if os.path.isdir(f"{star_index}") == True:
        logger.info("STAR_index directory exists. Generating index files.")
    else:
        logger.info(f"Creating {star_index} directory. Writing index files to STAR_index.")
        os.makedirs(f"{star_index}")
        
        
        
    cmd_star_index = "STAR --runMode genomeGenerate " \
    f"--runThreadN {parsedArgs.threads} " \
    f"--genomeDir {star_index} " \
    f"--genomeFastaFiles {parsedArgs.genomeConcatFasta} " \
    f"--genomeSAindexNbases {SA_index_size}"
    
    res = crossmapper.externalExec.execute(cmd_star_index,"STAR_index" , outDir = f"{parsedArgs.out_dir}" )
    if not res.resCheck( stdoutRemove = True ):
        sys.exit("Execution Failed")
    logger.info("Genome index for STAR is generated.")
    parsedArgs.starIndex = star_index
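
A note on the `--genomeSAindexNbases` value computed above: `starIndex` scales it as `min(14, round(log2(genome_len)/2) - 1)`, which keeps STAR's default of 14 for large genomes and shrinks it for small ones. A minimal standalone sketch of the same calculation (the genome lengths below are illustrative values, not taken from the pipeline):

import math

def sa_index_size(genome_len):
    # Same formula as in starIndex above.
    return min(14, round(math.log(genome_len, 2) / 2) - 1)

# Hypothetical genome lengths in bp, for illustration only:
for glen in (5_000_000, 100_000_000, 3_200_000_000):
    print(glen, "->", sa_index_size(glen))   # 10, 12, 14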
Example #5
def execute(cmd,
            softName="extr_cmd",
            stdOutFile=None,
            stdErrFile=None,
            outDir="",
            overwrite=True):
    mode = "w"
    if not overwrite:
        mode = 'a'
    logger = getLogger()
    logger.debug(f"Start Running {softName} CMD : {cmd}")
    ## strip surrounding whitespace and split into an argument list
    cmd = cmd.strip()
    cmd_list = cmd.split(" ")
    cmd_list = list(filter(None, cmd_list))  ## drop empty tokens left by repeated spaces
    if stdOutFile is None:
        stdOutFile = os.path.join(outDir, softName + "_stdout.txt")

    if stdErrFile is None:
        stdErrFile = os.path.join(outDir, softName + "_stderr.txt")

    with open(stdOutFile, mode) as outfile, open(stdErrFile,
                                                 mode) as errorfile:
        try:
            if not overwrite:
                outfile.write("#" * 25 + "\n")
                errorfile.write("#" * 25 + "\n")
                outfile.write(f"{cmd}\n" + "#" * 25 + "\n")
                errorfile.write(f"{cmd}\n" + "#" * 25 + "\n")
                outfile.flush()
                errorfile.flush()
            process = subprocess.run(cmd_list,
                                     shell=False,
                                     stdout=outfile,
                                     stderr=errorfile,
                                     check=False)
            logger.debug("Running CMD Return Code : " +
                         str(process.returncode))

            if process.returncode != 0:
                logger.error(f"Can not excecute {softName} CMD : \"{cmd}\".")
            return ExecRes(cmd, process, softName, stdOutFile, stdErrFile)
        except FileNotFoundError as no_file:

            logger.error("Error in execute CMD : NO SUCH FILE OR DIRECTORY. " +
                         str(no_file),
                         exc_info=True)
            raise Exception("Can not execute CMD, " + str(no_file))
        except PermissionError as perm_denied:
            logger.error("PERMISSION DENIED, " + str(perm_denied),
                         exc_info=True)
            raise Exception("Error in execute CMD, " + str(perm_denied))

        except Exception as ex:
            logger.error("Error in execute CMD: " + str(ex),
                         exc_info=True)  #raise ex
            raise Exception("Error in execute CMD, " + str(ex))

        return ExecRes(cmd, None, softName, stdOutFile, stdErrFile)
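
Callers throughout these examples pair `execute` with the returned `ExecRes` object's `resCheck` method (see Examples #6 and #7). A minimal usage sketch of that pattern, assuming the same imports the module uses elsewhere; the command and output directory here are hypothetical:

import sys
import crossmapper.externalExec

# Hypothetical command and (existing) output directory, for illustration only.
res = crossmapper.externalExec.execute("samtools --version",
                                       "samtools_version",
                                       outDir="/tmp/crossmap_out")
# resCheck() returns False on a non-zero exit code; on success it removes the
# captured stderr log by default and the stdout log when stdoutRemove=True.
if not res.resCheck(stdoutRemove=True):
    sys.exit("Execution Failed")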
Example #6
def resCheck(self, clean=True, stdoutRemove=False, stdErrRemove=True):
    logger = getLogger()
    if self.returnCode != 0:
        logger.error(f"Error running {self.cmd}")
        logger.error(f"See error log in {self.stdErrFile}")
        return False
    if clean:
        self.clean(stdoutRemove=stdoutRemove, stdErrRemove=stdErrRemove)
    return True
Example #7
def clean(self, stdoutRemove=False, stdErrRemove=True):
    logger = getLogger()
    if stdoutRemove:
        logger.debug(f"Deleting {self.stdOutFile}")
        os.remove(self.stdOutFile)
    if stdErrRemove:
        logger.debug(f"Deleting {self.stdErrFile}")
        os.remove(self.stdErrFile)
    return
Example #8
def bwaMapping(parsedArgs,reads,rlen,read_layout):
    logger = getLogger()
    logger.info("Starting mapping with BWA.")
#    cmd_bwa_mapping = "bwa mem " \
#    f"-t {parsedArgs.threads} concat {reads} -a | " \
#    f"samtools sort @{parsedArgs.threads} -o concat_{rlen}_{read_layout}_sorted.bam -"
#    print(cmd_bwa_mapping)
    
    bwa_dir = f"{parsedArgs.out_dir}/bwa_output"
    parsedArgs.mappingDir = bwa_dir
    if os.path.isdir(f"{bwa_dir}") == False:
        logger.info(f"Creating {bwa_dir} directory.")
        os.makedirs(f"{bwa_dir}")
    
    
    
    
    tmpSamFile = f"{bwa_dir}/concat_{rlen}_{read_layout}.sam"
    finalBamFile = f"{bwa_dir}/concat_{rlen}_{read_layout}_sorted.bam"
    cmd_bwa_mapping = f"bwa mem -a -t {parsedArgs.threads} -A {parsedArgs.match_score} -B {parsedArgs.mismatch_penalty} {parsedArgs.out_dir}/BWA_index/concat_BWA {reads}" 
    #f"samtools sort @{parsedArgs.threads} -o concat_{rlen}_{read_layout}_sorted.bam -
    res = crossmapper.externalExec.execute(cmd_bwa_mapping,"BWA_mapping" , 
                                 tmpSamFile,
                                 None,
                                 outDir = f"{bwa_dir}")
    if not res.resCheck():
        sys.exit("Execution Failed")
    ## TODO :: samtools view -bS
    res = crossmapper.externalExec.execute(f"samtools sort -@{parsedArgs.threads} -o {finalBamFile} {tmpSamFile}",
                                        "samtools" ,
                                        outDir = f"{bwa_dir}")
    if not res.resCheck(stdoutRemove=True,stdErrRemove=True):
        sys.exit("Execution Failed")
    logger.info("Mapping is finished. " + f"Final bam file writen to {finalBamFile}")
    
    try:
        logger.debug(f"Deleting tmp sam file {tmpSamFile}")
        os.remove(tmpSamFile)
    except OSError:
        logger.warning(f"Cannot delete temporary sam file {tmpSamFile}")
    
    
    logger.info("Starting Bam indexing.")
    cmd_samtools_index = f"samtools index {finalBamFile}"
    #print(cmd_samtools_index)
    res = crossmapper.externalExec.execute(cmd_samtools_index,
                                        "samtools_index",
                                        outDir = f"{bwa_dir}")
    if not res.resCheck(stdoutRemove=True,stdErrRemove=True):
        sys.exit("Execution Failed")
    parsedArgs.mappingOutputFiles[rlen][read_layout] = finalBamFile
    logger.info("Bam Indexing is finished.")
Example #9
def printOutputFileInfo(parsedArgs, step='All'):
    logger = getLogger()
    if step == "wgsim" or step == 'All':
        allArgsStr = "\n"
        for rlen, files in parsedArgs.simulationOutputFiles.items():
            allArgsStr += f"\t\t * {rlen} : {files}\n"
        logger.debug("wgsim output files : " + allArgsStr)
    if step == "mapping" or step == 'All':
        allArgsStr = "\n"
        for rlen, layout_files in parsedArgs.mappingOutputFiles.items():
            for layout, files in layout_files.items():
                allArgsStr += f"\t\t * {rlen} ({layout}) : {files}\n"
        logger.debug("mapping output files : " + allArgsStr)
Example #10
def concatGeneomes(parsedArgs):
    logger = getLogger()
    genome_concat = ' '.join(parsedArgs.chr_rename_fasta)
    
    parsedArgs.genomeConcatFasta = f"{parsedArgs.out_dir}/concat.fasta"

#    cmd_genome_concat = f"cat {genome_concat} > {parsedArgs.out_dir}/concat.fasta"
    res = crossmapper.externalExec.execute(f"cat {genome_concat}",
                                  "cat",
                                  f"{parsedArgs.genomeConcatFasta}",
                                  None,
                                  f"{parsedArgs.out_dir}")
    if not res.resCheck():
        sys.exit("Execution fail")
Example #11
def createHTMLReport(resCounters, args):
    logger = getLogger()
    reportFilePath = os.path.join(args.out_dir, "report.html")
    logger.info(f"Creating Report File : {reportFilePath}")
    reportHTML = reportTemplete.render(
        headTemplate=headTemplate,
        contentTemplate=contentTemplate,
        barGroupChartTemplate=barGroupChartTemplate,
        lineChartTemplate=lineChartTemplate,
        seriesTemplate=seriesTemplate,
        drilldownTemplate=drilldownTemplate,
        barchart2DivtableTemplate=barchart2DivtableTemplate,
        barchart2Template=barchart2Template,
        counterRes=resCounters,
        args=args)

    with open(reportFilePath, "w+") as fh:
        fh.write(reportHTML)

    return
Example #12
def concatAnnotations(parsedArgs):
    logger = getLogger()
    if parsedArgs.simulation_type == "RNA":
        ### concatenate gtf files
        gtf_list = []
        for i in range(0, len(parsedArgs.genomes)):
            if parsedArgs.annotations[i].split(".")[-1] == "gtf":
                gtf_list.append(parsedArgs.annotations[i])
            else:
                gtf_name = getBaseName(parsedArgs.annotations[i]) + ".gtf"
                gtf_list.append(f"{parsedArgs.out_dir}/{gtf_name}")

        gtf_concat = ' '.join(gtf_list)

        #    cmd_gtf_concat = f"cat {gtf_concat} > {parsedArgs.out_dir}/concat.gtf"
        res = crossmapper.externalExec.execute(
            f"cat {gtf_concat}", "cat", f"{parsedArgs.out_dir}/concat.gtf",
            None, f"{parsedArgs.out_dir}")
        if not res.resCheck():
            sys.exit("Execution fail")
    parsedArgs.annotationsGTFConcat = f"{parsedArgs.out_dir}/concat.gtf"
Example #13
def readSimulation(parsedArgs, fasta_name, fasta_basename, file_number,
                   read_len):
    logger = getLogger()
    fasta_len = 0
    for rec in SeqIO.parse(f"{parsedArgs.genomeConcatFasta}", 'fasta'):
        fasta_len += len(rec.seq)

    parsedArgs.simDir = os.path.join(parsedArgs.out_dir, "wgsim_output")
    if os.path.isdir(f"{parsedArgs.simDir}") == False:
        logger.info(f"Creating {parsedArgs.simDir} directory.")
        os.makedirs(f"{parsedArgs.simDir}")
    ## if coverage was provided, derive N_reads from it; otherwise fall back to --N_read
    try:
        N_reads = round(parsedArgs.coverage[file_number] * fasta_len /
                        read_len)
    except Exception:
        N_reads = parsedArgs.N_read[file_number]

    random_seed = int(parsedArgs.random_seed) + random.randint(1, 100000)

    cmd_wgsim = f"wgsim " \
f"-e {parsedArgs.error} " \
f"-d {parsedArgs.outer_dist} " \
f"-s {parsedArgs.s_dev} " \
f"-N {N_reads} " \
f"-1 {read_len} " \
f"-2 {read_len} " \
f"-r {parsedArgs.mut_rate} " \
f"-R {parsedArgs.indel_fraction} " \
f"-X {parsedArgs.indel_extend} " \
f"-S {random_seed} " \
f"-A {parsedArgs.discard_ambig} " \
f"{fasta_name} {parsedArgs.simDir}/{fasta_basename}_{read_len}_read1.fastq {parsedArgs.simDir}/{fasta_basename}_{read_len}_read2.fastq "

    crossmapper.externalExec.execute(cmd_wgsim,
                                     "cmd_wgsim",
                                     outDir=f"{parsedArgs.simDir}",
                                     overwrite=False)
    return cmd_wgsim
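
The read count passed to wgsim comes from the usual coverage relationship C = N * L / G, rearranged to N = C * G / L as in `readSimulation` above. A small worked example (coverage, genome length and read length are illustrative values only):

# N_reads = coverage * genome_length / read_length, mirroring readSimulation.
coverage = 10          # hypothetical per-genome coverage
fasta_len = 5_000_000  # hypothetical concatenated fasta length in bp
read_len = 150

N_reads = round(coverage * fasta_len / read_len)
print(N_reads)  # 333333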
Example #14
def parseArgument(argumentParser):
    parsedArgs = argumentParser.parse_args()

    ## set up absolute path for the output dir
    parsedArgs.out_dir = os.path.abspath(parsedArgs.out_dir)
    if not os.path.isdir(parsedArgs.out_dir):
        ## TODO :: try and handle exceptions here
        os.makedirs(parsedArgs.out_dir)

    for i in range(0, len(parsedArgs.genomes)):
        if os.path.exists(parsedArgs.genomes[i]):
            if not os.path.getsize(parsedArgs.genomes[i]) > 0:
                sys.exit(
                    f"Error: {parsedArgs.genomes[i]} file is empty! Please provide a valid file."
                )
        else:
            sys.exit(
                f"Error: {parsedArgs.genomes[i]} file does not exist! Please provide a valid file."
            )


############### checking input
    if len(parsedArgs.genomes) <= 1:
        sys.exit(
            f"Error: Number of provided input genomes must be at least 2.")

    if parsedArgs.simulation_type == "RNA":
        if len(parsedArgs.genomes) != len(parsedArgs.annotations):
            sys.exit(
                f"Error: Number of provided input genomes files does not match number of input annotations files."
            )

    if parsedArgs.coverage is not None:
        if len(parsedArgs.coverage) == 1:
            for ic in range(1, len(parsedArgs.genomes)):
                parsedArgs.coverage.append(parsedArgs.coverage[0])
        if len(parsedArgs.genomes) > len(parsedArgs.coverage):
            sys.exit(
                f"Error: Provided Coverage (--coverage) options do not match the input genomes files. You should provide coverage for each input fasta file or just one coverage for all of them."
            )
    elif parsedArgs.N_read is not None:
        if len(parsedArgs.N_read) == 1:
            for ic in range(1, len(parsedArgs.genomes)):
                parsedArgs.N_read.append(parsedArgs.N_read[0])
        elif len(parsedArgs.genomes) > len(parsedArgs.N_read):
            sys.exit(
                f"Error: Provided  number of reads/read pairs to generate (--N_read) options do not match the input genomes files. You should provide one for each input fasta file or just one for all of them."
            )

    ### for renaming chr names
    parsedArgs.chr_rename_fasta = []
    for i in range(0, len(parsedArgs.genomes)):
        fasta_chr_rename = getBaseName(
            parsedArgs.genomes[i]) + "_rename" + ".fasta"
        parsedArgs.chr_rename_fasta.append(
            os.path.abspath(parsedArgs.out_dir) + "/" + fasta_chr_rename)
    #print(parsedArgs.chr_rename_fasta)

    ### for renaming chr names in gff
    if parsedArgs.simulation_type == "RNA":
        parsedArgs.chr_rename_gff = []
        for i in range(0, len(parsedArgs.annotations)):
            if parsedArgs.annotations[i][-3:] == "gtf":
                gff_chr_rename = getBaseName(
                    parsedArgs.annotations[i]) + "_rename" + ".gtf"
                parsedArgs.chr_rename_gff.append(
                    os.path.abspath(parsedArgs.out_dir) + "/" + gff_chr_rename)
            elif parsedArgs.annotations[i][-3:] == "gff":
                gff_chr_rename = getBaseName(
                    parsedArgs.annotations[i]) + "_rename" + ".gff"
                parsedArgs.chr_rename_gff.append(
                    os.path.abspath(parsedArgs.out_dir) + "/" + gff_chr_rename)

        #print(parsedArgs.chr_rename_gff)

    parsedArgs.fasta_names = []
    if parsedArgs.simulation_type == "RNA":
        for i in range(0, len(parsedArgs.chr_rename_fasta)):
            transcriptome_name = getBaseName(
                parsedArgs.chr_rename_fasta[i]) + "_transcriptome%s" % (
                    i + 1) + ".fasta"
            #            parsedArgs.fasta_names.append(os.path.abspath(transcriptome_name))
            parsedArgs.fasta_names.append(
                os.path.join(parsedArgs.out_dir, transcriptome_name))

        if len(parsedArgs.annotations) > 0:
            for i in range(0, len(parsedArgs.annotations)):
                if os.path.exists(parsedArgs.annotations[i]):
                    if not os.path.getsize(parsedArgs.annotations[i]) > 0:
                        sys.exit(
                            f"Error: {parsedArgs.annotations[i]} file is empty! Please provide a valid file."
                        )
                else:
                    sys.exit(
                        f"Error: {parsedArgs.annotations[i]} file does not exist! Please provide a valid file."
                    )

    else:
        for i in range(0, len(parsedArgs.chr_rename_fasta)):
            parsedArgs.fasta_names.append(
                os.path.abspath(parsedArgs.chr_rename_fasta[i]))
    #print(parsedArgs.fasta_names)

    ## check that all read length values can be converted to int
    try:
        list(map(int, parsedArgs.read_length.split(",")))
    except Exception:
        sys.exit(
            "There are strings or floats in read length values. Please use only standard read lengths!"
        )

    ## convert list of strings to list of integers
    input_rlen = list(map(int, parsedArgs.read_length.split(",")))
    #print(input_rlen)

    ## check if there are duplicated lengths
    if not len(set(input_rlen)) == len(input_rlen):
        sys.exit("Error: read lengths shoud not be duplicated!")

    ## check if any length is not standard
    for length in input_rlen:
        #print(length)
        if not length in standard_rlen:
            sys.exit(
                "Error: input read length %s is not a standard Illumina read length."
                % (length) +
                "\nPlease refer to our help page (crossmap -h) to find standard read lengths."
            )

    parsedArgs.input_rlen = input_rlen

    ## other initialization

    parsedArgs.simulationOutputFiles = {}
    parsedArgs.mappingOutputFiles = {}
    ## default concatenated genome fasta file name
    parsedArgs.simDir = f"{parsedArgs.out_dir}"
    parsedArgs.mappingDir = f"{parsedArgs.out_dir}"
    parsedArgs.genomeConcatFasta = f"{parsedArgs.out_dir}/concat.fasta"
    parsedArgs.annotationsGTFConcat = f"{parsedArgs.out_dir}/concat.gtf"
    ## setting internal variables on the parsedArgs object
    parsedArgs.isDebug = __DEBUG__
    parsedArgs.logPrefix = "crossmap.log"
    parsedArgs.logFile = os.path.join(parsedArgs.out_dir, parsedArgs.logPrefix)
    if parsedArgs.verbose == "Debug":
        parsedArgs.isDebug = True
    parsedArgs.verbose = VerboseLevel.All
    ## option to report crossmapped reads info files
    # parsedArgs.reportCrossmapped = True

    if parsedArgs.genome_names is not None:
        parsedArgs.speciesPrefix = parsedArgs.genome_names
    else:
        parsedArgs.speciesPrefix = None

    if parsedArgs.speciesPrefix is None:
        ## get basename from the genome file
        parsedArgs.speciesPrefix = []
        for i in range(0, len(parsedArgs.genomes)):
            genomePrefix = getBaseName(parsedArgs.genomes[i])
            parsedArgs.speciesPrefix.append(genomePrefix)

    ## create species IDs dict
    parsedArgs.speciesIds = {}
    for i in range(0, len(parsedArgs.speciesPrefix)):
        parsedArgs.speciesIds[parsedArgs.speciesPrefix[i]] = i

    if __DEBUG__:
        printArgs(parsedArgs)
    cmdLine = " ".join(sys.argv)
    setupLogger(parsedArgs)

    getLogger().info("Starting the program with  \"" + cmdLine + "\"")

    ## FIXME :: change this option to be global
    if parsedArgs.simulation_type == "DNA":
        parsedArgs.star_temp_dir = "./TMPs"

    if parsedArgs.mapper_template is None:
        if parsedArgs.simulation_type == "RNA":
            parsedArgs.mapper = STARMapper(parsedArgs)
        else:
            parsedArgs.mapper = BWAMapper(parsedArgs)
    else:
        if os.path.dirname(parsedArgs.mapper_template) == "":
            # if no path was given, look in the current dir first, then in the module's mappers_config folder
            if not os.path.exists(parsedArgs.mapper_template):
                mappersConfigFolder = os.path.abspath(
                    os.path.dirname(os.path.realpath(__file__)) +
                    "/mappers_config/")
                mapperTemplatePath = os.path.join(mappersConfigFolder,
                                                  parsedArgs.mapper_template)
                if not os.path.exists(mapperTemplatePath):
                    ## see if we have an ext
                    if os.path.splitext(parsedArgs.mapper_template)[1] == "":
                        ## add yaml ext and try again
                        mapperTemplatePath = os.path.join(
                            mappersConfigFolder,
                            parsedArgs.mapper_template + ".yaml")
                        if not os.path.exists(mapperTemplatePath):
                            sys.exit(
                                f"Cannot find the mapper template {parsedArgs.mapper_template}."
                            )
                        else:
                            parsedArgs.mapper_template = mapperTemplatePath
                else:
                    parsedArgs.mapper_template = mapperTemplatePath

        with open(parsedArgs.mapper_template, 'r') as inputTemplate:
            try:
                configTemplate = yaml.safe_load(inputTemplate)
                parsedArgs.mapper = TemplateMapper(configTemplate, parsedArgs)
                getLogger().info(
                    f"Custom mapper template {parsedArgs.mapper.mapperName} will be used."
                )
                parsedArgs.mapper.checkDep()
            except yaml.YAMLError as exc:
                sys.exit("Can not Parse config Tempalte,  {0}".format(exc))
            except Exception as ex:
                getLogger().error(
                    "Cannot use custom mapper, {0}".format(ex))  #raise ex
                sys.exit("Error: cannot use custom mapper.")

    return parsedArgs
Example #15
def extractTranscriptome(parsedArgs):
    logger = getLogger()

    for i in range(0, len(parsedArgs.annotations)):
        if parsedArgs.annotations[i].split(".")[-1] == "gtf":
            logger.info(
                "Annotation file %s detected as gtf. Proceeding to transriptome extraction."
                % (os.path.basename(parsedArgs.annotations[i])))
            #get the transcriptome name
            transcriptome_name = getBaseName(
                parsedArgs.genomes[i]) + "_transcriptome%s" % (i +
                                                               1) + ".fasta"

            # extract the transcript

            cmd_gffread_extract = f"gffread " \
            f"-w {parsedArgs.out_dir}/{transcriptome_name} " \
            f"-g {parsedArgs.genomes[i]} " \
            f"{parsedArgs.annotations[i]}"

            #print(cmd_gffread_extract)
            res = crossmapper.externalExec.execute(
                cmd_gffread_extract,
                "gffreadExtract",
                outDir=f"{parsedArgs.out_dir}")
            if not res.resCheck():
                sys.exit("Execution fail")
            #gffread -w transcriptome_name -g parsedArgs.genomes[i] parsedArgs.annotations[i]
            logger.info("Transcriptome extracted for %s" %
                        (os.path.basename(parsedArgs.genomes[i])))

        elif parsedArgs.annotations[i].split(".")[-1] == "gff":

            logger.info(
                "Annotation file %s detected as gff. Converting to gtf using gffread."
                % (os.path.basename(parsedArgs.annotations[i])))

            #converting to gtf
            gtf_name = getBaseName(parsedArgs.annotations[i]) + ".gtf"

            cmd_gffread_convert = f"gffread " \
            f"{parsedArgs.annotations[i]} " \
            f"-T -o {parsedArgs.out_dir}/{gtf_name}"
            #print(cmd_gffread_convert)

            res = crossmapper.externalExec.execute(
                cmd_gffread_convert,
                "gffreadConvert",
                outDir=f"{parsedArgs.out_dir}")
            if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
                sys.exit("Execution fail")
            #gffread parsedArgs.annotations[i] -T -o gtf_name

            logger.info(
                "GFF --> GTF conversion is done. Proceeding to transriptome extraction."
            )

            #get the transcriptome name
            transcriptome_name = getBaseName(
                parsedArgs.genomes[i]) + "_transcriptome%s" % (i +
                                                               1) + ".fasta"


            cmd_gffread_extract = f"gffread " \
            f"-w {parsedArgs.out_dir}/{transcriptome_name} " \
            f"-g {parsedArgs.genomes[i]} " \
            f"{parsedArgs.out_dir}/{gtf_name}"

            #print(cmd_gffread_extract)
            res = crossmapper.externalExec.execute(
                cmd_gffread_extract,
                "gffreadExtract",
                outDir=f"{parsedArgs.out_dir}")
            if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
                sys.exit("Execution fail")
            # extract the transcript
            #gffread -w transcriptome_name -g parsedArgs.genomes[i] gtf_name
            logger.info("Transcriptome extracted for %s" %
                        (os.path.basename(parsedArgs.genomes[i])))
        else:
            logger.error(
                "Error: annotation file %s is neither gtf nor in gff. Please check the annotation file."
                % (os.path.basename(parsedArgs.annotations[i])))
            sys.exit("Execution Failed")
Example #16
def concateFastqFiles(parsedArgs, rlen):
    logger = getLogger()

    #for rlen in parsedArgs.input_rlen:
    genome_list_r1 = []
    genome_list_r2 = []
    for i in range(0, len(parsedArgs.genomes)):
        if parsedArgs.simulation_type == "RNA":
            read_1 = parsedArgs.simDir + "/" + getBaseName(
                parsedArgs.genomes[i]) + "_transcriptome" + str(
                    i + 1) + "_" + str(rlen) + "_read1.fastq"
            genome_list_r1.append(read_1)

            read_2 = parsedArgs.simDir + "/" + getBaseName(
                parsedArgs.genomes[i]) + "_transcriptome" + str(
                    i + 1) + "_" + str(rlen) + "_read2.fastq"
            genome_list_r2.append(read_2)
            #print(genome_list_r2)
        else:
            read_1 = parsedArgs.simDir + "/" + getBaseName(
                parsedArgs.genomes[i]) + "_" + str(rlen) + "_read1.fastq"
            genome_list_r1.append(read_1)

            read_2 = parsedArgs.simDir + "/" + getBaseName(
                parsedArgs.genomes[i]) + "_" + str(rlen) + "_read2.fastq"
            genome_list_r2.append(read_2)

    genome_concat1 = ' '.join(genome_list_r1)
    #cmd_read1_concat = f"cat {genome_concat1} > {parsedArgs.out_dir}/concat_{rlen}_read1.fastq"
    res = crossmapper.externalExec.execute(
        f"cat {genome_concat1}", "cat",
        f"{parsedArgs.simDir}/concat_{rlen}_read1.fastq", None,
        f"{parsedArgs.out_dir}")
    if not res.resCheck():
        sys.exit("Execution fail")

    genome_concat2 = ' '.join(genome_list_r2)

    # cmd_read2_concat = f"cat {genome_concat2} > {parsedArgs.simDir}/concat_{rlen}_read2.fastq"
    if parsedArgs.read_layout != "SE":
        res = crossmapper.externalExec.execute(
            f"cat {genome_concat2}", "cat",
            f"{parsedArgs.simDir}/concat_{rlen}_read2.fastq", None,
            f"{parsedArgs.out_dir}")
        if not res.resCheck():
            sys.exit("Execution fail")
#    else: ## no need ??
#        ## remove right reads files
#        try :
#            logger.debug(f"Removeing simulated reads 2 from wgsim {parsedArgs.out_dir}/concat_{rlen}_read2.fastq")
#            os.remove(f"{parsedArgs.out_dir}/concat_{rlen}_read2.fastq")
#        except:
#            logger.warning(f"Can not remove unwanted reads file  {parsedArgs.out_dir}/concat_{rlen}_read2.fastq")

    parsedArgs.simulationOutputFiles[rlen].append(
        f"{parsedArgs.simDir}/concat_{rlen}_read1.fastq")
    if parsedArgs.read_layout != "SE":
        parsedArgs.simulationOutputFiles[rlen].append(
            f"{parsedArgs.simDir}/concat_{rlen}_read2.fastq")

    ## cleaning up temp files
    tmpFiles = []
    tmpFiles.extend(genome_list_r1)
    tmpFiles.extend(genome_list_r2)
    for tmpFile in tmpFiles:
        try:
            logger.debug(f"Deleteing tmp file {tmpFile}")
            os.remove(tmpFile)
            logger.debug(f"tmp file {tmpFile} delete")
        except Exception:
            logger.error(f"Can not delete tmp file {tmpFile}", exc_info=True)
Example #17
def starMapping(parsedArgs,reads,rlen,read_layout):
    logger = getLogger()
    overhang = rlen - 1
    
    star_dir = f"{parsedArgs.out_dir}/star_output"
    parsedArgs.mappingDir = star_dir
    if os.path.isdir(f"{star_dir}") == False:
        logger.info(f"Creating {star_dir} directory.")
        os.makedirs(f"{star_dir}")

    logger.info("Starting STAR mapping.")
    
    if parsedArgs.bacterial_mode is True:
        intron_len_max=1
    else:
        intron_len_max=0
        
    cmd_star_mapping = "STAR " \
f"--runThreadN {parsedArgs.threads} " \
f"--genomeDir {parsedArgs.starIndex} " \
f"--sjdbGTFfile {parsedArgs.out_dir}/concat.gtf " \
f"--sjdbOverhang {overhang} " \
f"--readFilesIn {reads} " \
"--readFilesCommand cat --outSAMtype BAM Unsorted " \
f"--outFileNamePrefix {star_dir}/concat_{rlen}_{read_layout}_ " \
f"--outFilterMismatchNmax {parsedArgs.outFilterMismatchNmax} " \
f"--outFilterMultimapNmax 10000 " \
f"--outFilterMismatchNoverReadLmax {parsedArgs.outFilterMismatchNoverReadLmax} " \
f"--alignIntronMax {intron_len_max} " \
f"--outTmpDir {parsedArgs.star_temp_dir}"

    # print(cmd_star_mapping)
    res = crossmapper.externalExec.execute(cmd_star_mapping,"STAR_mapping" , outDir = f"{star_dir}", overwrite = False)
    if not res.resCheck(clean = False):
        sys.exit("Execution Failed")
    
    logger.info("Mapping is finished. Started bam file sorting and indexing.")
    
    
    finalBamFile = f"{star_dir}/concat_{rlen}_{read_layout}_sorted.bam"
    tmpBamFile = f"{star_dir}/concat_{rlen}_{read_layout}_Aligned.out.bam"
    cmd_samtools_sort = "samtools sort " \
f"-@{parsedArgs.threads} " \
f"-o {finalBamFile} {tmpBamFile}"

    # print(cmd_samtools_sort)
    res  = crossmapper.externalExec.execute(cmd_samtools_sort,"samtools_sort" , outDir = f"{star_dir}")
    if not res.resCheck(stdoutRemove=True,stdErrRemove=True):
        sys.exit("Execution Failed")
    
    logger.info("Sorting is finished. " +f"Final bam file writen to {finalBamFile}")
    
    
    try:
        logger.debug(f"Deleting tmp bam file {tmpBamFile}")
        os.remove(tmpBamFile)
    except OSError:
        logger.warning(f"Cannot delete temporary bam file {tmpBamFile}")
    
    logger.info("Starting bam indexing.")
    cmd_samtools_index = f"samtools index {finalBamFile}"
    # print(cmd_samtools_index)
    res = crossmapper.externalExec.execute(cmd_samtools_index,"samtools_index" , outDir = f"{star_dir}")

    if not res.resCheck(stdoutRemove=True,stdErrRemove=True):
        sys.exit("Execution Failed")
    logger.info("Bam Indexing is finished.")

        
        
    parsedArgs.mappingOutputFiles[rlen][read_layout] = finalBamFile
Example #18
def getReadCounters(args):
    logger = getLogger()
    speciesIds = args.speciesIds  # { org1Name : 0, org2Name  :1}

    logger.info("Reading Sequence Directory from Fasta files")

    spInputFastaFiles = args.genomes

    allSeqs = getSequencesPerOrganisms(
        spInputFastaFiles)  ## testing was [sp1InputFasta,sp2InputFasta]
    seqsIndex = createSequenceIndex(allSeqs)
    seqsIndexToSeq = dict((v, k) for k, v in seqsIndex.items())
    seqToOrg = sequenceToOrganism(allSeqs)
    transcriptMap = None
    if args.simulation_type == "RNA":
        logger.info("Reading GTF/GFF files for transcripts info ... ")
        transcriptMap = mapTranscriptToSequence(
            [args.annotationsGTFConcat],
            seqsIndex)  ## test was [sp1InputGTF, sp2InputGTF]

    counters = {}
    reportCorssmappedReadFiles = None
    if args.reportCrossmapped:
        reportCorssmappedReadFiles = {}
        ## create file here and store them in dic
        crossmapReadsDirName = args.out_dir + "/crossmapped_reads"
        if not os.path.isdir(crossmapReadsDirName):
            os.makedirs(crossmapReadsDirName)
        for spName in args.speciesPrefix:
            outfilename = crossmapReadsDirName + "/" + spName
            reportReadFile = open(outfilename, "w+")
            reportCorssmappedReadFiles[spName] = reportReadFile

    ## new code here

    outputFile = open(os.path.join(args.out_dir, "report.txt"), "w")
    for rlen, layout_files in args.mappingOutputFiles.items():
        counters[rlen] = {}
        for layout, inBamFileName in layout_files.items():
            logger.info(
                f"Start counting reads for read lenghth {rlen} and ({layout}) layout:"
            )

            bamFile = pysam.AlignmentFile(inBamFileName, "rb")
            ## check if bamFile has the NH tag; if not, calculate it in advance and pass it to the count method
            NHTags = checkNHTag(bamFile)
            allCounter, reads = countReads(
                bamFile,
                speciesIds,
                seqsIndexToSeq,
                seqToOrg,
                rlen=rlen,
                layout=layout,
                transcriptMap=transcriptMap,
                nhTag=NHTags,
                reportReadFiles=reportCorssmappedReadFiles)
            del NHTags
            bamFile.close()
            counters[rlen][layout] = allCounter

            outputFile.write(
                f"Summary Counter for lenghth {rlen} and ({layout}) layout : {inBamFileName}\n"
            )
            allCounter.summary(outputFile)
            outputFile.write("*" * 50 + "\n\n")

    outputFile.close()
    if args.reportCrossmapped:
        ## close files
        for spName in args.speciesPrefix:
            reportCorssmappedReadFiles[spName].close()
    createHTMLReport(counters, args)

    return counters
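
`getReadCounters` relies on helpers that are not shown here (`checkNHTag`, `countReads`, `getSequencesPerOrganisms`, and others). Purely as an illustration of the NH tag it checks, a minimal pysam sketch that counts how many alignments carry the tag and how many of those are multi-mapped; the BAM path is hypothetical, and this is not the pipeline's own counting logic:

import pysam

# Hypothetical sorted BAM produced by one of the mapping steps above.
bamFile = pysam.AlignmentFile("/data/out/star_output/concat_75_PE_sorted.bam", "rb")

with_nh = 0
multimapped = 0
for read in bamFile.fetch(until_eof=True):
    if read.is_unmapped:
        continue
    if read.has_tag("NH"):
        with_nh += 1
        if read.get_tag("NH") > 1:  # NH = number of reported alignments for this read
            multimapped += 1
bamFile.close()

print(f"alignments with NH tag: {with_nh}, of which multi-mapped: {multimapped}")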