Example #1
def braker_aat(queue, ref, bamFile, species_name, protein_evidence, threads, fungus, list_fasta_names, wd, braker_out, verbose):
    '''Handles Braker and AAT so that we can run them in parallel'''
    # DIVIDE THREADS BY 2
    use = (round(int(threads) / 2) - 1)
    aat_wd = wd + 'AAT/'
    logistic.check_create_dir(aat_wd)
    while True:
        dummy = queue.get()
        if dummy == 0:
            transcripts.braker_call(braker_out, ref, bamFile, species_name, use, fungus, verbose)
        if dummy == 1:
            multiple.aat_multi(use, protein_evidence, list_fasta_names, aat_wd, verbose)
        queue.task_done()
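
Since braker_aat blocks forever on queue.get(), it only works as a daemon-thread worker. A minimal driver sketch, mirroring the Queue/Thread pattern that main() (Example #7) uses; run_braker_and_aat and its parameters are illustrative names, not part of LoReAn:

from queue import Queue
from threading import Thread

def run_braker_and_aat(worker, worker_args, n_tasks=2):
    # Enqueue one task id per tool: 0 -> BRAKER, 1 -> AAT.
    queue = Queue()
    for task_id in range(n_tasks):
        queue.put(task_id)
    # One daemon worker per task; daemon threads never block interpreter
    # exit, which matters because the worker's `while True` loop never returns.
    for _ in range(n_tasks):
        t = Thread(target=worker, args=(queue, *worker_args))
        t.daemon = True
        t.start()
    queue.join()  # returns once every item has been marked task_done()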
Example #2
def braker_aat(queue, ref, bamFile, species_name, protein_evidence, threads,
               fungus, list_fasta_names, wd, braker_out, verbose):
    '''Handles Braker and AAT so that we can run them in parallel'''
    # DIVIDE THREADS BY 2
    use = (round(int(threads) / 2) - 1)
    aat_wd = wd + 'AAT/'
    logistic.check_create_dir(aat_wd)
    while True:
        dummy = queue.get()
        if dummy == 0:
            transcripts.braker_call(braker_out, ref, bamFile, species_name,
                                    use, fungus, verbose)
        if dummy == 1:
            multiple.aat_multi(use, protein_evidence, list_fasta_names, aat_wd,
                               verbose)
        queue.task_done()
Example #3
def august_gmes_exonerate(queue, ref, species, protein_evidence, threads, fungus, list_fasta_names, wd, exonerate_wd, verbose):
    '''Handles Augustus, GeneMark-ES and Exonerate so that we can run them in parallel'''
    # DIVIDE THREADS BY 3, ONE SHARE PER TOOL
    use = (round(int(threads) / 3) - 1)
    use_gmes = str(use)
    augustus_wd = wd + 'augustus/'
    logistic.check_create_dir(augustus_wd)
    gmes_wd = wd + 'gmes/'
    logistic.check_create_dir(gmes_wd)
    while True:
        dummy = queue.get()
        if dummy == 0:
            multiple.augustus_multi(use, species, list_fasta_names, augustus_wd, verbose)
        if dummy == 1:
            protein.protAlign(ref, protein_evidence, use, exonerate_wd, verbose)
        if dummy == 2:
            transcripts.gmes_call(gmes_wd, ref, fungus, use_gmes, verbose)
        queue.task_done()
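
One caveat with the round(int(threads) / 3) - 1 split: for fewer than six threads it yields 0 or -1, which downstream tools may reject. A defensive variant (an assumption, not LoReAn's actual behaviour) would floor each tool's share at one thread:

def threads_per_tool(threads, n_tools=3):
    # e.g. threads=4, n_tools=3 -> round(1.33) - 1 = 0 -> clamped to 1
    return max(1, round(int(threads) / n_tools) - 1)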
Example #4
def star(reference, fastq_reads, threads, max_intron_length, wd, verbose):
    '''Calls the mapper STAR to map fastq_reads to reference.
    First builds the reference index, then maps'''
    # Create dir for genome index
    refer = reference.split('/')[-1]
    genome_dir = wd + refer + '_STARindex/'
    logistic.check_create_dir(genome_dir)
    # Build the reference
    sys.stdout.write('###BUILD INDEX###\n')
    star_build(reference, genome_dir, threads, wd, verbose)

    # Mapping
    sys.stdout.write('###MAP###\n')
    out_file = star_map(fastq_reads, threads, genome_dir, max_intron_length,
                        wd, verbose)
    return out_file
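
The star_build and star_map helpers are not shown in this listing. For orientation, a hypothetical sketch of what an index-building step along these lines could wrap, using STAR's real genomeGenerate mode (the function name and log path are illustrative):

import subprocess
import sys

def star_build_sketch(reference, genome_dir, threads, wd, verbose):
    # Build the STAR genome index into genome_dir.
    cmd = ['STAR', '--runMode', 'genomeGenerate',
           '--genomeDir', genome_dir,
           '--genomeFastaFiles', reference,
           '--runThreadN', str(threads)]
    if verbose:
        sys.stderr.write(' '.join(cmd) + '\n')
    with open(wd + 'star_build.log', 'w') as log:
        subprocess.check_call(cmd, stdout=log, stderr=log)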
Example #5
def star(reference, fastq_reads, threads, max_intron_length, wd, verbose):
    '''Calls the mapper STAR to map fastq_reads to reference.
    First builds the reference index, then maps'''
    # Create dir for genome index
    refer = reference.split('/')[-1]
    genome_dir = wd + refer + '_STARindex/'
    logistic.check_create_dir(genome_dir)
    # Build the reference
    sys.stdout.write('\t###BUILD INDEX###\n')
    star_build(reference, genome_dir, threads, wd, verbose)

    # Mapping
    sys.stdout.write('\t###MAP###\n')
    out_file = star_map(
        fastq_reads,
        threads,
        genome_dir,
        max_intron_length,
        wd, verbose)
    return out_file
Example #6
def august_gmes_aat(queue, ref, species, protein_evidence, threads, fungus, list_fasta_names, wd, verbose):
    '''Handles Augustus, GeneMark-ES and AAT so that we can run them in parallel'''
    # DIVIDE THREADS BY 3, ONE SHARE PER TOOL
    use = (round(int(threads) / 3) - 1)
    use_gmes = str(use)
    augustus_wd = wd + 'augustus/'
    logistic.check_create_dir(augustus_wd)
    gmes_wd = wd + 'gmes/'
    logistic.check_create_dir(gmes_wd)
    aat_wd = wd + 'AAT/'
    logistic.check_create_dir(aat_wd)
    while True:
        dummy = queue.get()
        if dummy == 0:
            multiple.augustus_multi(use, species, list_fasta_names, augustus_wd, verbose)
        if dummy == 1:
            multiple.aat_multi(use, protein_evidence, list_fasta_names, aat_wd, verbose)
        if dummy == 2:
            transcripts.gmes_call(gmes_wd, ref, fungus, use_gmes, verbose)
        queue.task_done()
Example #7
def main():

    fmtdate = '%H:%M:%S %d-%m'
    now = datetime.datetime.now().strftime(fmtdate)
    home = os.path.expanduser("~")
    args = arguments.setting()
    if args.pasa_db == "":
        pasadb = ''.join(random.sample(string.ascii_lowercase, 5))
    else:
        pasadb = args.pasa_db
    augustus_species = logistic.augustus_species_func()

    if not augustus_species.get(args.species) and args.long_reads == "" and args.short_reads == "":
        sys.exit("#####PLEASE DEFINE A SPECIES NAME OR ANY KIND OF RNA-SEQ AND RE-RUN\t" + now + "\t#####\n")
    max_threads = multiprocessing.cpu_count()
    gmap_name = args.reference + '_GMAPindex'
    pasa_name = 'assembler-' + pasadb
    if args.upgrade == "":
        protein_loc = os.path.abspath(args.proteins)
    iprscan_log = iprscan.check_iprscan()
    # Useful variables for later
    root = os.getcwd()

    #if args.out_dir != "":# and args.out_dir.startswith("/"):
    #    output_dir = os.path.join(root, "LoReAn" + args.out_dir)
    #else:
    output_dir = os.path.join(root, "LoReAn_" + args.out_dir)
    logistic.check_create_dir(output_dir)
    if args.keep_tmp or args.verbose:
        wd = os.path.join(output_dir, "run/")
        logistic.check_create_dir(wd)
    else:
        temp_dir = tempfile.TemporaryDirectory(prefix='run_', dir=output_dir, suffix="/", )
        wd = temp_dir.name

    if args.upgrade == "":
        #if not os.path.isfile(home + "/.gm_key"):
        #    sys.exit("#####LOREAN STOPS HERE. CHECK THAT THE gm_key IS IN THE HOME FOLDER#####\n")
        if args.proteins == "":
            if not args.keep_tmp and not args.verbose:
                shutil.rmtree(wd)
            sys.exit("#####LOREAN STOPS HERE. CHECK THAT THE PROTEIN OPTION IS SET#####\n")
    if args.long_reads != "":
        if args.stranded or args.adapter:
            if args.adapter == '':
                adapter_value = True
                sys.stdout.write('### RUNNING IN STRAND MODE AND FINDING ADAPTER AUTOMATICALLY ###\n')
                stranded_value = True
            else:
                adapter_value = args.adapter
                sys.stdout.write('### RUNNING IN STRAND MODE AND USING ADAPTER PROVIDED ###\n')
                stranded_value = True
        else:
            stranded_value = False
            sys.stdout.write('### RUNNING IN NON-STRAND MODE ###\n')
            adapter_value = False
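    # NOTE: stranded_value and adapter_value are assigned only inside this
    # long-reads branch; the consensus stage further down assumes both exist.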
    ref_orig = os.path.abspath(args.reference)
    ref_link = os.path.join(wd, args.reference)
    if not os.path.exists(ref_link):
        shutil.copyfile(ref_orig, ref_link)
    long_reads = args.long_reads
    fasta = (".fasta", ".fa", ".fas", ".fsta")
    fastq = (".fastq", ".fq")
    '''Core of the program'''
    # Parse the arguments
    if int(args.threads) > max_threads:
        threads_use = str(max_threads)
        sys.stdout.write(('### MAX NUMBER OF USED THREADS IS ' + str(max_threads) + ' AND NOT ' + args.threads + ' AS SET ###\n'))
    else:
        threads_use = args.threads
    if args.external:
        external_file = args.external
    else:
        external_file = ''
    if args.upgrade == "":
        if args.species == "":
            sys.exit("#####PLEASE DEFINE A SPECIES NAME\t" + now + "\t#####\n")
        else:
            if args.short_reads == '' and long_reads == '':
                if external_file.endswith("gff3") or external_file.endswith(fasta):
                    weights_dic = {'Augustus': args.augustus_weigth, 'GeneMark.hmm': args.genemark_weigth, 'exonerate': args.exonerate_weigth,
                                   'external': args.external_weigth}
                else:
                    weights_dic = {'Augustus': args.augustus_weigth, 'GeneMark.hmm': args.genemark_weigth, 'exonerate': args.exonerate_weigth}
            elif args.short_reads != '' or long_reads != '':
                if external_file.endswith("gff3") or external_file.endswith(fasta):
                    weights_dic = {'Augustus': args.augustus_weigth, pasa_name: args.pasa_weigth, 'GeneMark.hmm': args.genemark_weigth,
                                   'exonerate': args.exonerate_weigth, gmap_name: args.trinity_weigth, 'external': args.external_weigth}
                else:
                    weights_dic = {'Augustus': args.augustus_weigth, pasa_name: args.pasa_weigth, 'GeneMark.hmm': args.genemark_weigth,
                                   'exonerate': args.exonerate_weigth, gmap_name: args.trinity_weigth}
    final_files = []  # STORE THE IMPORTANT OUTPUT FILES
    logistic.check_create_dir(wd)
    logistic.check_file(ref_link)
    gmap_wd = os.path.join(wd, 'gmap_output/')
    exonerate_wd = os.path.join(wd, 'exonerate')
    pasa_dir = os.path.join(wd, 'PASA/')
    star_out = os.path.join(wd, 'STAR/')
    trin_dir = os.path.join(wd, 'Trinity/')
    evm_inputs_dir = os.path.join(wd, 'evm_inputs/')
    braker_folder = os.path.join(wd, 'braker/')
    evm_output_dir = os.path.join(wd, 'evm_output/')
    interproscan_out_dir = os.path.join(wd, 'interproscan')
    wd_split = os.path.join(wd, 'split/')
    logistic.check_create_dir(wd_split)
    logistic.check_create_dir(evm_inputs_dir)
    logistic.check_create_dir(evm_output_dir)
    logistic.check_create_dir(trin_dir)
    logistic.check_create_dir(star_out)
    logistic.check_create_dir(pasa_dir)
    logistic.check_create_dir(gmap_wd)
    logistic.check_create_dir(exonerate_wd)
    if args.interproscan:
        logistic.check_create_dir(interproscan_out_dir)
    if long_reads:
        consensus_wd = os.path.join(wd , 'consensus/')
        logistic.check_create_dir(consensus_wd)
    if long_reads != "" or args.short_reads != "":
        logistic.check_gmap(threads_use, 'samse', args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                            args.verbose)

    if args.repeat_masked != "":
        sys.stdout.write(('###MASKING THE GENOME STARTED AT:\t' + now + '\t###\n'))
        masked_ref = mseq.maskedgenome(wd_split, ref_link, args.repeat_masked, args.repeat_lenght, args.verbose)
    elif args.mask_genome:
        sys.stdout.write(('###RUNNING REPEATSCOUT AND REPEATMASKER TO MASK THE GENOME STARTED AT:\t' + now + '\t###\n'))
        masked_ref, repeats_families, repeats_gff = mseq.repeatsfind(ref_link, wd_split, threads_use, args.verbose)
        if os.path.exists(repeats_families):
            final_files.append(repeats_families)
        if os.path.exists(repeats_gff):
            final_files.append(repeats_gff)
    else:
        masked_ref = ref_link
    list_fasta_names, dict_ref_name, ref_rename = multiple.single_fasta(masked_ref, wd_split)
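    # single_fasta splits the (masked) genome into per-sequence FASTA files and
    # renames the sequences; dict_ref_name is what change_chr uses below to map
    # the renamed ids back to the original sequence names in the outputs.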
    if args.short_reads or long_reads:
        if int(threads_use) > 1:
            trinity_cpu = int(int(threads_use) / int(2))
        else:
            trinity_cpu = int(threads_use)
        now = datetime.datetime.now().strftime(fmtdate)
        # SHORT READS
        if args.short_reads.endswith(fastq):
            sys.stdout.write(('###STAR MAPPING  STARTED AT:\t' + now + '\t###\n'))
            if ',' in args.short_reads:
                paired_end_files = args.short_reads.split(',')
                short_1 = os.path.abspath(paired_end_files[0])
                short_2 = os.path.abspath(paired_end_files[1])
                short_reads_file = [short_1, short_2]
            else:
                short_reads_file = os.path.abspath(args.short_reads)
            # Map with STAR
            short_bam = mapping.star(ref_rename, short_reads_file, threads_use, args.max_intron_length, star_out,
                                     args.verbose)
            short_sorted_bam = mapping.samtools_sort(short_bam, threads_use, wd, args.verbose)
            final_mapping_star = mapping.change_chr(short_sorted_bam, dict_ref_name, star_out, threads_use, args.verbose, "short")
            default_bam = short_sorted_bam
            # Keep the output
            final_files.append(final_mapping_star)
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###TRINITY STARTS AT:\t' + now + '\t###\n'))
            trinity_out = transcripts.trinity(short_sorted_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)
            if args.upgrade == "":
                trinity_gff3 = mapping.gmap('trin', ref_rename, trinity_out, threads_use, 'gff3_gene',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=True)
                trinity_path = trinity_gff3
            long_sorted_bam = False
        # BAM SORTED FILES GET IN HERE
        elif args.short_reads.endswith("bam") or long_reads.endswith("bam"):
            logistic.check_create_dir(star_out)
            if args.short_reads.endswith("bam"):
                map_reads = os.path.abspath(args.short_reads)
                short_sorted_bam = mapping.change_chr_to_seq(map_reads, dict_ref_name, star_out, threads_use, args.verbose)
            else:
                map_reads = os.path.abspath(long_reads)
                short_sorted_bam = mapping.change_chr_to_seq(map_reads, dict_ref_name, star_out, threads_use, args.verbose)
                mapping.samtools_index(short_sorted_bam, star_out, args.verbose)
                long_reads = transcripts.bamtofastq(short_sorted_bam, args.verbose)
            #short_sorted_bam = os.path.abspath(args.short_reads)
            default_bam = short_sorted_bam
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###TRINITY STARTS AT:\t' + now + '\t###\n'))
            trinity_out = transcripts.trinity(short_sorted_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)
            if args.upgrade == "":
                trinity_gff3 = mapping.gmap('trin', ref_rename, trinity_out, threads_use, 'gff3_gene',
                                        args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                        args.verbose, Fflag=True)
                trinity_path = trinity_gff3
            long_sorted_bam = False
        # LONG READS
        elif long_reads.endswith(fastq) or long_reads.endswith(fasta):
            # with this operation, reads are filtered for their length.
            # Nanopore reads can be chimeras or sequencing artefacts.
            # filtering on length reduces the amount of sequencing
            # artefacts
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(("###FILTERING OUT LONG READS STARTED AT:\t" + now + "\t###\n"))
            long_fasta, stranded_value = mseq.filterLongReads(long_reads, args.assembly_overlap_length, args.max_long_read, gmap_wd,
                                              adapter_value, threads_use, args.adapter_match_score, ref_rename,
                                              args.max_intron_length, args.verbose, stranded_value)


            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###GMAP\t' + now + '\t###\n'))
            if args.minimap2:
                long_sam = mapping.minimap(ref_rename, long_fasta, threads_use, args.max_intron_length, gmap_wd, args.verbose)
            else:
                long_sam = mapping.gmap('sam', ref_rename, long_fasta, threads_use, 'samse',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=False)

            # Convert to sorted BAM
            long_sorted_bam = mapping.sam_to_sorted_bam(long_sam, threads_use, gmap_wd, args.verbose)
            sam_orig_id = mapping.change_chr(long_sorted_bam, dict_ref_name, gmap_wd, threads_use, args.verbose, "long")
            default_bam = long_sorted_bam
            # Keep the output

            final_files.append(sam_orig_id)
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###TRINITY STARTS AT:\t' + now + '\t###\n'))
            trinity_out = transcripts.trinity(long_sorted_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)
            if args.upgrade == "":
                trinity_gff3 = mapping.gmap('trin', ref_rename, trinity_out, threads_use, 'gff3_gene',
                                        args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                        args.verbose, Fflag=True)
                trinity_path = trinity_gff3
        else:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###NO LONG READS FILE OR SHORT READS\t' + now + '\t###\n'))
        # PASA Pipeline
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('###PASA STARTS AT:\t' + now + '\t###\n'))
        # Create PASA folder and configuration file
        #align_pasa_conf = pasa.pasa_configuration(pasa_dir, pasadb, args.verbose)
        # Launch PASA

        if args.upgrade == "":
            #if os.path.isfile(home + "/.gm_key") and args.proteins != "":
            if args.proteins != "":
                pasa_gff3 = pasa.pasa_call(pasa_dir, pasadb, ref_rename, trinity_out, args.max_intron_length,
                                           threads_use, args.verbose)
                final_files.append(grs.trasform_gff(pasa_gff3, dict_ref_name))
        # HERE WE PARALLELIZE PROCESSES WHEN MULTIPLE THREADS ARE USED
                if args.species in augustus_species:
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('###AUGUSTUS, GENEMARK-ES AND EXONERATE STARTED AT:\t' + now + '\t###\n'))
                    queue = Queue()
                    for software in range(3):
                        queue.put(software)  # FILL THE QUEUE WITH 0, 1 AND 2
                    for software in range(3):
                        t = Thread(target=handler.august_gmes_exonerate, args=(queue, ref_rename, args.species, protein_loc,
                                                                               threads_use, args.fungus, list_fasta_names, wd, exonerate_wd,
                                                                               args.verbose))
                        t.daemon = True
                        t.start()
                    queue.join()
                    augustus_file = wd + 'augustus/augustus.gff'
                    augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                    final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
                    genemark_file = wd + 'gmes/genemark.gtf'
                    genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                    final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
                    merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
                    final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))

                elif args.short_reads or long_reads:  # USING PROTEINS AND SHORT READS
                    logistic.check_create_dir(braker_folder)
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('###BRAKER1 (USING SHORT READS) AND EXONERATE STARTED AT:\t' + now + '\t###\n'))
                    queue = Queue()
                    for software in range(2):
                        queue.put(software)  # FILL THE QUEUE WITH 0 AND 1
                    for software in range(2):
                        t = Thread(target=handler.braker_exonerate, args=(queue, ref_rename, default_bam, args.species, protein_loc,
                                                                          threads_use, args.fungus, wd,
                                                                          braker_folder, exonerate_wd, args.verbose))
                        t.daemon = True
                        t.start()
                    queue.join()
                    augustus_file, genemark_file = inputEvm.braker_folder_find(braker_folder)
                    augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                    genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                    merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
                    final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))

                else:  # USING PROTEINS AND LONG READS
                    queue = Queue()
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('###BRAKER1 (USING LONG READS) AND EXONERATE STARTED AT: \t' + now + '\t###\n'))
                    logistic.check_create_dir(braker_folder)
                    for software in range(2):
                        queue.put(software)  # FILL THE QUEUE WITH 0 AND 1
                    for software in range(2):
                        t = Thread(target=handler.braker_exonerate,
                                   args=(queue, ref_rename, long_sorted_bam, args.species, protein_loc,
                                         threads_use, args.fungus, wd, braker_folder, exonerate_wd, args.verbose))
                        t.daemon = True
                        t.start()
                    queue.join()
                    augustus_file, genemark_file = inputEvm.braker_folder_find(braker_folder)
                    augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                    genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                    merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
                    final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))
    elif args.species in augustus_species or args.species != "" or args.upgrade != "":
        #if os.path.isfile(home + "/.gm_key") and args.proteins != "":
        if args.proteins != "":
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###AUGUSTUS, GENEMARK-ES AND EXONERATE STARTED AT:\t' + now + '\t###\n'))
            queue = Queue()
            for software in range(3):
                queue.put(software)  # FILL THE QUEUE WITH 0, 1 AND 2
            for software in range(3):
                t = Thread(target=handler.august_gmes_exonerate, args=(queue, ref_rename, args.species, protein_loc,
                                                                       threads_use, args.fungus, list_fasta_names, wd, exonerate_wd,
                                                                       args.verbose))
                t.daemon = True
                t.start()
            queue.join()
            augustus_file = wd + 'augustus/augustus.gff'
            augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
            genemark_file = wd + 'gmes/genemark.gtf'
            genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
            merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
            final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
            final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
            final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))
    else:
        now = datetime.datetime.now().strftime(fmtdate)
        sys.exit("#####UNRECOGNIZED SPECIES FOR AUGUSTUS AND NO READS\t" + now + "\t#####\n")
    # Prepare EVM input files
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(('###EVM STARTED AT:\t' + now + '\t###\n'))
    # HERE WE CONVERT FILES FOR EVM AND PLACE THEM IN INPUT FOLDER
    round_n = 0
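    # round_n counts PASA update rounds; it is incremented before each
    # pasa.update_database call below.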
    if args.upgrade == "":
        if not args.short_reads and not long_reads:
            if external_file:
                if external_file.endswith(fasta):
                    external_file_gff3 = mapping.gmap('ext', ref_rename, external_file, threads_use, 'gff3_gene',
                                                      args.min_intron_length, args.max_intron_length, args.end_exon,
                                                      gmap_wd, args.verbose, Fflag=True)
                    external_file_changed = update.external(external_file_gff3, gmap_wd, args.verbose)
                elif external_file.endswith("gff3"):
                    external_file_changed = update.external(external_file, gmap_wd, args.verbose)
                evm_inputs = {'augustus': augustus_gff3, 'genemark': genemark_gff3, 'exonerate': merged_prot_gff3,
                              'external': external_file_changed}
            else:
                evm_inputs = {'augustus': augustus_gff3, 'genemark': genemark_gff3, 'exonerate': merged_prot_gff3}
        elif args.short_reads or long_reads:
            if args.external:
                external_file = args.external
                if external_file.endswith(fasta):
                    external_file_gff3 = mapping.gmap('ext', ref_rename, external_file, threads_use, 'gff3_gene',
                                                      args.min_intron_length, args.max_intron_length, args.end_exon,
                                                      gmap_wd, args.verbose, Fflag=True)
                    external_file_changed = update.external(external_file_gff3, gmap_wd, args.verbose)
                elif external_file.endswith("gff3"):
                    external_file_changed = update.external(external_file, gmap_wd, args.verbose)
                evm_inputs = {'pasa': pasa_gff3, 'augustus': augustus_gff3, 'genemark': genemark_gff3,
                              'exonerate': merged_prot_gff3, 'gmap': trinity_path, 'external': external_file_changed}
            else:
                evm_inputs = {'pasa': pasa_gff3, 'augustus': augustus_gff3, 'genemark': genemark_gff3,
                              'exonerate': merged_prot_gff3, 'gmap': trinity_path}
        # HERE WE RUN EVM; WE PREPARE FILES THAT ARE REQUIRED BY EVM LIKE
        # WEIGTH TABLE

        list_soft, pred_file, transcript_file, protein_file = inputEvm.group_EVM_inputs(evm_inputs_dir, evm_inputs)
        weight_file = inputEvm.evm_weight(evm_inputs_dir, weights_dic, list_soft, pasa_name, gmap_name)
        # EVM PIPELINE


        if args.short_reads or long_reads:  # WE HAVE SHORT READS AND PROTEINS
            evm_gff3 = evmPipeline.evm_pipeline(evm_output_dir, threads_use, ref_rename, weight_file, pred_file,
                                                transcript_file, protein_file, args.segmentSize, args.overlap_size,
                                                args.verbose)
            final_evm = grs.genename_evm(evm_gff3, args.verbose, evm_output_dir, dict_ref_name, args.upgrade)
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
            round_n += 1
            final_output = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb, ref_rename, trinity_out,
                                              final_evm, args.verbose)
            if long_reads == '':
                final_update_all = grs.genename_last(final_output, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "pasa")
                final_update_stats = evmPipeline.gff3_stats(final_update_all, pasa_dir)
                final_files.append(final_update_all)
                final_files.append(final_update_stats)
                if "command" not in (iprscan_log.decode("utf-8")) and args.interproscan:
                    annot, bad_models = iprscan.iprscan(masked_ref, final_update_all, interproscan_out_dir, args.threads)
                    final_files.append(annot)
                    final_files.append(bad_models)
                final_output_dir = os.path.join(output_dir, args.out_dir + '_output')
                logistic.check_create_dir(final_output_dir)
                for filename in final_files:
                    if filename != '':
                        logistic.copy_file(filename, final_output_dir)
                cmdstring = "chmod -R 775 %s" % wd
                os.system(cmdstring)
                now = datetime.datetime.now().strftime(fmtdate)
                sys.exit("#####LOREAN FINISHED WITHOUT USING LONG READS\t" + now + "\t. GOOD BYE.#####\n")

            else:
                final_keep = grs.genename_last(final_output, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "pasa")
                final_keep_stats = evmPipeline.gff3_stats(final_keep, pasa_dir)
                final_files.append(final_keep)
                final_files.append(final_keep_stats)
        elif not args.short_reads and not long_reads:  # WE HAVE PROTEINS BUT NOT SHORT READS
            transcript_file = ''
            evm_gff3 = evmPipeline.evm_pipeline(evm_output_dir, threads_use, ref_rename, weight_file, pred_file,
                                                transcript_file, protein_file, args.segmentSize, args.overlap_size,
                                                args.verbose)
            final_update_all = grs.genename_last(evm_gff3, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "pasa")
            final_update_stats = evmPipeline.gff3_stats(final_update_all, pasa_dir)
            final_files.append(final_update_all)
            final_files.append(final_update_stats)
            now = datetime.datetime.now().strftime(fmtdate)
            if "command" not in (iprscan_log.decode("utf-8")) and args.interproscan:
                annot, bad_models = iprscan.iprscan(masked_ref, final_update_all, interproscan_out_dir, args.threads)
                final_files.append(annot)
                final_files.append(bad_models)
            final_output_dir = os.path.join(output_dir, args.out_dir + '_output')
            logistic.check_create_dir(final_output_dir)
            for filename in final_files:
                if filename != '':
                    logistic.copy_file(filename, final_output_dir)
            cmdstring = "chmod -R 775 %s" % wd
            os.system(cmdstring)
            now = datetime.datetime.now().strftime(fmtdate)
            sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")
    else:
        final_evm = grs.genename_evm(args.upgrade, args.verbose, evm_output_dir, dict_ref_name, args.upgrade)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
        round_n += 1
        final_output = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb, ref_rename, trinity_out,
                                            final_evm, args.verbose)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(('###RUNNING iASSEMBLER\t' + now + '\t###\n'))

    if not long_sorted_bam:
        #print("line 430")
        long_fasta, stranded_value_new = mseq.filterLongReads(long_reads, args.assembly_overlap_length, args.max_long_read, gmap_wd,
                                          adapter_value, threads_use, args.adapter_match_score, ref_rename,
                                              args.max_intron_length, args.verbose, stranded_value)
        if stranded_value != stranded_value_new:
            stranded_value = stranded_value_new

        if args.minimap2:
            long_sam = mapping.minimap(ref_rename, long_fasta, threads_use, args.max_intron_length, gmap_wd, args.verbose)
        else:
            long_sam = mapping.gmap('sam', ref_rename, long_fasta, threads_use, 'samse',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=False)
        long_sorted_bam = mapping.sam_to_sorted_bam(long_sam, threads_use, wd, args.verbose)
        sam_orig_id = mapping.change_chr(long_sorted_bam, dict_ref_name, gmap_wd, threads_use, args.verbose, "long")
        final_files.append(sam_orig_id)

    # HERE WE MERGE THE GMAP OUTPUT WITH THE EVM OUTPUT TO HAVE ONE FILE
    # HERE WE CHECK IF WE HAVE THE PASA UPDATED FILE OR THE EVM
    # ORIGINAL FILE
    mergedmap_gff3 = logistic.catTwoBeds(long_sorted_bam, final_evm, args.verbose, consensus_wd)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("\t###GFFREAD\t" + now + "\t###\n"))

    # HERE WE TRANSFORM THE COODINATES INTO SEQUENCES USING THE
    # REFERENCE
    gffread_fasta_file = consensus.gffread(mergedmap_gff3, ref_rename, consensus_wd, args.verbose)
    # HERE WE STORE THE SEQUENCE IN A DICTIONARY

    gffread_dict = consensus.fasta2Dict(gffread_fasta_file)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("\t#CLUSTERING\t" + now + "\t###\n"))

    # HERE WE CLUSTER THE SEQUENCES BASED ON THE GENOME POSITION
    cluster_list = consensus.cluster_pipeline(mergedmap_gff3, stranded_value, args.verbose)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("\t#CONSENSUS FOR EACH CLUSTER\t" + now + "\t###\n"))
    # HERE WE MAKE CONSENSUS FOR EACH CLUSTER
    tmp_wd = consensus_wd + 'tmp/'
    logistic.check_create_dir(tmp_wd)
    tmp_assembly_file = tmp_wd + 'assembly.fasta'
    if os.path.isfile(tmp_assembly_file):
        sys.stdout.write('Assembly already exists; skipping the assembly step\n')
    else:
        consensus.generate_fasta(cluster_list, gffread_dict, args.cluster_min_evidence,
                                 args.cluster_max_evidence, args.assembly_overlap_length, stranded_value, tmp_wd)
        consensus.assembly(args.assembly_overlap_length, args.assembly_percent_identity, threads_use, tmp_wd,
                           args.verbose)
        utrs.lengthSupport(tmp_wd, threads_use)

    # WITH THE ELSE, WE ALLOW THE USER TO DECIDE TO CHANGE THE ASSEMBLY
    # PARAMETERS AND COLLECT DIFFERENT ASSEMBLED SEQUENCES WITHOUT RUNNING
    # THE FULL PIPELINE
    # HERE WE COLLECT THE ASSEMBLED SEQUENCES, KEEPING ONLY THOSE
    # THAT PASS THE FILTER
    tmp_consensus = os.path.join(consensus_wd, 'tmp/')
    collect.parse_only(args.assembly_read_threshold, tmp_consensus, args.verbose)
    tmp_assembly = collect.cat_assembled(tmp_consensus)
    tmp_assembly_all = collect.cat_assembled_all(tmp_consensus)
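    # tmp_assembly (the filtered assemblies) feeds the GMAP mapping below,
    # while tmp_assembly_all is merged with the Trinity output for the later
    # PASA update rounds.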
    # HERE WE COLLECT THE NEW ASSEMBLED SEQUENCES AND WE COLLECT THE OLD
    # EVM DATA
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("###MAPPING CONSENSUS ASSEMBLIES\t" + now + "\t###\n"))
    # HERE WE MAP ALL THE FASTA FILES TO THE GENOME USING GMAP
    consensus_mapped_gff3 = mapping.gmap('cons', ref_rename, tmp_assembly, threads_use, 'gff3_gene',
                                         args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                         args.verbose, Fflag=True)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("###GETTING THE STRAND RIGHT\t" + now + "\t###\n"))
    merged_gff3 = collect.add_EVM(final_output, gmap_wd, consensus_mapped_gff3)
    update2 = grs.exonerate(ref_rename, merged_gff3, threads_use, exonerate_wd, args.verbose)
    print(ref_rename, update2)
    update3_1 = grs.remove_redudant(ref_rename, update2)
    print(update3_1)
    update3 = grs.genename_lorean(update3_1, args.verbose, exonerate_wd)
    print(update3)
    # HERE WE COMBINE TRINITY OUTPUT AND THE ASSEMBLY OUTPUT TO RUN AGAIN
    # PASA TO CORRECT SMALL ERRORS
    sys.stdout.write(("###FIXING GENES NON STARTING WITH MET\t" + now + "\t###\n"))
    fasta_all = logistic.cat_two_fasta(trinity_out, tmp_assembly_all, long_fasta, pasa_dir)
    round_n += 1
    update5 = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb,  ref_rename, fasta_all,
                                   update3, args.verbose)
    if args.verbose:
        sys.stdout.write(update5)
    round_n += 1
    update6 = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb,  ref_rename, fasta_all,
                                   update5, args.verbose)
    if args.verbose:
        sys.stdout.write(update6)
    final_update_update = grs.genename_last(update6, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "lorean")
    final_files.append(final_update_update)
    final_update_stats = evmPipeline.gff3_stats(final_update_update, pasa_dir)
    final_files.append(final_update_stats)
    if "command" not in (iprscan_log.decode("utf-8")) and args.interproscan:
        annot, bad_models = iprscan.iprscan(masked_ref, final_update_update, interproscan_out_dir, args.threads)
        final_files.append(annot)
        final_files.append(bad_models)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(('###CREATING OUTPUT DIRECTORY\t' + now + '\t###\n'))
    final_output_dir = os.path.join(output_dir, args.out_dir + '_output')
    logistic.check_create_dir(final_output_dir)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("##PLACING OUTPUT FILES IN OUTPUT DIRECTORY\t" + now + "\t###\n"))
    for filename in final_files:
        if os.path.exists(filename):
            logistic.copy_file(filename, final_output_dir)
            cmdstring = "chmod -R 775 %s" % wd
            os.system(cmdstring)
    sys.exit("##### LOREAN FINISHED HERE. GOOD BYE. #####\n")
Example #8
def upgrade():
    '''Core of the program'''

    args = arguments.setting()
    fasta = (".fasta", ".fa", ".fas", ".fsta")
    fastq = (".fastq", ".fq")
    fmtdate = '%H:%M:%S %d-%m'
    root = os.getcwd()
    output_dir = os.path.join(root, "LoReAn_" + args.working_dir)
    logistic.check_create_dir(output_dir)
    wd = os.path.join(output_dir, "run/")
    if args.keep_tmp:
        logistic.check_create_dir(wd)
    elif not os.path.exists(wd) and args.verbose:
        logistic.check_create_dir(wd)
    else:
        temp_dir = tempfile.TemporaryDirectory(prefix='run_', dir=output_dir, suffix="/", )
        wd = temp_dir.name

    ref_orig = os.path.abspath(args.reference)
    ref = os.path.join(wd, args.reference)
    if not os.path.exists(ref):
        os.link(ref_orig, ref)

    max_threads = multiprocessing.cpu_count()
    if int(args.threads) > max_threads:
        threads_use = str(max_threads)
        sys.stdout.write(('\n### MAX NUMBER OF USED THREADS IS ' + str(max_threads) + ' AND NOT ' + args.threads + ' AS SET ###\n'))
    else:
        threads_use = args.threads

    final_files = []  # STORE THE IMPORTANT OUTPUT FILES

    logistic.check_create_dir(wd)
    logistic.check_file(ref)

    gmap_wd = wd + '/gmap_output/'
    exonerate_wd = wd + '/exonerate/'
    pasa_dir = wd + 'PASA/'
    star_out = wd + '/STAR/'
    trin_dir = wd + '/Trinity/'

    logistic.check_create_dir(trin_dir)
    logistic.check_create_dir(star_out)
    logistic.check_create_dir(pasa_dir)
    logistic.check_create_dir(gmap_wd)
    logistic.check_create_dir(exonerate_wd)
    if args.long_reads:
        consensus_wd = (wd + '/consensus/')
        logistic.check_create_dir(consensus_wd)

    logistic.check_gmap(threads_use, 'samse', args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd, args.verbose)

    if args.repeat_masked:
        genome_gmap = mseq.maskedgenome(gmap_wd, ref, args.repeat_masked)
    else:
        genome_gmap = ref

    if args.short_reads or args.long_reads:
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('\n###STAR MAPPING  STARTED AT:\t' + now + '\t###\n'))
        if args.short_reads.endswith(fastq):
            if ',' in args.short_reads:
                pairedEndFiles = args.short_reads.split(',')
                short_1 = os.path.abspath(pairedEndFiles[0])
                short_2 = os.path.abspath(pairedEndFiles[1])
                short_reads_file = [short_1, short_2]
            else:
                short_reads_file = os.path.abspath(args.short_reads)
            short_bam = mapping.star(ref, short_reads_file, threads_use, args.max_intron_length, star_out,
                                     args.verbose)
            short_sorted_bam = mapping.samtools_sort(short_bam, threads_use, wd, args.verbose)
            final_files.append(short_sorted_bam)
        # BAM SORTED FILES GET IN HERE
        elif args.short_reads.endswith("bam"):
            logistic.check_create_dir(star_out)
            short_sorted_bam = os.path.abspath(args.short_reads)
            bam_file = short_sorted_bam.split("/")
            short_bam = star_out + "/" + bam_file[-1]
            if not os.path.exists(short_bam):
                os.link(short_sorted_bam, short_bam)
        else:
            short_sorted_bam = False
            sys.stdout.write('No short reads file')
        if args.long_reads.endswith(fastq) or args.long_reads.endswith(fasta):
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(("\n###FILTERING OUT LONG READS STARTED AT:\t" + now + "\t###\n"))
            long_fasta = mseq.filterLongReads(args.long_reads, args.assembly_overlap_length,
                                                            args.max_long_read, gmap_wd, args.adapter, threads_use,
                                                            a=True)
            if not short_sorted_bam:
                # If short reads have been mapped dont do it
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(('\n###GMAP\t' + now + '\t###\n'))
                long_sam = mapping.gmap('sam', genome_gmap, long_fasta, threads_use, 'samse',
                                        args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                        args.verbose, Fflag=False)
                long_sorted_bam = mapping.sam_to_sorted_bam(long_sam, threads_use, wd, args.verbose)
                final_files.append(long_sorted_bam)
            else:
                long_sorted_bam = False
        else:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('\n###NO LONG READS FILE\t' + now + '\t###\n'))
            long_sorted_bam = False
        if short_sorted_bam:  # If there are short reads, these will serve to the transcript assembly pipeline
            default_bam = short_sorted_bam
        else:
            default_bam = long_sorted_bam
        # TRANSCRIPT ASSEMBLY
        # TRINITY
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('\n###TRINITY STARTS AT:\t' + now + '\t###\n'))
        if int(threads_use) > 1:
            trinity_cpu = int(int(threads_use) / int(2))
        else:
            trinity_cpu = int(threads_use)
        trinity_out = transcripts.trinity(default_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)

    else:
        sys.exit("### NO READS TO USE ###")

    if args.long_reads:
        if not long_sorted_bam:
            long_sam = mapping.gmap('sam', genome_gmap, long_fasta, threads_use, 'samse',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=False)
            long_sorted_bam = mapping.sam_to_sorted_bam(long_sam, threads_use, wd, args.verbose)
            final_files.append(long_sorted_bam)
        file_name = consensus_wd + 'mergedGmapEvm.beforeAssembly.gff3'

        mergedmapGFF3 = logistic.catTwoBeds(long_sorted_bam, args.upgrade, file_name, args.verbose)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n\t###GFFREAD\t" + now + "\t###\n"))

        gffread_fasta_file = consensus.gffread(mergedmapGFF3, ref, consensus_wd, args.verbose)
        # HERE WE STORE THE SEQUENCE IN A DICTIONARY
        fake = []  # placeholder passed in place of an adapter file below
        long_fasta = mseq.filterLongReads(gffread_fasta_file, args.assembly_overlap_length,
                                                        args.max_long_read, consensus_wd, fake, threads_use,
                                                        a=False)

        gffreadDict = consensus.fasta2Dict(gffread_fasta_file)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n\t#CLUSTERING\t" + now + "\t###\n"))

        cluster_list = consensus.cluster_pipeline(mergedmapGFF3, args.assembly_overlap_length, args.stranded, args.verbose)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n\t#CONSENSUS FOR EACH CLUSTER\t" + now + "\t###\n"))
        tmp_wd = consensus_wd + 'tmp/'
        logistic.check_create_dir(tmp_wd)
        tmp_assembly_file = tmp_wd + 'assembly.fasta'
        if os.path.isfile(tmp_assembly_file):
            sys.stdout.write('Assembly already exists; skipping the assembly step\n')
        else:
            consensus.generate_fasta(cluster_list, gffreadDict, args.cluster_min_evidence,
                                     args.cluster_max_evidence, args.assembly_overlap_length, tmp_wd)
            consensus.assembly(args.assembly_overlap_length, args.assembly_percent_identity, threads_use, tmp_wd,
                               args.verbose)
            utrs.lengthSupport(tmp_wd, threads_use)

        tmp_consensus = os.path.join(consensus_wd, 'tmp/')
        collect.parse_only(args.assembly_read_threshold, tmp_consensus, args.verbose)
        tmp_assembly = collect.cat_assembled(tmp_consensus)
        tmp_assembly_all = collect.cat_assembled_all(tmp_consensus)

        merged_fasta_filename = consensus_wd + 'assembly.wEVM.fasta'
        collect.add_EVM(gffread_fasta_file, tmp_assembly, merged_fasta_filename)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n###MAPPING CONSENSUS ASSEMBLIES\t" + now + "\t###\n"))
        consensus_mapped_gff3 = mapping.gmap('cons', genome_gmap, merged_fasta_filename, threads_use, 'gff3_gene',
                                             args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                             args.verbose, Fflag=True)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n###GETTING THE STRAND RIGHT\t" + now + "\t###\n"))
        strand_mapped_gff3 = grs.strand(args.upgrade, consensus_mapped_gff3, ref, threads_use, gmap_wd, args.verbose)
        gff_pasa = grs.appendID(strand_mapped_gff3)
        no_overl = grs.removeOverlap(gff_pasa, args.verbose)
        no_disc = grs.removeDiscrepancy(no_overl, args.upgrade, args.verbose)
        uniq_gene = grs.newNames(no_disc)

        finalupdate3 = grs.genename(uniq_gene, args.prefix_gene, args.verbose)
        print(("\n###FIXING GENES NON STARTING WITH MET\t" + now + "\t###\n"))
        finalupdate4 = grs.exonerate(ref, finalupdate3, threads_use, exonerate_wd, args.verbose)
        finalupdate5 = grs.genename(finalupdate4, args.prefix_gene, args.verbose)

        # HERE WE COMBINE TRINITY OUTPUT AND THE ASSEMBLY OUTPUT TO RUN AGAIN
        # PASA TO CORRECT SMALL ERRORS

        sys.stdout.write(("\n###FIXING GENES NON STARTING WITH MET\t" + now + "\t###\n"))
        round_n = 0
        fasta_all = logistic.cat_two_fasta(trinity_out, tmp_assembly_all, long_fasta, pasa_dir)
        round_n += 1
        pasa.create_pasa_database(pasa_dir, args.pasa_db, args.verbose)
        #align_pasa_conf = pasa.pasa_configuration(pasa_dir, args.pasa_db, args.verbose)
        finalupdate = pasa.update_database(threads_use, str(round_n), pasa_dir, args.pasa_db, ref,
                                           fasta_all, finalupdate5, args.verbose)
        round_n += 1
        finalupdate2 = pasa.update_database(threads_use, str(round_n), pasa_dir, args.pasa_db, ref,
                                            fasta_all, finalupdate, args.verbose)
        final_update = grs.genename(finalupdate2, args.prefix_gene, args.verbose)

        final_files.append(final_update)

        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('\n###CREATING OUTPUT DIRECTORY\t' + now + '\t###\n'))
        final_output_dir = os.path.join(output_dir, args.species + '_output')
        logistic.check_create_dir(final_output_dir)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n##PLACING OUTPUT FILES IN OUTPUT DIRECTORY\t" + now + "\t###\n"))

        for filename in final_files:
            if filename != '':
                logistic.copy_file(filename, final_output_dir)
                cmdstring = "chmod -R 775 %s" % wd
                os.system(cmdstring)
        if not args.keep_tmp:
            temp_dir.cleanup()

    sys.exit("##### LOREAN FINISHED HERE. GOOD BYE. #####\n")
Example #9
def main():

    home = os.path.expanduser("~")
    args = arguments.setting()

    if args.upgrade:
        update.upgrade()
    elif os.path.isfile(home + "/.gm_key") and args.proteins != "":
        fasta = (".fasta", ".fa", ".fas", ".fsta")
        fastq = (".fastq", ".fq")
        '''Core of the program'''
        # Parse the arguments

        fmtdate = '%H:%M:%S %d-%m'
        now = datetime.datetime.now().strftime(fmtdate)
        # Useful variables for later
        root = os.getcwd()

        output_dir = os.path.join(root, "LoReAn_" + args.working_dir)
        logistic.check_create_dir(output_dir)

        wd = os.path.join(output_dir, "run/")
        if args.keep_tmp:
            logistic.check_create_dir(wd)
        elif not os.path.exists(wd) and args.verbose:
            logistic.check_create_dir(wd)
        else:
            temp_dir = tempfile.TemporaryDirectory(
                prefix='run_',
                dir=output_dir,
                suffix="/",
            )
            wd = temp_dir.name

        ref_orig = os.path.abspath(args.reference)
        ref = os.path.join(wd, args.reference)
        if not os.path.exists(ref):
            os.link(ref_orig, ref)

        max_threads = multiprocessing.cpu_count()
        if int(args.threads) > max_threads:
            threads_use = str(max_threads)
            sys.stdout.write(
                ('\n### MAX NUMBER OF USED THREADS IS ' + str(max_threads) +
                 ' AND NOT ' + args.threads + ' AS SET ###\n'))
        else:
            threads_use = args.threads

        gmap_name = args.reference + '_GMAPindex'
        pasa_name = 'assembler-' + args.pasa_db

        if args.external:
            external_file = args.external
        else:
            external_file = ''

        if args.short_reads == '' and args.long_reads == '':
            if external_file.endswith("gff3") or external_file.endswith(fasta):
                weights_dic = {
                    'Augustus': args.augustus_weigth,
                    'GeneMark.hmm': args.genemark_weigth,
                    'AAT': args.AAT_weigth,
                    'external': args.external_weigth
                }
            else:
                weights_dic = {
                    'Augustus': args.augustus_weigth,
                    'GeneMark.hmm': args.genemark_weigth,
                    'AAT': args.AAT_weigth
                }
        elif args.short_reads != '' or args.long_reads != '':
            if external_file.endswith("gff3") or external_file.endswith(fasta):
                weights_dic = {
                    'Augustus': args.augustus_weigth,
                    pasa_name: args.pasa_weigth,
                    'GeneMark.hmm': args.genemark_weigth,
                    'AAT': args.AAT_weigth,
                    gmap_name: args.trinity_weigth,
                    'external': args.external_weigth
                }
            else:
                weights_dic = {
                    'Augustus': args.augustus_weigth,
                    pasa_name: args.pasa_weigth,
                    'GeneMark.hmm': args.genemark_weigth,
                    'AAT': args.AAT_weigth,
                    gmap_name: args.trinity_weigth
                }

        final_files = []  # STORE THE IMPORTANT OUTPUT FILES

        logistic.check_create_dir(wd)
        logistic.check_file(ref)

        gmap_wd = wd + '/gmap_output/'
        exonerate_wd = wd + '/exonerate/'
        pasa_dir = wd + 'PASA/'
        star_out = wd + '/STAR/'
        trin_dir = wd + '/Trinity/'
        evm_inputs_dir = wd + '/evm_inputs/'
        braker_folder = wd + '/braker/'
        evm_output_dir = wd + '/evm_output/'

        logistic.check_create_dir(evm_inputs_dir)
        logistic.check_create_dir(evm_output_dir)
        logistic.check_create_dir(trin_dir)
        logistic.check_create_dir(star_out)
        logistic.check_create_dir(pasa_dir)
        logistic.check_create_dir(gmap_wd)
        logistic.check_create_dir(exonerate_wd)
        if args.long_reads:
            consensus_wd = (wd + '/consensus/')
            logistic.check_create_dir(consensus_wd)

        logistic.check_gmap(threads_use, 'samse', args.min_intron_length,
                            args.max_intron_length, args.end_exon, gmap_wd,
                            args.verbose)

        check_species = 'augustus --species=help'
        process = subprocess.Popen(check_species,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        out_augustus, err_augustus = process.communicate()
        list_file = [
            os.path.join(home, o) for o in os.listdir(home)
            if os.path.isfile(os.path.join(home, o)) and ".bashrc" == o
        ]
        with open(list_file[0]) as bashrc:
            for path in bashrc:
                if "AUGUSTUS_CONFIG_PATH" in path:
                    augustus_specie_dir = path.split("=~")[1].rsplit()[0]
                    augustus_species = [
                        d for d in os.listdir(home + augustus_specie_dir +
                                              "species")
                    ]
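        # NOTE: this .bashrc scan assumes a line of the exact form
        # AUGUSTUS_CONFIG_PATH=~/<path>; when the variable is exported,
        # os.environ.get('AUGUSTUS_CONFIG_PATH') would be a sturdier lookup.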
        protein_loc = os.path.abspath(args.proteins)

        if args.repeat_masked:
            genome_gmap = mseq.maskedgenome(gmap_wd, ref, args.repeat_masked)
        else:
            genome_gmap = ref

        # COLLECT ONLY RUNS PART OF THE CONSENSUS PIPELINE
        list_fasta_names = multiple.single_fasta(ref, wd)
        if args.short_reads or args.long_reads:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(
                ('\n###STAR MAPPING  STARTED AT:\t' + now + '\t###\n'))
            # SHORT READS
            if args.short_reads.endswith(fastq):
                if ',' in args.short_reads:
                    pairedEndFiles = args.short_reads.split(',')
                    short_1 = os.path.abspath(pairedEndFiles[0])
                    short_2 = os.path.abspath(pairedEndFiles[1])
                    short_reads_file = [short_1, short_2]
                else:
                    short_reads_file = os.path.abspath(args.short_reads)
                # Map with STAR
                short_bam = mapping.star(ref, short_reads_file, threads_use,
                                         args.max_intron_length, star_out,
                                         args.verbose)
                short_sorted_bam = mapping.samtools_sort(
                    short_bam, threads_use, wd, args.verbose)
                # Keep the output
                final_files.append(short_sorted_bam)
            # BAM SORTED FILES GET IN HERE
            elif args.short_reads.endswith("bam"):
                logistic.check_create_dir(star_out)
                short_sorted_bam = os.path.abspath(args.short_reads)
                bam_file = short_sorted_bam.split("/")
                short_bam = star_out + "/" + bam_file[-1]
                if not os.path.exists(short_bam):
                    os.link(short_sorted_bam, short_bam)

            else:
                short_sorted_bam = False
                sys.stdout.write("\n\033[31m ### NO SHORT READS ### \033[0m\n")

            # LONG READS
            if 'fastq' in args.long_reads or 'fq' in args.long_reads or 'fasta' in args.long_reads or 'fa' in args.long_reads:
                # with this operation, reads are filtered for their length.
                # Nanopore reads can be chimeras or sequencing artefacts.
                # filtering on length reduces the amount of sequencing
                # artefacts
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(
                    ("\n###FILTERING OUT LONG READS STARTED AT:\t" + now +
                     "\t###\n"))
                long_fasta = mseq.filterLongReads(args.long_reads,
                                                  args.assembly_overlap_length,
                                                  args.max_long_read,
                                                  gmap_wd,
                                                  args.adapter,
                                                  threads_use,
                                                  a=True)
                if not short_sorted_bam:
                    # If short reads have already been mapped, skip mapping
                    # the long reads here
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('\n###GMAP\t' + now + '\t###\n'))
                    long_sam = mapping.gmap('sam',
                                            genome_gmap,
                                            long_fasta,
                                            threads_use,
                                            'samse',
                                            args.min_intron_length,
                                            args.max_intron_length,
                                            args.end_exon,
                                            gmap_wd,
                                            args.verbose,
                                            Fflag=False)
                    # Convert to sorted BAM
                    long_sorted_bam = mapping.sam_to_sorted_bam(
                        long_sam, threads_use, wd, args.verbose)

                    # Keep the output
                    final_files.append(long_sorted_bam)
                else:
                    long_sorted_bam = False

            else:
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(
                    ('\n###NO LONG READS FILE\t' + now + '\t###\n'))
                long_sorted_bam = False
            # Short reads, when present, drive the transcript assembly
            # pipeline
            if short_sorted_bam:
                default_bam = short_sorted_bam
            else:
                default_bam = long_sorted_bam
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('\n###TRINITY STARTS AT:\t' + now + '\t###\n'))
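            # Give Trinity half of the available threads (at least one)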
            if int(threads_use) > 1:
                trinity_cpu = int(int(threads_use) / int(2))
            else:
                trinity_cpu = int(threads_use)
            trinity_out = transcripts.trinity(default_bam, trin_dir,
                                              args.max_intron_length,
                                              trinity_cpu, args.verbose)
            trinity_gff3 = mapping.gmap('trin',
                                        genome_gmap,
                                        trinity_out,
                                        threads_use,
                                        'gff3_gene',
                                        args.min_intron_length,
                                        args.max_intron_length,
                                        args.end_exon,
                                        gmap_wd,
                                        args.verbose,
                                        Fflag=True)
            trinity_path = trinity_gff3

            # PASA Pipeline
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('\n###PASA STARTS AT:\t' + now + '\t###\n'))
            # Create PASA folder and configuration file
            #align_pasa_conf = pasa.pasa_configuration(pasa_dir, args.pasa_db, args.verbose)
            # Launch PASA
            pasa_gff3 = pasa.pasa_call(pasa_dir, args.pasa_db, ref,
                                       trinity_out, args.max_intron_length,
                                       threads_use, args.verbose)

            # HERE WE PARALLELIZE PROCESSES WHEN MULTIPLE THREADS ARE USED
            if (args.species in err_augustus.decode("utf-8")
                    or args.species in augustus_species):
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(
                    ('\n###AUGUSTUS, GENEMARK-ES AND AAT STARTED AT:\t' + now +
                     '\t###\n'))
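                # Producer/consumer fan-out: each queued integer selects one
                # tool inside handler.august_gmes_aat, one daemon worker runs
                # per job, and queue.join() blocks until every job has called
                # task_done()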
                queue = Queue()
                for software in range(3):
                    queue.put(software)  # ENQUEUE JOB IDS 0, 1 AND 2
                for software in range(3):
                    t = Thread(target=handler.august_gmes_aat,
                               args=(queue, ref, args.species, protein_loc,
                                     threads_use, args.fungus,
                                     list_fasta_names, wd, args.verbose))
                    t.daemon = True
                    t.start()
                queue.join()
                augustus_file = wd + 'augustus/augustus.gff'
                augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                genemark_file = wd + 'gmes/genemark.gtf'
                genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                merged_prot_gff3 = wd + 'AAT/protein_evidence.gff3'

            elif args.short_reads:  # USING PROTEINS AND SHORT READS
                logistic.check_create_dir(braker_folder)
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(
                    ('\n###BRAKER1 (USING SHORT READS) AND AAT STARTED AT:\t' +
                     now + '\t###\n'))
                queue = Queue()
                for software in range(2):
                    queue.put(software)  # ENQUEUE JOB IDS 0 AND 1
                for software in range(2):
                    t = Thread(target=handler.braker_aat,
                               args=(queue, ref, default_bam, args.species,
                                     protein_loc, threads_use, args.fungus,
                                     list_fasta_names, wd, braker_folder,
                                     args.verbose))
                    t.daemon = True
                    t.start()
                queue.join()
                augustus_file = braker_folder + 'augustus.gff'
                augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                genemark_file = braker_folder + 'GeneMark-ET/genemark.gtf'
                genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                merged_prot_gff3 = wd + 'AAT/protein_evidence.gff3'

            else:  # USING PROTEINS AND LONG READS
                queue = Queue()
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(
                    ('\n###BRAKER1 (USING LONG READS) AND AAT STARTED AT:\t' +
                     now + '\t###\n'))
                logistic.check_create_dir(braker_folder)
                for software in range(2):
                    queue.put(software)  # ENQUEUE JOB IDS 0 AND 1
                for software in range(2):
                    t = Thread(target=handler.braker_aat,
                               args=(queue, ref, long_sorted_bam,
                                     args.species, protein_loc,
                                     threads_use, args.fungus,
                                     list_fasta_names, wd, braker_folder,
                                     args.verbose))
                    t.daemon = True
                    t.start()
                queue.join()
                augustus_file = braker_folder + 'augustus.gff'
                augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                genemark_file = braker_folder + 'GeneMark-ET/genemark.gtf'
                genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                merged_prot_gff3 = wd + 'AAT/protein_evidence.gff3'
        elif (args.species in err_augustus.decode("utf-8")
              or args.species in augustus_species):
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(
                ('\n###AUGUSTUS, GENEMARK-ES AND AAT STARTED AT:\t' + now +
                 '\t###\n'))
            queue = Queue()
            for software in range(3):
                queue.put(software)  # ENQUEUE JOB IDS 0, 1 AND 2
            for software in range(3):
                t = Thread(target=handler.august_gmes_aat,
                           args=(queue, ref, args.species, protein_loc,
                                 threads_use, args.fungus,
                                 list_fasta_names, wd, args.verbose))
                t.daemon = True
                t.start()
            queue.join()
            augustus_file = wd + 'augustus/augustus.gff'
            augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
            genemark_file = wd + 'gmes/genemark.gtf'
            genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
            merged_prot_gff3 = wd + 'AAT/protein_evidence.gff3'
        else:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.exit("#####UNRECOGNIZED SPECIES FOR AUGUSTUS AND NO READS\t" +
                     now + "\t#####\n")
        # Prepare EVM input files
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('\n###EVM STARTED AT:\t' + now + '\t###\n'))
        # HERE WE CONVERT FILES FOR EVM AND PLACE THEM IN INPUT FOLDER

        if not args.short_reads and not args.long_reads:
            if external_file:
                if external_file.endswith(fasta):
                    external_file_gff3 = mapping.gmap('ext',
                                                      genome_gmap,
                                                      external_file,
                                                      threads_use,
                                                      'gff3_gene',
                                                      args.min_intron_length,
                                                      args.max_intron_length,
                                                      args.end_exon,
                                                      gmap_wd,
                                                      args.verbose,
                                                      Fflag=True)
                    external_file_changed = update.external(
                        external_file_gff3, gmap_wd, args.verbose)
                elif external_file.endswith("gff3"):
                    external_file_changed = update.external(
                        external_file, gmap_wd, args.verbose)
                evm_inputs = {
                    'augustus': augustus_gff3,
                    'genemark': genemark_gff3,
                    'AAT': merged_prot_gff3,
                    'external': external_file_changed
                }
            else:
                evm_inputs = {
                    'augustus': augustus_gff3,
                    'genemark': genemark_gff3,
                    'AAT': merged_prot_gff3
                }
        elif args.short_reads or args.long_reads:
            if args.external:
                external_file = args.external
                if external_file.endswith(fasta):
                    external_file_gff3 = mapping.gmap('ext',
                                                      genome_gmap,
                                                      external_file,
                                                      threads_use,
                                                      'gff3_gene',
                                                      args.min_intron_length,
                                                      args.max_intron_length,
                                                      args.end_exon,
                                                      gmap_wd,
                                                      args.verbose,
                                                      Fflag=True)
                    external_file_changed = update.external(
                        external_file_gff3, gmap_wd, args.verbose)
                elif external_file.endswith("gff3"):
                    external_file_changed = update.external(
                        external_file, gmap_wd, args.verbose)
                evm_inputs = {
                    'pasa': pasa_gff3,
                    'augustus': augustus_gff3,
                    'genemark': genemark_gff3,
                    'AAT': merged_prot_gff3,
                    'gmap': trinity_path,
                    'external': external_file_changed
                }
            else:
                evm_inputs = {
                    'pasa': pasa_gff3,
                    'augustus': augustus_gff3,
                    'genemark': genemark_gff3,
                    'AAT': merged_prot_gff3,
                    'gmap': trinity_path
                }

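        # evm_inputs maps each evidence source to its GFF3 file; these keys
        # select the entries of the weight table built below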
        # HERE WE RUN EVM; WE PREPARE THE FILES THAT EVM REQUIRES, SUCH AS THE
        # WEIGHT TABLE

        list_soft, pred_file, transcript_file, protein_file = inputEvm.group_EVM_inputs(
            evm_inputs_dir, evm_inputs)
        weight_file = inputEvm.evm_weight(evm_inputs_dir, weights_dic,
                                          list_soft, pasa_name, gmap_name)
        # EVM PIPELINE

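        # segmentSize and overlap_size control how EVM partitions the genome
        # into overlapping chunks that can run in parallel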
        if args.short_reads or args.long_reads:  # WE HAVE READS AND PROTEINS
            evm_gff3, gff3_stat_file = evmPipeline.evm_pipeline(
                evm_output_dir, threads_use, genome_gmap, weight_file,
                pred_file, transcript_file, protein_file, args.segmentSize,
                args.overlap_size, args.verbose)
        elif not args.short_reads and not args.long_reads:  # WE HAVE PROTEINS BUT NO READS
            transcript_file = ''
            evm_gff3, gff3_stat_file = evmPipeline.evm_pipeline(
                evm_output_dir, threads_use, genome_gmap, weight_file,
                pred_file, transcript_file, protein_file, args.segmentSize,
                args.overlap_size, args.verbose)
        # KEEP THIS OUTPUT
        final_files.append(evm_gff3)
        final_files.append(gff3_stat_file)

        round_n = 1

        if not args.short_reads and not args.long_reads:
            last_gff3 = grs.newNames(evm_gff3)
            #score_gff3 = score.score(last_gff3, evm_inputs)
            now = datetime.datetime.now().strftime(fmtdate)
            sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")

        else:
            #if args.short_reads and not args.long_reads:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('\n###UPDATE WITH PASA DATABASE STARTED AT:\t' +
                              now + '\t###\n'))
            round_n += 1
            finalOutput = pasa.update_database(threads_use, str(round_n),
                                               pasa_dir, args.pasa_db, ref,
                                               trinity_out, evm_gff3,
                                               args.verbose)
            final_update = grs.genename(finalOutput, args.prefix_gene,
                                        args.verbose)
            updatedGff3 = grs.newNames(final_update)
            #score_gff3 = score.score(updatedGff3, evm_inputs)
            final_files.append(updatedGff3)
        #else:
        #updatedGff3 = evm_gff3

        #score_gff3 = score.score(evm_gff3, evm_inputs)

        if args.long_reads == '':
            final_output_dir = wd + 'output/'
            logistic.check_create_dir(final_output_dir)
            for filename in final_files:
                if filename != '':
                    logistic.copy_file(filename, final_output_dir)
            cmdstring = "chmod -R 775 %s" % wd
            os.system(cmdstring)
            now = datetime.datetime.now().strftime(fmtdate)
            sys.exit("#####LOREAN FINISHED WITHOUT USING LONG READS\t" + now +
                     "\t. GOOD BYE.#####\n")

        else:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('\n###RUNNING iASSEMBLER\t' + now + '\t###\n'))

            if args.long_reads:
                # Means there are long reads to map and user wants to run
                # this pipeline
                if not long_sorted_bam:
                    long_sam = mapping.gmap('sam',
                                            genome_gmap,
                                            long_fasta,
                                            threads_use,
                                            'samse',
                                            args.min_intron_length,
                                            args.max_intron_length,
                                            args.end_exon,
                                            gmap_wd,
                                            args.verbose,
                                            Fflag=False)
                    long_sorted_bam = mapping.sam_to_sorted_bam(
                        long_sam, threads_use, wd, args.verbose)
                    final_files.append(long_sorted_bam)

                # HERE WE MERGE THE GMAP OUTPUT WITH THE EVM OUTPUT TO HAVE
                # ONE FILE
                fileName = consensus_wd + 'mergedGmapEvm.beforeAssembly.gff3'
                # HERE WE CHECK IF WE HAVE THE PASA UPDATED FILE OR THE EVM
                # ORIGINAL FILE
                if os.path.isfile(updatedGff3):
                    # HERE WE MERGE THE TWO FILES
                    mergedmapGFF3 = logistic.catTwoBeds(
                        long_sorted_bam, updatedGff3, fileName, args.verbose)
                else:
                    mergedmapGFF3 = logistic.catTwoBeds(
                        long_sorted_bam, evm_gff3, fileName, args.verbose)
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(("\n\t###GFFREAD\t" + now + "\t###\n"))

                # HERE WE TRANSFORM THE COORDINATES INTO SEQUENCES USING THE
                # REFERENCE
                gffread_fasta_file = consensus.gffread(mergedmapGFF3, ref,
                                                       consensus_wd,
                                                       args.verbose)
                fake = []  # empty placeholder for the adapter argument
                long_fasta = mseq.filterLongReads(gffread_fasta_file,
                                                  args.assembly_overlap_length,
                                                  args.max_long_read,
                                                  consensus_wd,
                                                  fake,
                                                  threads_use,
                                                  a=False)

                # HERE WE STORE THE SEQUENCES IN A DICTIONARY
                gffreadDict = consensus.fasta2Dict(gffread_fasta_file)
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(("\n\t#CLUSTERING\t" + now + "\t###\n"))

                # HERE WE CLUSTER THE SEQUENCES BASED ON THE GENOME
                # POSITION
                cluster_list = consensus.cluster_pipeline(
                    mergedmapGFF3, args.assembly_overlap_length, args.stranded,
                    args.verbose)
                now = datetime.datetime.now().strftime(fmtdate)

                sys.stdout.write(
                    ("\n\t#CONSENSUS FOR EACH CLUSTER\t" + now + "\t###\n"))

                # HERE WE MAKE CONSENSUS FOR EACH CLUSTER
                tmp_wd = consensus_wd + 'tmp/'
                logistic.check_create_dir(tmp_wd)
                tmp_assembly_file = tmp_wd + 'assembly.fasta'
                if os.path.isfile(tmp_assembly_file):
                    sys.stdout.write('Assembly already present; skipping\n')
                else:
                    consensus.generate_fasta(cluster_list, gffreadDict,
                                             args.cluster_min_evidence,
                                             args.cluster_max_evidence,
                                             args.assembly_overlap_length,
                                             tmp_wd)
                    consensus.assembly(args.assembly_overlap_length,
                                       args.assembly_percent_identity,
                                       threads_use, tmp_wd, args.verbose)
                    utrs.lengthSupport(tmp_wd, threads_use)

        # WITH THE ELSE, WE ALLOW THE USER TO CHANGE THE ASSEMBLY PARAMETERS
        # AND COLLECT DIFFERENTLY ASSEMBLED SEQUENCES WITHOUT RUNNING THE FULL
        # PIPELINE.
        # HERE WE COLLECT THE ASSEMBLED SEQUENCES; ONLY SEQUENCES THAT PASS
        # THE FILTER ARE KEPT
        tmp_consensus = os.path.join(consensus_wd, 'tmp/')
        collect.parse_only(args.assembly_read_threshold, tmp_consensus,
                           args.verbose)
        tmp_assembly = collect.cat_assembled(tmp_consensus)
        tmp_assembly_all = collect.cat_assembled_all(tmp_consensus)
        # HERE WE COLLECT THE NEW ASSEMBLED SEQUENCES AND WE COLLECT THE OLD
        # EVM DATA
        merged_fasta_filename = consensus_wd + 'assembly.wEVM.fasta'
        collect.add_EVM(gffread_fasta_file, tmp_assembly,
                        merged_fasta_filename)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(
            ("\n###MAPPING CONSENSUS ASSEMBLIES\t" + now + "\t###\n"))

        # HERE WE MAP ALL THE FASTA FILES TO THE GENOME USING GMAP
        consensus_mapped_gff3 = mapping.gmap('cons',
                                             genome_gmap,
                                             merged_fasta_filename,
                                             threads_use,
                                             'gff3_gene',
                                             args.min_intron_length,
                                             args.max_intron_length,
                                             args.end_exon,
                                             gmap_wd,
                                             args.verbose,
                                             Fflag=True)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n###GETTING THE STRAND RIGHT\t" + now + "\t###\n"))

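        # Post-processing chain: fix the strand against the EVM models, make
        # the IDs unique, drop overlapping and discrepant models, and rename
        # the genes with the user prefix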
        strand_mapped_gff3 = grs.strand(evm_gff3, consensus_mapped_gff3, ref,
                                        threads_use, gmap_wd, args.verbose)
        gff_pasa = grs.appendID(strand_mapped_gff3)
        no_overl = grs.removeOverlap(gff_pasa, args.verbose)
        no_disc = grs.removeDiscrepancy(no_overl, evm_gff3, args.verbose)
        uniq_gene = grs.newNames(no_disc)

        finalupdate3 = grs.genename(uniq_gene, args.prefix_gene, args.verbose)
        sys.stdout.write(
            ("\n###FIXING GENES NOT STARTING WITH MET\t" + now + "\t###\n"))
        finalupdate4 = grs.exonerate(ref, finalupdate3, threads_use,
                                     exonerate_wd, args.verbose)
        finalupdate5 = grs.genename(finalupdate4, args.prefix_gene,
                                    args.verbose)

        # HERE WE COMBINE TRINITY OUTPUT AND THE ASSEMBLY OUTPUT TO RUN AGAIN
        # PASA TO CORRECT SMALL ERRORS

        sys.stdout.write(
            ("\n###RUNNING PASA UPDATE ROUNDS\t" + now + "\t###\n"))

        fasta_all = logistic.cat_two_fasta(trinity_out, tmp_assembly_all,
                                           long_fasta, pasa_dir)
        round_n += 1

        finalupdate = pasa.update_database(threads_use, str(round_n), pasa_dir,
                                           args.pasa_db, ref, fasta_all,
                                           finalupdate5, args.verbose)
        round_n += 1
        finalupdate2 = pasa.update_database(threads_use, str(round_n),
                                            pasa_dir, args.pasa_db, ref,
                                            fasta_all, finalupdate,
                                            args.verbose)
        final_update = grs.genename(finalupdate2, args.prefix_gene,
                                    args.verbose)
        #score_gff3 = score.score(final_update, evm_inputs)

        final_files.append(final_update)

        final_update_stats = evmPipeline.gff3_stats(final_update, pasa_dir)
        final_files.append(final_update_stats)

        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(
            ('\n###CREATING OUTPUT DIRECTORY\t' + now + '\t###\n'))

        final_output_dir = os.path.join(output_dir, args.species + '_output')

        logistic.check_create_dir(final_output_dir)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n##PLACING OUTPUT FILES IN OUTPUT DIRECTORY\t" +
                          now + "\t###\n"))

        for filename in final_files:
            if filename != '':
                logistic.copy_file(filename, final_output_dir)
        cmdstring = "chmod -R 775 %s" % wd
        os.system(cmdstring)
        if not args.keep_tmp:
            temp_dir.cleanup()
        sys.exit("##### LOREAN FINISHED HERE. GOOD BYE. #####\n")
    else:
        sys.exit(
            "#####LOREAN STOPS HERE. CHECK THAT THE PROTEIN AND SPECIES OPTIONS BOTH HAVE AN ARGUMENT AND THAT THE gm_key IS IN THE FOLDER#####\n"
        )
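
# A minimal, self-contained sketch of the Queue/Thread fan-out used above by
# handler.august_gmes_aat and handler.braker_aat; run_tool_a and run_tool_b
# are illustrative stand-ins, not LoReAn functions.
from queue import Queue
from threading import Thread


def run_tool_a():
    print('tool A done')


def run_tool_b():
    print('tool B done')


def worker(queue):
    while True:
        job = queue.get()
        if job == 0:
            run_tool_a()
        if job == 1:
            run_tool_b()
        queue.task_done()


if __name__ == '__main__':
    jobs = Queue()
    for job_id in range(2):
        jobs.put(job_id)  # enqueue the job ids first
    for _ in range(2):
        t = Thread(target=worker, args=(jobs,))
        t.daemon = True  # workers must not block interpreter exit
        t.start()
    jobs.join()  # returns once task_done() has run for every job
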
Example #10
0
def upgrade():
    '''Upgrades an existing annotation using long reads'''

    args = arguments.setting()
    fasta = (".fasta", ".fa", ".fas", ".fsta")
    fastq = (".fastq", ".fq")
    fmtdate = '%H:%M:%S %d-%m'
    root = os.getcwd()
    output_dir = os.path.join(root, "LoReAn_" + args.working_dir)
    logistic.check_create_dir(output_dir)
    wd = os.path.join(output_dir, "run/")
    temp_dir = None
    if args.keep_tmp:
        logistic.check_create_dir(wd)
    elif not os.path.exists(wd) and args.verbose:
        logistic.check_create_dir(wd)
    else:
        temp_dir = tempfile.TemporaryDirectory(
            prefix='run_',
            dir=output_dir,
            suffix="/",
        )
        wd = temp_dir.name

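    # Hard-link (not copy) the genome into the working directory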
    ref_orig = os.path.abspath(args.reference)
    ref = os.path.join(wd, os.path.basename(args.reference))
    if not os.path.exists(ref):
        os.link(ref_orig, ref)

    max_threads = multiprocessing.cpu_count()
    if int(args.threads) > max_threads:
        threads_use = str(max_threads)
        sys.stdout.write(
            ('\n### MAX NUMBER OF USED THREADS IS ' + str(max_threads) +
             ' AND NOT ' + args.threads + ' AS SET ###\n'))
    else:
        threads_use = args.threads

    final_files = []  # STORE THE IMPORTANT OUTPUT FILES

    logistic.check_create_dir(wd)
    logistic.check_file(ref)

    gmap_wd = wd + 'gmap_output/'
    exonerate_wd = wd + 'exonerate/'
    pasa_dir = wd + 'PASA/'
    star_out = wd + 'STAR/'
    trin_dir = wd + 'Trinity/'

    logistic.check_create_dir(trin_dir)
    logistic.check_create_dir(star_out)
    logistic.check_create_dir(pasa_dir)
    logistic.check_create_dir(gmap_wd)
    logistic.check_create_dir(exonerate_wd)
    if args.long_reads:
        consensus_wd = wd + 'consensus/'
        logistic.check_create_dir(consensus_wd)

    logistic.check_gmap(threads_use, 'samse', args.min_intron_length,
                        args.max_intron_length, args.end_exon, gmap_wd,
                        args.verbose)

    if args.repeat_masked:
        genome_gmap = mseq.maskedgenome(gmap_wd, ref, args.repeat_masked)
    else:
        genome_gmap = ref

    if args.short_reads or args.long_reads:
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(
            ('\n###STAR MAPPING STARTED AT:\t' + now + '\t###\n'))
        if args.short_reads.endswith(fastq):
            if ',' in args.short_reads:
                pairedEndFiles = args.short_reads.split(',')
                short_1 = os.path.abspath(pairedEndFiles[0])
                short_2 = os.path.abspath(pairedEndFiles[1])
                short_reads_file = [short_1, short_2]
            else:
                short_reads_file = os.path.abspath(args.short_reads)
            short_bam = mapping.star(ref, short_reads_file, threads_use,
                                     args.max_intron_length, star_out,
                                     args.verbose)
            short_sorted_bam = mapping.samtools_sort(short_bam, threads_use,
                                                     wd, args.verbose)
            final_files.append(short_sorted_bam)
        # BAM SORTED FILES GET IN HERE
        elif args.short_reads.endswith("bam"):
            logistic.check_create_dir(star_out)
            short_sorted_bam = os.path.abspath(args.short_reads)
            bam_file = short_sorted_bam.split("/")
            short_bam = star_out + "/" + bam_file[-1]
            if not os.path.exists(short_bam):
                os.link(short_sorted_bam, short_bam)
        else:
            short_sorted_bam = False
            sys.stdout.write('No short reads file\n')
        if args.long_reads.endswith(fastq) or args.long_reads.endswith(fasta):
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(("\n###FILTERING OUT LONG READS STARTED AT:\t" +
                              now + "\t###\n"))
            long_fasta = mseq.filterLongReads(args.long_reads,
                                              args.assembly_overlap_length,
                                              args.max_long_read,
                                              gmap_wd,
                                              args.adapter,
                                              threads_use,
                                              a=True)
            if not short_sorted_bam:
                # If short reads have already been mapped, skip mapping the
                # long reads here
                now = datetime.datetime.now().strftime(fmtdate)
                sys.stdout.write(('\n###GMAP\t' + now + '\t###\n'))
                long_sam = mapping.gmap('sam',
                                        genome_gmap,
                                        long_fasta,
                                        threads_use,
                                        'samse',
                                        args.min_intron_length,
                                        args.max_intron_length,
                                        args.end_exon,
                                        gmap_wd,
                                        args.verbose,
                                        Fflag=False)
                long_sorted_bam = mapping.sam_to_sorted_bam(
                    long_sam, threads_use, wd, args.verbose)
                final_files.append(long_sorted_bam)
            else:
                long_sorted_bam = False
        else:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('\n###NO LONG READS FILE\t' + now + '\t###\n'))
            long_sorted_bam = False
        # Short reads, when present, drive the transcript assembly pipeline
        if short_sorted_bam:
            default_bam = short_sorted_bam
        else:
            default_bam = long_sorted_bam
        # TRANSCRIPT ASSEMBLY
        # TRINITY
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('\n###TRINITY STARTS AT:\t' + now + '\t###\n'))
        if int(threads_use) > 1:
            trinity_cpu = int(int(threads_use) / int(2))
        else:
            trinity_cpu = int(threads_use)
        trinity_out = transcripts.trinity(default_bam, trin_dir,
                                          args.max_intron_length, trinity_cpu,
                                          args.verbose)

    else:
        sys.exit("### NO READS TO USE ###")

    if args.long_reads:
        if not long_sorted_bam:
            long_sam = mapping.gmap('sam',
                                    genome_gmap,
                                    long_fasta,
                                    threads_use,
                                    'samse',
                                    args.min_intron_length,
                                    args.max_intron_length,
                                    args.end_exon,
                                    gmap_wd,
                                    args.verbose,
                                    Fflag=False)
            long_sorted_bam = mapping.sam_to_sorted_bam(
                long_sam, threads_use, wd, args.verbose)
            final_files.append(long_sorted_bam)
        file_name = consensus_wd + 'mergedGmapEvm.beforeAssembly.gff3'

        mergedmapGFF3 = logistic.catTwoBeds(long_sorted_bam, args.upgrade,
                                            file_name, args.verbose)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n\t###GFFREAD\t" + now + "\t###\n"))

        gffread_fasta_file = consensus.gffread(mergedmapGFF3, ref,
                                               consensus_wd, args.verbose)
        fake = []  # empty placeholder for the adapter argument
        long_fasta = mseq.filterLongReads(gffread_fasta_file,
                                          args.assembly_overlap_length,
                                          args.max_long_read,
                                          consensus_wd,
                                          fake,
                                          threads_use,
                                          a=False)

        # HERE WE STORE THE SEQUENCES IN A DICTIONARY
        gffreadDict = consensus.fasta2Dict(gffread_fasta_file)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n\t#CLUSTERING\t" + now + "\t###\n"))

        cluster_list = consensus.cluster_pipeline(mergedmapGFF3,
                                                  args.assembly_overlap_length,
                                                  args.stranded, args.verbose)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(
            ("\n\t#CONSENSUS FOR EACH CLUSTER\t" + now + "\t###\n"))
        tmp_wd = consensus_wd + 'tmp/'
        logistic.check_create_dir(tmp_wd)
        tmp_assembly_file = tmp_wd + 'assembly.fasta'
        if os.path.isfile(tmp_assembly_file):
            sys.stdout.write('Assembly already present; skipping\n')
        else:
            consensus.generate_fasta(cluster_list, gffreadDict,
                                     args.cluster_min_evidence,
                                     args.cluster_max_evidence,
                                     args.assembly_overlap_length, tmp_wd)
            consensus.assembly(args.assembly_overlap_length,
                               args.assembly_percent_identity, threads_use,
                               tmp_wd, args.verbose)
            utrs.lengthSupport(tmp_wd, threads_use)

        tmp_consensus = os.path.join(consensus_wd, 'tmp/')
        collect.parse_only(args.assembly_read_threshold, tmp_consensus,
                           args.verbose)
        tmp_assembly = collect.cat_assembled(tmp_consensus)
        tmp_assembly_all = collect.cat_assembled_all(tmp_consensus)

        merged_fasta_filename = consensus_wd + 'assembly.wEVM.fasta'
        collect.add_EVM(gffread_fasta_file, tmp_assembly,
                        merged_fasta_filename)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(
            ("\n###MAPPING CONSENSUS ASSEMBLIES\t" + now + "\t###\n"))
        consensus_mapped_gff3 = mapping.gmap('cons',
                                             genome_gmap,
                                             merged_fasta_filename,
                                             threads_use,
                                             'gff3_gene',
                                             args.min_intron_length,
                                             args.max_intron_length,
                                             args.end_exon,
                                             gmap_wd,
                                             args.verbose,
                                             Fflag=True)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n###GETTING THE STRAND RIGHT\t" + now + "\t###\n"))
        strand_mapped_gff3 = grs.strand(args.upgrade, consensus_mapped_gff3,
                                        ref, threads_use, gmap_wd,
                                        args.verbose)
        gff_pasa = grs.appendID(strand_mapped_gff3)
        no_overl = grs.removeOverlap(gff_pasa, args.verbose)
        no_disc = grs.removeDiscrepancy(no_overl, args.upgrade, args.verbose)
        uniq_gene = grs.newNames(no_disc)

        finalupdate3 = grs.genename(uniq_gene, args.prefix_gene, args.verbose)
        sys.stdout.write(
            ("\n###FIXING GENES NOT STARTING WITH MET\t" + now + "\t###\n"))
        finalupdate4 = grs.exonerate(ref, finalupdate3, threads_use,
                                     exonerate_wd, args.verbose)
        finalupdate5 = grs.genename(finalupdate4, args.prefix_gene,
                                    args.verbose)

        # HERE WE COMBINE TRINITY OUTPUT AND THE ASSEMBLY OUTPUT TO RUN AGAIN
        # PASA TO CORRECT SMALL ERRORS

        sys.stdout.write(
            ("\n###RUNNING PASA UPDATE ROUNDS\t" + now + "\t###\n"))
        round_n = 0
        fasta_all = logistic.cat_two_fasta(trinity_out, tmp_assembly_all,
                                           long_fasta, pasa_dir)
        round_n += 1
        pasa.create_pasa_database(pasa_dir, args.pasa_db, args.verbose)
        #align_pasa_conf = pasa.pasa_configuration(pasa_dir, args.pasa_db, args.verbose)
        finalupdate = pasa.update_database(threads_use, str(round_n), pasa_dir,
                                           args.pasa_db, ref, fasta_all,
                                           finalupdate5, args.verbose)
        round_n += 1
        finalupdate2 = pasa.update_database(threads_use, str(round_n),
                                            pasa_dir, args.pasa_db, ref,
                                            fasta_all, finalupdate,
                                            args.verbose)
        final_update = grs.genename(finalupdate2, args.prefix_gene,
                                    args.verbose)

        final_files.append(final_update)

        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(
            ('\n###CREATING OUTPUT DIRECTORY\t' + now + '\t###\n'))
        final_output_dir = os.path.join(output_dir, args.species + '_output')
        logistic.check_create_dir(final_output_dir)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(("\n##PLACING OUTPUT FILES IN OUTPUT DIRECTORY\t" +
                          now + "\t###\n"))

        for filename in final_files:
            if filename != '':
                logistic.copy_file(filename, final_output_dir)
        cmdstring = "chmod -R 775 %s" % wd
        os.system(cmdstring)
        if not args.keep_tmp and temp_dir is not None:
            temp_dir.cleanup()

    sys.exit("##### LOREAN FINISHED HERE. GOOD BYE. #####\n")
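
# A minimal sketch of the scratch-directory lifecycle used in upgrade(): the
# run directory lives under output_dir and is removed at the end unless the
# caller asks to keep it. keep_tmp stands in for args.keep_tmp; the pipeline
# body is elided.
import os
import tempfile


def run_in_scratch(output_dir, keep_tmp=False):
    os.makedirs(output_dir, exist_ok=True)
    temp_dir = None
    if keep_tmp:
        wd = os.path.join(output_dir, 'run/')
        os.makedirs(wd, exist_ok=True)
    else:
        temp_dir = tempfile.TemporaryDirectory(prefix='run_', dir=output_dir)
        wd = temp_dir.name
    # ... the pipeline would write its intermediate files into wd here ...
    if temp_dir is not None:
        temp_dir.cleanup()  # removes wd and everything inside it
    return wd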