示例#1
0
def _miRNA_reference_file(user_file, database, ftp_site, file_name,
                          header_msg, missing_msg, provided_msg, debug):
    """Resolve one miRBase reference file (gff3, hairpin, mature or str).

    When ``user_file`` is set, its absolute path is returned. Otherwise the
    file is requested through functions.main_functions.urllib_request(),
    called with the ``database`` folder, ``ftp_site`` and ``file_name``, and
    whatever that helper returns is handed back.

    ``header_msg``, ``missing_msg`` and ``provided_msg`` are the messages
    printed for this file; ``missing_msg`` is only shown when ``debug`` is on.
    """
    if user_file:
        print(provided_msg)
        return os.path.abspath(user_file)

    print(header_msg)
    if debug:
        print(colored(missing_msg, 'yellow'))
    print(colored("\t** Download it form miRBase", 'green'))
    return functions.main_functions.urllib_request(database, ftp_site,
                                                   file_name, debug)


def run_miRNA(options):
    """Main function of the miRNA module.

    Retrieves the sample files from ``options.input``, makes sure the miRBase
    reference files are available (user provided or downloaded), submits one
    miRNA_analysis() job per sample to a thread pool and finally calls
    generate_DE.generate_DE() on the collected results.

    ``options`` carries the command-line settings; the attributes ``pair``,
    ``project``, ``database``, ``miRNA_gff``, ``hairpinFasta``,
    ``matureFasta`` and ``miRBase_str`` are overwritten here. The module
    globals ``Debug`` and ``results_df`` are (re)set as a side effect.

    Returns an empty tuple. Exits the interpreter after printing a help
    message when any of the help options is set.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if options.help_format:
        ## help_format option: stop after showing it, as the other help
        ## options (and the prep module) do, instead of running the analysis
        help_XICRA.help_fastq_format()
        exit()
    elif options.help_project:
        ## information for project
        help_XICRA.project_help()
        exit()
    elif options.help_miRNA:
        ## information for miRNA analysis
        help_XICRA.help_miRNA()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("miRNA analysis")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if options.detached:
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## user software selection
    print("+ Software for miRNA analysis selected:")
    print(options.soft_name)

    ## get files
    print('+ Getting files from input folder... ')
    paired_end = options.pair
    ## set paired-end to false for further preprocessing: files are
    ## retrieved one per sample from here on
    options.pair = False

    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq",
            ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
    elif paired_end:
        print('+ Mode: join.\n+ Extension: ')
        print("[_joined.fastq]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "join", ['_joined.fastq'], options.debug)
    else:
        ## single-end reads are never joined: trimmed files are retrieved,
        ## so report the mode that is actually used
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if Debug:
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## Additional sRNAbench or miRTop options

    ## species
    print("+ Species provided:", options.species)

    ############################################################
    ## miRNA information: hairpin, mature, str, gff3
    ############################################################
    if options.database:
        options.database = os.path.abspath(options.database)
    else:
        ## default: "db_files" folder next to this source file
        install_path = os.path.dirname(os.path.realpath(__file__))
        options.database = os.path.join(install_path, "db_files")

    print("+ Create folder to store results: ", options.database)
    functions.files_functions.create_folder(options.database)

    mirbase_ftp = "ftp://mirbase.org/pub/mirbase/CURRENT/"

    ## miRNA_gff: can be set as automatic to download from miRBase
    if options.miRNA_gff:
        ## unused by the helper: a local file was provided
        gff_name, gff_site = "", ""
    else:
        ## the species tag is only needed to build the remote file name
        gff_name = options.species + ".gff3"
        gff_site = mirbase_ftp + "genomes/" + gff_name
    options.miRNA_gff = _miRNA_reference_file(
        options.miRNA_gff, options.database, gff_site, gff_name,
        "+ File miRNA gff3 annotation",
        "\t** ATTENTION: No miRNA gff file provided",
        "+ miRNA gff file provided", Debug)

    ## hairpin: can be set as automatic to download from miRBase
    options.hairpinFasta = _miRNA_reference_file(
        options.hairpinFasta, options.database,
        mirbase_ftp + "hairpin.fa.gz", "hairpin.fa.gz",
        "+ File hairpin fasta",
        "\t** ATTENTION: No hairpin fasta file provided",
        "+ hairpin fasta file provided", Debug)

    ## mature: can be set as automatic to download from miRBase
    options.matureFasta = _miRNA_reference_file(
        options.matureFasta, options.database,
        mirbase_ftp + "mature.fa.gz", "mature.fa.gz",
        "+ File mature fasta",
        "\t** ATTENTION: No mature miRNA fasta file provided",
        "+ mature fasta file provided", Debug)

    ## miRBase str: can be set as automatic to download from miRBase
    options.miRBase_str = _miRNA_reference_file(
        options.miRBase_str, options.database,
        mirbase_ftp + "miRNA.str.gz", "miRNA.str.gz",
        "+ File miRBase str annotation",
        "\t** ATTENTION: No miRBase_str file provided",
        "+ miRBase_str file provided", Debug)
    ############################################################

    ## generate output folder, if necessary
    if not options.project:
        print("\n+ Create output folder(s):")
        functions.files_functions.create_folder(outdir)

    ## for samples
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "miRNA", options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = functions.main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0: keep at least one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if Debug:
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Create a miRNA analysis for each sample retrieved...")

    ## call miRNA_analysis:
    ## Get user software selection: sRNAbench, optimir, ...
    ## Standarize using miRTop

    ## results table shared at module level
    ## NOTE(review): presumably filled in by miRNA_analysis() -- confirm
    global results_df
    results_df = pd.DataFrame(columns=("name", "soft", "filename"))

    # Group dataframe by sample name. A scalar key (not a one-item list)
    # makes each group name a plain string in every pandas version, so it can
    # be used directly as a key of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(miRNA_analysis, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job,
                            options.miRNA_gff, options.soft_name,
                            options.matureFasta, options.hairpinFasta,
                            options.miRBase_str, options.species,
                            options.database, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## report the failing sample and keep processing the others
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ miRNA analysis is finished...")
    print("+ Let's summarize all results...")

    ## outdir
    outdir_report = functions.files_functions.create_subfolder(
        "report", outdir)
    expression_folder = functions.files_functions.create_subfolder(
        "miRNA", outdir_report)

    ## debugging messages
    if options.debug:
        print(results_df)

    ## merge all parse gtf files created
    print("+ Summarize miRNA analysis for all samples...")
    generate_DE.generate_DE(results_df, options.debug, expression_folder)

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)
    print("\n+ Exiting miRNA module.")
    return ()
示例#2
0
文件: join.py 项目: azabalag/XICRA
def run_join(options):
    """Main function of the join module.

    Retrieves the sample files from ``options.input`` (raw fastq files when
    ``options.noTrim`` is set, trimmed files otherwise) and submits one
    fastqjoin_caller() job per sample to a thread pool.

    ``options`` carries the command-line settings; the attributes ``pair``,
    ``project`` and ``perc_diff`` are overwritten here. The module global
    ``Debug`` is (re)set as a side effect.

    Returns an empty tuple. Exits the interpreter after printing a help
    message when any of the help options is set.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if options.help_format:
        ## help_format option: stop after showing it, as the other help
        ## options (and the prep module) do, instead of joining reads
        help_XICRA.help_fastq_format()
        exit()
    elif options.help_project:
        ## information for project
        help_XICRA.project_help()
        exit()
    elif options.help_join_reads:
        ## information for join reads
        help_XICRA.help_join_reads()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("Join paired-end reads")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if options.detached:
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## Percentage difference for joining sequences
    if not options.perc_diff:
        options.perc_diff = 0

    print('+ Getting files from input folder... ')
    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim_'], options.debug)

    ## debug message
    if Debug:
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        functions.files_functions.create_folder(outdir)
    ## for samples
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "join", options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = functions.main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0: keep at least one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if Debug:
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Joining paired-end sequencing reads for each sample retrieved...")

    # Group dataframe by sample name. A scalar key (not a one-item list)
    # makes each group name a plain string in every pandas version, so it can
    # be used directly as a key of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(fastqjoin_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job,
                            options.perc_diff, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## report the failing sample and keep processing the others
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Joining reads has finished...")

    ## TODO: create statistics on joined reads
    ##

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)
    print("\n+ Exiting join module.")
    return ()
示例#3
0
def run_biotype(options):
    """Main function of the RNA biotype module.

    Retrieves the sample files from ``options.input``, maps the reads through
    mapReads_module(), calls RNAbiotype.RNAbiotype_module_call() and, unless
    ``options.skip_report`` is set, builds a MultiQC report, a ``summary.csv``
    table and a summary plot (through an R script) in the report folder.

    ``options`` carries the command-line settings; the attributes ``pair``
    and ``project`` are overwritten here. The module global ``Debug`` is
    (re)set as a side effect.

    Returns an empty tuple. Exits the interpreter after printing a help
    message when any of the help options is set.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if options.help_format:
        ## help_format option: stop after showing it, as the other help
        ## options (and the prep module) do, instead of running the analysis
        help_XICRA.help_fastq_format()
        exit()
    elif options.help_project:
        ## information for project
        help_XICRA.project_help()
        exit()
    elif options.help_RNAbiotype:
        ## information for RNA biotype analysis
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if options.detached:
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if Debug:
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if Debug:
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if Debug:
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    start_time_partial = mapReads_module(options, pd_samples_retrieved,
                                         mapping_outdir_dict, options.debug,
                                         max_workers_int, threads_job,
                                         start_time_partial, outdir)

    ## NOTE(review): mapping_results is never assigned in this function;
    ## presumably mapReads_module() sets it as a module-level global --
    ## confirm, otherwise the lines below raise NameError
    ## debug message
    if Debug:
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if Debug:
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    time_functions.timestamp(start_time_partial)

    if options.skip_report:
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )
        my_outdir_list = set(biotype_outdir_dict.values())

        ## debug message
        if Debug:
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results: sample name -> featureCount table
        dict_files = {}

        for sample_name, sample_dir in biotype_outdir_dict.items():
            featurecount_file = os.path.join(sample_dir,
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[sample_name] = featurecount_file

            ## copy pdf: a sample without any plot is skipped instead of
            ## failing on an empty list of matches
            pdf_plot = main_functions.retrieve_matching_files(
                sample_dir, '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    time_functions.timestamp(start_time_total)
    print("\n+ Exiting biotype module.")
    return ()
示例#4
0
def run_prep(options):
    """
    Main function of the prep module.

    This module prepares fastq files for later usage. It initially checks the
    length of the sample names and stops, advising the user to rename them,
    if any exceeds 25 characters and no ``--rename`` file was provided.

    Files are either copied into the output folder (``options.copy_reads``),
    merged into one file per sample (``options.merge_Reads``) or linked using
    symbolic links to avoid duplicating raw data.

    ``options`` carries the command-line settings; the attributes ``pair``,
    ``project``, ``include_lane``, ``include_all`` and ``rename`` are
    overwritten here.

    Returns an empty tuple. Exits the interpreter after printing help, after
    a reported error and once merging is finished.
    """

    ## help_format option
    if options.help_format:
        help_XICRA.help_fastq_format()
        exit()

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("Preparing samples")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(options.output_folder)

    ### set as default paired_end mode
    options.pair = not options.single_end

    ## output folder
    print("\n+ Create output folder(s):")
    functions.files_functions.create_folder(outdir)

    ## project
    options.project = not options.detached

    ## default options
    if not options.include_lane:
        options.include_lane = False

    if not options.include_all:
        options.include_all = False

    ### info
    if options.project:
        print(
            "+ Generate a directory containing information within the project folder provided"
        )
        final_dir = functions.files_functions.create_subfolder("info", outdir)
    else:
        final_dir = outdir

    ## get files
    print()
    functions.aesthetics_functions.print_sepLine("-", 50, False)
    print('+ Getting files from input folder... ')
    print('+ Mode: fastq.\n+ Extension: ')
    print("[ fastq, fq, fastq.gz, fq.gz ]\n")

    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## Information returned in pd_samples_retrieved
    ### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz

    if options.debug:
        print(colored("** DEBUG: pd_samples_retrieved", 'yellow'))
        print(pd_samples_retrieved)
        #functions.print_all_pandaDF(pd_samples_retrieved)

    ## time stamp
    functions.time_functions.timestamp(start_time_total)

    ## check character limitation
    list_lengths = pd_samples_retrieved.loc[:, 'name_len'].to_list()
    if any(i > 25 for i in list_lengths):
        print(
            colored(
                "\t ** Name lengths exceeds the 25 character limitation...",
                'yellow'))
        if not options.rename:
            print(
                colored("** ERROR: Rename files or provide --rename option...",
                        'red'))
            exit()

    ### rename files
    if options.rename:
        options.rename = os.path.abspath(options.rename)
        if not functions.files_functions.is_non_zero_file(options.rename):
            print(
                colored(
                    "** ERROR: File provided with rename information is not readable.",
                    'red'))
            print(options.rename)
            exit()

        ## read csv to dictionary (original name -> new name); the squeeze
        ## keyword of read_csv was removed in pandas 2.0, so the single
        ## remaining column is squeezed into a Series afterwards
        names_retrieved = pd.read_csv(
            options.rename, sep=',', index_col=0,
            header=None).squeeze("columns").to_dict()
        if options.debug:
            print(colored('** DEBUG: names_retrieved', 'yellow'))
            print(names_retrieved)

        ## TODO: check integrity of new names and special characters

        ## print to a file
        timestamp = functions.time_functions.create_human_timestamp()
        rename_details = os.path.join(final_dir,
                                      timestamp + '_prep_renameDetails.txt')

        ## rename files; the context manager closes the log file even if a
        ## sample name is missing from the rename table
        with open(rename_details, 'w') as rename_details_hd:
            for index, row in pd_samples_retrieved.iterrows():
                if row['gz']:
                    extension_string = row['ext'] + row['gz']
                else:
                    extension_string = row['ext']

                new_name = names_retrieved[row['name']]
                if options.single_end:
                    renamed = new_name + '.' + extension_string
                else:
                    renamed = (new_name + '_' + row['read_pair'] + '.' +
                               extension_string)

                ## modify frame
                pd_samples_retrieved.loc[index, 'new_file'] = renamed
                pd_samples_retrieved.loc[index, 'new_name'] = new_name
                ## save in file
                rename_details_hd.write(row['sample'] + '\t' + renamed + '\n')

                if options.debug:
                    print(colored('** DEBUG: rename', 'yellow'))
                    print("Original: ", row['name'])
                    print("Renamed: ", new_name)
                    print("File:", renamed)

        ##elif (options.single_end): It should work for both
        print("+ Sample files have been renamed...")
    else:
        pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

    ## create outdir for each sample
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "raw", options.debug)

    ## merge option
    if options.merge_Reads:
        print("+ Sample files will be merged...")
        ## TODO: check when rename option provided
        sampleParser.merge.one_file_per_sample(
            pd_samples_retrieved, outdir_dict, options.threads, final_dir,
            options.debug)

        if options.rename:
            print("+ Merge files have been renamed...")
        else:
            print("+ Sample files have been merged...")

        ## process is finished here
        print("\n*************** Finish *******************")
        functions.time_functions.timestamp(start_time_total)

        print("+ Exiting prep module.")
        exit()

    ## debugging messages
    if options.debug:
        print(colored("** DEBUG: pd_samples_retrieved", 'yellow'))
        #functions.print_all_pandaDF(pd_samples_retrieved)
        print(pd_samples_retrieved)
        print(colored("** DEBUG: outdir_dict", 'yellow'))
        print(outdir_dict)

    ## copy or create symbolic link for files
    if options.copy_reads:
        print("+ Sample files will be copied...")
        ## print to a file
        timestamp = functions.time_functions.create_human_timestamp()
        copy_details = os.path.join(final_dir,
                                    timestamp + '_prep_copyDetails.txt')

        ## the context manager closes the log file even if a copy fails
        with open(copy_details, 'w') as copy_details_hd:
            for index, row in pd_samples_retrieved.iterrows():
                ## TODO: debug & set threads to copy faster
                destination = os.path.join(outdir_dict[row['new_name']],
                                           row['new_file'])
                shutil.copy(row['sample'], destination)
                copy_details_hd.write(row['sample'] + '\t' + destination +
                                      '\n')

        print("+ Sample files have been copied...")
    else:
        print("+ Sample files will be linked...")
        list_reads = []
        for index, row in pd_samples_retrieved.iterrows():
            list_reads.append(row['new_file'])

            ## project mode: one link per file inside its sample folder
            if options.project:
                functions.files_functions.get_symbolic_link_file(
                    row['sample'],
                    os.path.join(outdir_dict[row['new_name']],
                                 row['new_file']))

        ## detached mode: link all files at once into the output folder
        if not options.project:
            functions.files_functions.get_symbolic_link(list_reads, outdir)

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)

    print("+ Exiting prep module.")
    return ()
示例#5
0
文件: trimm.py 项目: azabalag/XICRA
def run_trimm(options):
    """Run the adapter-trimming module over every fastq sample in the input.

    Steps, in order:

    1. Show the requested help message (most help flags terminate the run).
    2. Resolve input/output folders and the project/detached mode.
    3. Collect the adapter options and retrieve the fastq files.
    4. Submit one ``cutadapt_caller`` job per sample name to a thread pool.
    5. In detached mode, gather the trimmed files under ``link_files``.
    6. Unless skipped, call the MultiQC report module on the output folders.

    Attributes read from ``options``: ``help_format``,
    ``help_trimm_adapters``, ``help_project``, ``help_multiqc``, ``debug``,
    ``single_end``, ``input``, ``detached``, ``output_folder``,
    ``adapters_a``, ``adapters_A``, ``extra``, ``threads`` and
    ``skip_report``. The attributes ``pair`` and ``project`` are set on
    ``options`` as a side effect, and the module-level ``Debug`` flag is
    updated.

    This function never returns normally: it always finishes by calling
    ``exit()`` (i.e. it raises ``SystemExit``).
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        ## NOTE(review): unlike the other help flags, this branch does not
        ## exit, so the module keeps running after printing the help.
        ## Confirm whether that is intended.
        help_XICRA.help_fastq_format()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        help_XICRA.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        help_XICRA.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    functions.aesthetics_functions.pipeline_header('XICRA')
    functions.aesthetics_functions.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    functions.time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        ## project mode: results are written next to the input data
        options.project = True
        outdir = input_dir

    # Trimming adapters

    ## no adapters provided
    if (not options.adapters_a and not options.adapters_A
            and not options.extra):
        print(
            colored("** ERROR: No adapter trimming options provided...",
                    'red'))
        print("Please provide any option")
        exit()

    ## create dictionary with the adapters actually provided; each entry is
    ## stored only when its own option is set
    adapters_dict = {}
    if (options.adapters_a):
        adapters_dict['adapter_a'] = options.adapters_a

    if (options.adapters_A):
        adapters_dict['adapter_A'] = options.adapters_A

    ## get files
    print('+ Getting files from input folder... ')
    print('+ Mode: fastq.\n+ Extension: ')
    print("[ fastq, fq, fastq.gz, fq.gz ]\n")
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

        print(colored("**DEBUG: adapters_dict **", 'yellow'))
        print(adapters_dict)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        functions.files_functions.create_folder(outdir)
    ## for samples
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "trimm", options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = functions.main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0, so keep at least one
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name. Group by the scalar column name (not a
    # one-element list) so the iteration key is the plain name on every
    # pandas version; a list key yields 1-tuples on pandas >= 2.0, which
    # would break the outdir_dict lookup below.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(cutadapt_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            adapters_dict, options.extra): name
            for name, cluster in sample_frame
        }

        ## best effort: a failing sample is reported but does not stop the
        ## remaining ones
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")
    ## functions.time_functions.timestamp
    functions.time_functions.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = functions.files_functions.create_subfolder(
            'link_files', outdir)

        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: generate symbolic links for each file in " +
                    dir_symlinks + "**", 'yellow'))

        ## only paired-end. Todo: single end
        trimmed_pattern = re.compile(r".*trim_R\d{1}.*")

        files2symbolic = []
        for fold in os.listdir(outdir):
            this_folder = os.path.join(outdir, fold)
            ## skip log entries and any stray regular file: only folders
            ## can be listed for trimmed reads
            if fold.endswith(".log") or not os.path.isdir(this_folder):
                continue

            for files in os.listdir(this_folder):
                if trimmed_pattern.search(files):
                    files2symbolic.append(os.path.join(this_folder, files))

        functions.files_functions.get_symbolic_link(files2symbolic,
                                                    dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = functions.files_functions.create_subfolder(
            "report", outdir)

        ## call multiQC report module on the distinct output folders
        my_outdir_list = set(outdir_dict.values())

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        trimm_report = functions.files_functions.create_subfolder(
            "trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Cutadapt",
                                           trimm_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % trimm_report)

    print("\n*************** Finish *******************")
    functions.time_functions.timestamp(start_time_total)
    print("\n+ Exiting trimm module.")
    exit()