Example No. 1
def run_parallel(_fn, fn_args, n_jobs=None, filter_results=False):
    if qconfig.memory_efficient:
        results_tuples = [_fn(*args) for args in fn_args]
    else:
        n_jobs = n_jobs or qconfig.max_threads
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        results_tuples = Parallel(n_jobs=n_jobs)(delayed(_fn)(*args)
                                                 for args in fn_args)
    results = []
    if results_tuples:
        if isinstance(results_tuples[0], list) or isinstance(
                results_tuples[0], tuple):
            results_cnt = len(results_tuples[0])
            if filter_results:
                results = [[
                    result_list[i] for result_list in results_tuples
                    if result_list[i]
                ] for i in range(results_cnt)]
            else:
                results = [[result_list[i] for result_list in results_tuples]
                           for i in range(results_cnt)]
        else:
            results = [
                result for result in results_tuples
                if result or not filter_results
            ]
    return results
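
run_parallel above is a thin wrapper around joblib's Parallel/delayed idiom, plus optional filtering and column-wise regrouping of tuple results. A minimal standalone sketch of that idiom, using the stock joblib package instead of the bundled joblib2/joblib3 copies (work and fn_args are made-up stand-ins):

# Minimal sketch of the Parallel/delayed idiom wrapped by run_parallel.
# Uses stock joblib; the surrounding code imports its own bundled joblib2/joblib3.
from joblib import Parallel, delayed

def work(x, y):
    # toy worker returning a tuple, like the _fn callables passed to run_parallel
    return x + y, x * y

fn_args = [(1, 2), (3, 4), (5, 6)]
results_tuples = Parallel(n_jobs=2)(delayed(work)(*args) for args in fn_args)
# results_tuples == [(3, 2), (7, 12), (11, 30)]

# run_parallel then regroups tuple results column-wise:
sums, products = ([t[i] for t in results_tuples] for i in range(2))
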
Example No. 2
def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
    log.info("\n== Compressing corrected reads (with gzip)")
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                compressed_reads_filenames = []
                for reads_file in value:
                    if not os.path.isfile(reads_file):
                        support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log)
                    to_compress.append(reads_file)
                    compressed_reads_filenames.append(reads_file + ".gz")
                reads_library[key] = compressed_reads_filenames
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
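
When pigz is unavailable the code above falls back to plain gzip, parallelized across files with joblib. A rough standard-library equivalent of that compressor selection, with shutil.which and subprocess standing in for the project's support.which and support.sys_call helpers:

# Rough sketch of the pigz-with-gzip-fallback logic above.
# shutil.which / subprocess stand in for support.which / support.sys_call.
import shutil
import subprocess

def compress_files(paths, max_threads):
    pigz = shutil.which('pigz')
    for path in paths:
        if pigz:
            # pigz compresses a single file with several threads
            cmd = [pigz, '-f', '-7', '-p', str(max_threads), path]
        else:
            # gzip is single-threaded; the original parallelizes across files instead
            cmd = ['gzip', '-f', '-7', path]
        subprocess.check_call(cmd)
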
Example No. 3
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath,
                      alignments_fpath_template, labels):
    # array of assemblies for each reference
    assemblies_by_ref = dict([(qutils.name_from_fpath(ref_fpath), [])
                              for ref_fpath in ref_fpaths])
    n_jobs = min(qconfig.max_threads, len(assemblies))
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    assemblies = Parallel(n_jobs=n_jobs)(delayed(parallel_partition_contigs)(
        asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template)
                                         for asm in assemblies)
    assemblies_dicts = [assembly[0] for assembly in assemblies]
    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        not_sorted_assemblies = set([
            val for sublist in (assemblies_dicts[i][ref_name]
                                for i in range(len(assemblies_dicts)))
            for val in sublist
        ])
        sorted_assemblies = []
        for label in labels:  # sort by label
            for assembly in not_sorted_assemblies:
                if assembly.label == label:
                    sorted_assemblies.append(assembly)
                    break
        assemblies_by_ref.append((ref_fpath, sorted_assemblies))
    not_aligned_assemblies = [assembly[1] for assembly in assemblies]
    return assemblies_by_ref, not_aligned_assemblies
Example No. 4
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths,
                                   out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [
            predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                          tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths)
        ]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt)
                for full_cnt, partial_cnt in zip(full_genes, partial_genes)
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error(
                'Failed running Glimmer for %s. ' % label +
                ('Run with the --debug option'
                 ' to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
Example No. 5
def correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting):
    ## remove special characters from contigs' names because:
    ## 1) some embedded tools can fail on strings containing "...", "+", "-", etc.
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it takes only
    ##    the first word of the caption as the contig name, which makes the names ambiguous)

    if qconfig.max_threads is None:
        qconfig.max_threads = 1

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    logger.main_info('  Pre-processing...')
    if not qconfig.memory_efficient:
        corrected_info = Parallel(n_jobs=n_jobs)(
            delayed(parallel_correct_contigs)(i, contigs_fpath,
                                              corrected_dirpath, labels)
            for i, contigs_fpath in enumerate(contigs_fpaths))
    else:
        corrected_info = [
            parallel_correct_contigs(i, contigs_fpath, corrected_dirpath,
                                     labels)
            for i, contigs_fpath in enumerate(contigs_fpaths)
        ]

    corrected_contigs_fpaths = []
    old_contigs_fpaths = []
    if any([
            is_fatal_error
            for (old_fpaths, corr_fpaths, broken_scaffold_fpaths, logs,
                 is_fatal_error) in corrected_info
    ]):
        exit(4)
    for contig_idx, (old_fpaths, corr_fpaths, broken_scaffold_fpaths, logs,
                     is_fatal_error) in enumerate(corrected_info):
        label = labels[contig_idx]
        logger.main_info('\n'.join(logs))
        for old_fpath in old_fpaths:
            old_contigs_fpaths.append(old_fpath)
            qconfig.assembly_labels_by_fpath[old_fpath] = label
        for corr_fpath, lengths in corr_fpaths:
            corrected_contigs_fpaths.append(corr_fpath)
            qconfig.assembly_labels_by_fpath[corr_fpath] = label
            add_lengths_to_report(lengths, reporting, corr_fpath)
        for broken_fpath, lengths in broken_scaffold_fpaths:
            old_contigs_fpaths.append(broken_fpath)
            corrected_contigs_fpaths.append(broken_fpath)
            qconfig.assembly_labels_by_fpath[broken_fpath] = label + '_broken'
            add_lengths_to_report(lengths, reporting, broken_fpath)

    if qconfig.draw_plots or qconfig.html_report:
        if not plotter_data.dict_color_and_ls:
            plotter_data.save_colors_and_ls(corrected_contigs_fpaths)

    return corrected_contigs_fpaths, old_contigs_fpaths
Example No. 6
def move_dataset_files(dataset_data,
                       dst,
                       ext_python_modules_home,
                       max_threads,
                       log,
                       gzip=False):
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst,
                                                os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        if (not gzip and os.path.isfile(dst_filename)) or (
                                gzip and os.path.isfile(dst_filename + '.gz')):
                            support.warning(
                                'file with corrected reads (' + reads_file +
                                ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error(
                                'something went wrong and file with corrected reads ('
                                + reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([
                    pigz_path, '-f', '-7', '-p',
                    str(max_threads), reads_file
                ], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(
                delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
Example No. 7
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            '  Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
    elif not install_genemark():
        logger.warning(
            '  Can\'t copy the license key to ~/.gm_key, skipping gene prediction.'
        )
    else:
        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        if not qconfig.memory_efficient:
            results = Parallel(n_jobs=n_jobs)(
                delayed(predict_genes)
                (index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                 tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
                for index, fasta_fpath in enumerate(fasta_fpaths))
        else:
            results = [
                predict_genes(index, fasta_fpath, gene_lengths, out_dirpath,
                              tool_dirpath, tmp_dirpath, gmhmm_p_function,
                              prokaryote, num_threads)
                for index, fasta_fpath in enumerate(fasta_fpaths)
            ]

        if not is_license_valid(out_dirpath, fasta_fpaths):
            return

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[
                label], unique_count, full_genes, partial_genes = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                                 unique_count)
            if full_genes is not None:
                genes = [
                    '%s + %s part' % (full_cnt, partial_cnt)
                    for full_cnt, partial_cnt in zip(full_genes, partial_genes)
                ]
                report.add_field(reporting.Fields.PREDICTED_GENES, genes)
            if unique_count is None and full_genes is None:
                logger.error(
                    '  ' + qutils.index_to_str(i) +
                    'Failed predicting genes in ' + label + '. ' +
                    ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                     if tool_name == 'GeneMark-ES'
                     and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
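
A recurring detail here is how the thread budget is split between outer joblib jobs and the threads handed to each GeneMark invocation, so that n_jobs * num_threads stays within qconfig.max_threads. A tiny illustration of that arithmetic (the function name is ours, not the project's):

# Thread-budget split used above: outer parallel jobs times per-job threads
# stays within the overall thread limit.
def split_thread_budget(n_inputs, max_threads):
    n_jobs = min(n_inputs, max_threads)
    threads_per_job = max(1, max_threads // n_jobs)
    return n_jobs, threads_per_job

assert split_thread_budget(3, 8) == (3, 2)   # 3 jobs x 2 threads <= 8
assert split_thread_budget(16, 8) == (8, 1)  # never more jobs than threads
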
Example No. 8
def compress_dataset_files(input_file, ext_python_modules_home, max_threads,
                           log, not_used_yaml_file, output_dir, gzip_output):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith("2."):
        import pyyaml2 as pyyaml
        from joblib2 import Parallel, delayed
    elif sys.version.startswith("3."):
        import pyyaml3 as pyyaml
        from joblib3 import Parallel, delayed

    dataset_data = pyyaml.load(open(input_file))
    remove_not_corrected_reads(output_dir)
    is_changed = False
    if gzip_output:
        is_changed = True
        pigz_path = support.which("pigz")
        if pigz_path:
            compressor = "pigz"
        else:
            compressor = "gzip"
        log.info("\n== Compressing corrected reads (with %s)" % compressor)
        to_compress = []
        for reads_library in dataset_data:
            for key, value in reads_library.items():
                if key.endswith("reads"):
                    compressed_reads_filenames = []
                    for reads_file in value:
                        compressed_reads_filenames.append(reads_file + ".gz")
                        to_compress.append(reads_file)
                    reads_library[key] = compressed_reads_filenames

        if len(to_compress):
            for reads_file in to_compress:
                if not isfile(reads_file):
                    support.error(
                        "something went wrong and file with corrected reads (%s) is missing!"
                        % reads_file, log)

            if pigz_path:
                for reads_file in to_compress:
                    support.sys_call([
                        pigz_path, "-f", "-7", "-p",
                        str(max_threads), reads_file
                    ], log)
            else:
                n_jobs = min(len(to_compress), max_threads)
                outputs = Parallel(n_jobs=n_jobs)(
                    delayed(support.sys_call)(["gzip", "-f", "-7", reads_file])
                    for reads_file in to_compress)
                for output in outputs:
                    if output:
                        log.info(output)

    if not_used_yaml_file != "":
        is_changed = True
        not_used_dataset_data = pyyaml.load(open(not_used_yaml_file))
        dataset_data += not_used_dataset_data
    if is_changed:
        with open(input_file, 'w') as f:
            pyyaml.dump(dataset_data,
                        f,
                        default_flow_style=False,
                        default_style='"',
                        width=float("inf"))
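
The final block above rewrites the dataset description in place with quoted scalars and no line wrapping. A sketch of the same YAML round trip using stock PyYAML (the pipeline bundles its own pyyaml2/pyyaml3; 'dataset.yaml' is a hypothetical file name):

# Sketch of the YAML round trip at the end of compress_dataset_files,
# using stock PyYAML instead of the bundled pyyaml2/pyyaml3 modules.
import yaml

with open('dataset.yaml') as f:              # hypothetical input file
    dataset_data = yaml.safe_load(f)

for reads_library in dataset_data:
    for key, value in reads_library.items():
        if key.endswith('reads'):
            reads_library[key] = [fname + '.gz' for fname in value]

with open('dataset.yaml', 'w') as f:
    yaml.dump(dataset_data, f,
              default_flow_style=False,   # one mapping/list item per line
              default_style='"',          # quote scalars, as in the original
              width=float('inf'))         # never wrap long paths

Example No. 9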
def nx_seq_junction(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]
    #open three outfiles
    splitfilenameleft = os.path.join(dst, 'R1_IJS7_' + basename1)
    splitfile1 = open(splitfilenameleft, 'w')

    splitfilenameright = os.path.join(dst, 'R2_IJS7_' + basename2)
    splitfile2 = open(splitfilenameright, 'w')

    unsplitfilename = os.path.join(
        dst, 'unsplit_IJS7_' + basename1.replace('_R1_', '_R1R2_'))
    unsplitfile = open(unsplitfilename, 'w')

    #jctstr = '(GGTTCATCGTCAGGCCTGACGATGAACC){e<=4}' # JS7 24/28 required results in ~92% detected in ion torrent
    # from NextClip: --adaptor_sequence GTTCATCGTCAGG -e --strict_match 22,11 --relaxed_match 20,10 eg strict 22/26 = 4 errors, relaxed 20/26 = 6 errors
    jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}'  # try 22/26 to match NextClip strict (e<=6 for relaxed)

    #PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = JunctionStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error(
                "lucigen_nxmate.py, nx_seq_junction: "
                "number of left reads (%d) is not equal to number of right reads (%d)!"
                % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(
            delayed(nx_seq_junction_process_batch)(reads, jctstr)
            for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([splitfile1, splitfile2, unsplitfile], result)
            all_stats += stat
        if not silent:
            log.info(
                "==== nx_seq_junction progress: reads processed: %d, time elapsed: %s"
                % (all_stats.readcounter,
                   time.strftime('%H:%M:%S',
                                 time.gmtime(time.time() - starttime))))
    parserR1.close()
    parserR2.close()

    splitfile1.close()
    splitfile2.close()
    unsplitfile.close()

    if all_stats.readcounter == 0:
        support.error(
            "lucigen_nxmate.py, nx_seq_junction: error in input data! Number of processed reads is 0!",
            log)
    if all_stats.splitcounter == 0:
        support.error(
            "lucigen_nxmate.py, nx_seq_junction: error in input data! Number of split pairs is 0!",
            log)
    if not silent:
        #print some stats
        percentsplit = 100 * all_stats.splitcounter / all_stats.readcounter
        percentR1R2 = 100 * all_stats.R1R2jctcounter / all_stats.splitcounter
        percentR1 = 100 * all_stats.R1jctcounter / all_stats.splitcounter
        percentR2 = 100 * all_stats.R2jctcounter / all_stats.splitcounter
        log.info("==== nx_seq_junction info: processing finished!")
        log.info("==== nx_seq_junction info: %d reads processed" %
                 (all_stats.readcounter))
        log.info(
            "==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads))"
            % (all_stats.splitcounter, percentsplit))
        log.info(
            "==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions))"
            % (all_stats.R1R2jctcounter, percentR1R2))
        log.info(
            "==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions))"
            % (all_stats.R1jctcounter, percentR1))
        log.info(
            "==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions))"
            % (all_stats.R2jctcounter, percentR2))
        elapsedtime = time.strftime('%H:%M:%S',
                                    time.gmtime(time.time() - starttime))
        log.info("==== nx_seq_junction info: time elapsed: %s" % (elapsedtime))
    parserR1.close()
    parserR2.close()
    return splitfilenameleft, splitfilenameright, unsplitfilename
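
The {e<=4} suffix in jctstr above is fuzzy-matching syntax from the third-party regex package; the standard re module has no such feature. The junction adaptor may therefore be found with up to four errors. A small hedged demonstration (the toy read is made up):

# '{e<=N}' is a fuzzy match from the third-party 'regex' package (pip install regex).
import regex

jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}'  # junction adaptor, up to 4 errors

read = 'ACGTGTTCATCGTCAGGCTTGACGATGAACTTTT'    # toy read with one mismatch
m = regex.search(jctstr, read, regex.BESTMATCH)
if m:
    print('junction at', m.span(), 'errors:', m.fuzzy_counts)

Example No. 10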
def chimera_clean(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]
    #open four outfiles
    outfilename1 = os.path.join(dst, 'mates_ICC4_' + basename1)
    outfile1 = open(outfilename1, 'w')

    slagfilename1 = os.path.join(dst, 'non-mates_ICC4_' + basename1)
    slagfile1 = open(slagfilename1, 'w')

    outfilename2 = os.path.join(dst, 'mates_ICC4_' + basename2)
    outfile2 = open(outfilename2, 'w')

    slagfilename2 = os.path.join(dst, 'non-mates_ICC4_' + basename2)
    slagfile2 = open(slagfilename2, 'w')

    #set up regular expression patterns for chimera codes- for illumin use the  reverse complements of right codes
    csslist1 = [
        '(TGGACTCCACTGTG){e<=1}', '(ACTTCGCCACTGTG){e<=1}',
        '(TGAGTCCCACTGTG){e<=1}', '(TGACTGCCACTGTG){e<=1}',
        '(TCAGGTCCACTGTG){e<=1}', '(ATGTCACCACTGTG){e<=1}',
        '(GTATGACCACTGTG){e<=1}', '(GTCTACCCACTGTG){e<=1}',
        '(GTTGGACCACTGTG){e<=1}', '(CGATTCCCACTGTG){e<=1}',
        '(GGTTACCCACTGTG){e<=1}', '(TCACCTCCACTGTG){e<=1}'
    ]

    csslist2 = [
        '(TCCAGACCAATGTG){e<=1}', '(ACATCACCAATGTG){e<=1}',
        '(TCACGACCAATGTG){e<=1}', '(TAGCACCCAATGTG){e<=1}',
        '(AACCTCCCAATGTG){e<=1}', '(ACAACTCCAATGTG){e<=1}',
        '(GTCTAACCAATGTG){e<=1}', '(TACACGCCAATGTG){e<=1}',
        '(GAGAACCCAATGTG){e<=1}', '(GAGATTCCAATGTG){e<=1}',
        '(GACCTACCAATGTG){e<=1}', '(AGACTCCCAATGTG){e<=1}'
    ]

    #PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = CleanStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error(
                "lucigen_nxmate.py, chimera_clean: "
                "number of left reads (%d) is not equal to number of right reads (%d)!"
                % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(
            delayed(chimera_clean_process_batch)(reads, csslist1, csslist2)
            for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([outfile1, outfile2, slagfile1, slagfile2], result)
            all_stats += stat
        if not silent:
            log.info(
                "==== chimera_clean progress: reads processed: %d, time elapsed: %s"
                % (all_stats.readcounter,
                   time.strftime('%H:%M:%S',
                                 time.gmtime(time.time() - starttime))))
    parserR1.close()
    parserR2.close()

    outfile1.close()
    slagfile1.close()
    outfile2.close()
    slagfile2.close()

    if all_stats.TOTALmatecounter + all_stats.slagcounter != all_stats.readcounter:
        support.error(
            "lucigen_nxmate.py, chimera_clean: error in the script somewhere! Unequal read counts!",
            log)
    if all_stats.readcounter == 0:
        support.error(
            "lucigen_nxmate.py, chimera_clean: error in input data! Number of processed reads is 0!",
            log)
    if not silent:
        #print some stats
        percentmates = 100. * all_stats.matecounter / all_stats.readcounter
        percentslag = 100. * all_stats.slagcounter / all_stats.readcounter
        log.info("==== chimera_clean info: processing finished!")
        log.info(
            "==== chimera_clean info: %d reads processed, %d true mate reads (%.2f %%) "
            "and %d non-mates/chimeras (%.2f %%)." %
            (all_stats.readcounter, all_stats.matecounter, percentmates,
             all_stats.slagcounter, percentslag))
        shortmates = all_stats.TOTALmatecounter - all_stats.matecounter
        log.info(
            "==== chimera_clean info: %d mates too short to keep after trimming"
            % shortmates)
        elapsedtime = time.strftime('%H:%M:%S',
                                    time.gmtime(time.time() - starttime))
        log.info("==== chimera_clean info: time elapsed: %s" % (elapsedtime))
        log.info("==== chimera_clean info: " + str(all_stats.csscounter))
    return outfilename1, outfilename2
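
Both chimera_clean and nx_seq_junction stream the paired FASTQ files in fixed-size batches, split each batch into per-worker chunks, and merge the per-chunk results and stats. A self-contained sketch of that batching pattern (READS_PER_BATCH and split_into_chunks only approximate the script's own helpers; process_batch is a toy worker):

# Sketch of the batch-and-chunk pattern used above.
import itertools
from joblib import Parallel, delayed

READS_PER_BATCH = 100000

def split_into_chunks(items, n_chunks):
    # distribute items across at most n_chunks roughly equal slices
    size = max(1, (len(items) + n_chunks - 1) // n_chunks)
    return [items[i:i + size] for i in range(0, len(items), size)]

def process_batch(pairs):
    # toy worker: count read pairs instead of scanning them
    return len(pairs)

def process_streams(parser1, parser2, n_jobs):
    total = 0
    while True:
        reads1 = list(itertools.islice(parser1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parser2, READS_PER_BATCH))
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        counts = Parallel(n_jobs=n_jobs)(delayed(process_batch)(c) for c in chunks)
        total += sum(counts)
    return total
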
Example No. 11
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath,
                  labels, blast_check_fpath, err_fpath):
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [
                f for f in os.listdir(db_fpath) if f.endswith('.nsq')
            ]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath,
                                db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error(
                'You should specify path to BLAST database obtained by running makeblastdb command: '
                'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                exit_with_code=2)

    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN...')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath,
            blast_res_fpath, blast_check_fpath, blast_threads)
                                for i, assembly in enumerate(blast_assemblies))

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                query_id_col, subj_id_col, idy_col, len_col, score_col = None, None, None, None, None
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = fs.index(
                                'query id') if 'query id' in fs else 0
                            subj_id_col = fs.index(
                                'subject id') if 'subject id' in fs else 1
                            idy_col = fs.index(
                                '% identity') if '% identity' in fs else 2
                            len_col = fs.index(
                                'alignment length'
                            ) if 'alignment length' in fs else 3
                            score_col = fs.index(
                                'bit score') if 'bit score' in fs else 11
                    elif refs_for_query < max_entries and len(fs) > score_col:
                        query_id = fs[query_id_col]
                        organism_id = fs[subj_id_col]
                        idy = float(fs[idy_col])
                        length = int(fs[len_col])
                        score = float(fs[score_col])
                        if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:  # and (not scores or min(scores) - score < max_identity_difference):
                            seqname, taxons = parse_organism_id(organism_id)
                            if not seqname:
                                continue
                            species_name = get_species_name(seqname)
                            if species_name and 'uncultured' not in seqname:
                                if refs_for_query == 0:
                                    if species_name not in assembly_species:
                                        assembly_scores.append(
                                            (seqname, query_id, score))
                                        if taxons:
                                            taxons_for_krona[correct_name(
                                                seqname)] = taxons
                                        assembly_species.append(species_name)
                                        refs_for_query += 1
                                    else:
                                        seq_scores = [
                                            (query_name, seq_query_id,
                                             seq_score)
                                            for query_name, seq_query_id,
                                            seq_score in assembly_scores
                                            if get_species_name(
                                                query_name) == species_name
                                        ]
                                        if seq_scores and score > seq_scores[
                                                0][2]:
                                            assembly_scores.remove(
                                                seq_scores[0])
                                            assembly_scores.append(
                                                (seqname, query_id, score))
                                            if taxons:
                                                taxons_for_krona[correct_name(
                                                    seqname)] = taxons
                                            refs_for_query += 1
                                else:
                                    if seqname not in replacement_dict[
                                            query_id]:
                                        replacement_dict[query_id].append(
                                            seqname)
                                        refs_for_query += 1
        assembly_scores = sorted(assembly_scores, reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(
                    seqname in species_list
                    for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [
            seqname for seqname, query_id, score in assembly_scores
        ]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
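
The column lookup in process_blast adapts to whatever '# Fields:' comment the BLAST tabular output carries, falling back to the default column order. A compact sketch of just that parsing step (parse_fields_line is our name, not the project's):

# Sketch of deriving column indices from BLAST's tabular '# Fields:' comment.
def parse_fields_line(line):
    fields = line.strip().split('Fields: ')[-1].split(', ')
    def col(name, default):
        return fields.index(name) if name in fields else default
    return {
        'query id': col('query id', 0),
        'subject id': col('subject id', 1),
        '% identity': col('% identity', 2),
        'alignment length': col('alignment length', 3),
        'bit score': col('bit score', 11),
    }

header = ('# Fields: query id, subject id, % identity, alignment length, '
          'mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score')
cols = parse_fields_line(header)
# cols['bit score'] == 11
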
Example No. 12
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict,
       operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):

    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath,
                                  qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(
        ref_fpath)

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice(
            'No file with genomic features was provided. '
            'Use the --features option if you want to specify one.\n',
            indent='  ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice(
            'No file with operons was provided. '
            'Use the -O option if you want to specify one.',
            indent='  ')
    for container in containers:
        if not container.fpaths:
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(
                fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind +
                           '" were loaded.',
                           indent='  ')
            res_file.write('Genomic features of type "' + container.kind +
                           '" loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) +
                        ' genomic features of type "' + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind +
                           '" loaded: ' + str(len(container.region_list)) +
                           '\n')
            container.chr_names_dict = chromosomes_names_dict(
                container.kind, container.region_list,
                list(reference_chromosomes.keys()))

    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS,
                                 len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                reference_chromosomes, ns_by_chromosomes, containers)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    else:
        process_results = [
            process_single_file(contigs_fpath, index, coords_dirpath,
                                genome_stats_dirpath, reference_chromosomes,
                                ns_by_chromosomes, containers)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths)
        ]
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [
        process_results[i][1] for i in range(len(process_results))
    ]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [
            ref_lengths[i][ref] for i in range(len(ref_lengths))
        ]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                       ' bp, ' + 'total length without N\'s: ' +
                       str(chr_len - len(ns_by_chromosomes[chr_name])) +
                       ' bp, maximal covered length: ' + str(aligned_len) +
                       ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' +
                   str(qconfig.min_gene_overlap) + '\n\n')
    # header
    res_file.write('\n\n')
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
         'operons', 'partial'))
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)\
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[
            contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[
            contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write(
            '%-25s| %-10s| %-12s| %-10s|' %
            (assembly_name[:24], report.get_field(
                reporting.Fields.MAPPEDGENOME),
             report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))

        genome_mapped.append(
            float(report.get_field(reporting.Fields.MAPPEDGENOME)))

        for (field, full,
             part) in [(reporting.Fields.GENES, genes_full, genes_part),
                       (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'features',
                                                files_features_in_contigs,
                                                ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(
                ref_genes_num, aligned_contigs_fpaths,
                files_features_in_contigs,
                genome_stats_dirpath + '/features_cumulative_plot',
                'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths,
                             contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot',
                             'genomic features')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_features_histogram',
                '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(
                ref_operons_num, aligned_contigs_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths,
                             contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot',
                             'operons')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths,
                          genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
    return containers