示例#1
0
def _default_min_gene_depth(mean_sample_coverage):
    """Return the default minimum gene depth: 1/3 of the mean sample coverage, but never less than 15."""
    # 15 is listed first so that a tie returns the integer 15, as the original conditional expression did.
    return max(15, mean_sample_coverage / 3)


def main():
    """Command-line entry point of patho_typing.

    Parses the command-line options, maps the reads against the reference selected for the given species,
    optionally runs the trueCoverage check (``--trueCoverage``), runs ReMatCh on the typing targets and finally
    calls the typing module. On every failure path the output folder is cleaned, the run time is reported and
    the process exits with an error message (non-zero exit status).
    """
    parser = argparse.ArgumentParser(prog='patho_typing.py',
                                     description='In silico pathogenic typing directly from raw Illumina reads',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version='{prog} v{version}'.format(prog=parser.prog, version=__version__))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
                                 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
                                 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
                                      ' assumed as being the paired fastq files', required=True)
    parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
                                 help='Species name', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the information will be stored',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
                                         required=False, default=1)
    parser_optional_general.add_argument('--trueCoverage', action='store_true',
                                         help='Assess true coverage before continue typing')
    parser_optional_general.add_argument('--noCheckPoint', action='store_true',
                                         help='Ignore the true coverage checking point')
    parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum typing percentage of target reference gene sequence covered to'
                                              ' consider a gene to be present (value between [0, 100])', required=False)
    parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum typing percentage of identity of reference gene sequence covered'
                                              ' to consider a gene to be present (value between [0, 100]). One INDEL'
                                              ' will be considered as one difference', required=False)
    parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
                                         help='Minimum typing gene average coverage depth of present positions to'
                                              ' consider a gene to be present (default is 1/3 of average sample'
                                              ' coverage or 15x)', required=False)
    parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
                                         help='Do not remove ReMatCh consensus sequences')
    parser_optional_general.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    args = parser.parse_args()

    if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
    print('\n')

    rematch = include_rematch_dependencies_path()

    # argparse.FileType opened the fastq files; only their paths are used from here on, so keep the names
    # and close the handles instead of leaking them for the whole run.
    fastq_handles = args.fastq
    args.fastq = [fastq.name for fastq in fastq_handles]
    for fastq in fastq_handles:
        # FileType maps '-' to sys.stdin, which must not be closed here.
        if fastq is not sys.stdin:
            fastq.close()

    reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
        typing_sequences, typing_headers, typing_rules, typing_config = \
        set_reference(args.species, args.outdir, script_path, args.trueCoverage)
    original_reference_file = str(reference_file)

    def abort(message):
        """Clean the output folder, report the run time and exit with ``message`` as the error."""
        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
        _ = utils.runTime(start_time)
        sys.exit(message)

    confirm_genes_fasta_rules(typing_headers, typing_rules)

    run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
    if not run_successfully:
        # Previously a mapping failure fell through to the normal end of the run and exited with status 0.
        abort('Mapping of the reads against the reference did not run successfully')

    rematch_dir = os.path.join(args.outdir, 'rematch', '')
    if not os.path.isdir(rematch_dir):
        os.makedirs(rematch_dir)

    if args.trueCoverage:
        if trueCoverage_file is not None:
            trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
            if not os.path.isdir(trueCoverage_dir):
                os.makedirs(trueCoverage_dir)

            print('\n')
            run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
                                                           args.threads)
            if run_successfully:
                run_successfully = indexAlignment(trueCoverage_bam)
                if run_successfully:
                    reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
                    write_sequeces(reference_file, trueCoverage_sequences)
                    index_fasta_samtools(reference_file, None, None, True)
                    config = parse_config(trueCoverage_config)
                    _, run_successfully, sample_data_general, data_by_gene = \
                        run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
                                                args.threads, config['length_extra_seq'],
                                                config['minimum_depth_presence'], config['minimum_depth_call'],
                                                config['minimum_depth_frequency_dominant_allele'],
                                                config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                                args.debug, args.doNotRemoveConsensus)

                    if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
                            sample_data_general['number_absent_genes'] is not None and \
                            sample_data_general['number_genes_multiple_alleles'] is not None:
                        if args.minGeneDepth is None:
                            args.minGeneDepth = _default_min_gene_depth(sample_data_general['mean_sample_coverage'])

                        # Collect every trueCoverage requirement that the sample does not fulfil.
                        exit_info = []
                        if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
                            exit_info.append('Sample coverage ({mean}) lower than the minimum'
                                             ' required ({minimum})'
                                             ''.format(mean=sample_data_general['mean_sample_coverage'],
                                                       minimum=config['minimum_read_coverage']))
                        if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
                            exit_info.append('Number of absent genes ({number}) higher than the'
                                             ' maximum allowed ({maximum})'
                                             ''.format(number=sample_data_general['number_absent_genes'],
                                                       maximum=config['maximum_number_absent_genes']))
                        if sample_data_general['number_genes_multiple_alleles'] > \
                                config['maximum_number_genes_multiple_alleles']:
                            exit_info.append('Number of genes with multiple alleles'
                                             ' ({number}) higher than the maximum'
                                             ' allowed ({maximum})'
                                             ''.format(number=sample_data_general['number_genes_multiple_alleles'],
                                                       maximum=config['maximum_number_genes_multiple_alleles']))

                        if len(exit_info) > 0:
                            print('\n' + '\n'.join(exit_info) + '\n')
                            e = 'TrueCoverage requirements not fulfilled'
                            print('\n' + e + '\n')
                            # --noCheckPoint turns the failed check into a warning only.
                            if not args.noCheckPoint:
                                abort(e)
                    else:
                        e = 'TrueCoverage module did not run successfully'
                        print('\n' + e + '\n')
                        if not args.noCheckPoint:
                            abort(e)

                    # Restrict the alignment and the reference to the typing targets for the typing ReMatCh run.
                    print('\n')
                    typing_dir = os.path.join(rematch_dir, 'typing', '')
                    if not os.path.isdir(typing_dir):
                        os.makedirs(typing_dir)
                    run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
                    if run_successfully:
                        run_successfully = indexAlignment(bam_file)
                        if run_successfully:
                            reference_file = os.path.join(typing_dir, 'reference.fasta')
                            write_sequeces(reference_file, typing_sequences)
                            index_fasta_samtools(reference_file, None, None, True)
                            rematch_dir = str(typing_dir)
            if not run_successfully:
                # Without --noCheckPoint the generic failure exit below handles this case.
                if args.noCheckPoint:
                    abort('Something in the required TrueCoverage analysis went wrong')
        else:
            print('\n'
                  'WARNING: it was not found trueCoverage target files. trueCoverage will not run.'
                  '\n')

    if not run_successfully:
        abort('Something did not run successfully')

    config = parse_config(typing_config)
    # Command-line thresholds take precedence over the values of the typing config file.
    if args.minGeneCoverage is not None:
        config['minimum_gene_coverage'] = args.minGeneCoverage
    if args.minGeneIdentity is not None:
        config['minimum_gene_identity'] = args.minGeneIdentity

    _, run_successfully, sample_data_general, data_by_gene = \
        run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
                                config['length_extra_seq'], config['minimum_depth_presence'],
                                config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'],
                                config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                args.debug, args.doNotRemoveConsensus)
    if not run_successfully or data_by_gene is None:
        abort('ReMatCh run for pathotyping did not run successfully')

    if args.minGeneDepth is None:
        args.minGeneDepth = _default_min_gene_depth(sample_data_general['mean_sample_coverage'])

    _, _, _ = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'],
                            config['minimum_gene_identity'], args.minGeneDepth, args.outdir)

    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)

    print('\n')
    _ = utils.runTime(start_time)
示例#2
0
def _force_symlink(target, link_path):
    """Create the symlink ``link_path`` pointing to ``target``, replacing a symlink already at ``link_path``."""
    if os.path.islink(link_path):
        os.unlink(link_path)
    os.symlink(target, link_path)


def reads_subcommand(args):
    """Run the reads subcommand: prepare the reference files and run ReMatCh on the fastq files.

    The references come either from ``args.reference`` (user-provided fasta, Bowtie2 index and/or pickle
    files) or, when that is None, from the files returned by ``get_fasta_config(args.org)``; in the latter
    case several ReMatCh settings in ``args`` are overwritten with the values of the config file. All
    reference files are gathered (symlinked or created) under ``<outdir>/references/``.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line options. ``args.fastq`` and ``args.reference`` are replaced in place by lists
        of paths.

    Returns
    -------
    tuple
        ``(folders_2_remove, references_results, args.reference, references_headers)``: the folders created
        for this run, the ReMatCh results, the reference files used and, per reference file, a dict built
        from its header/gene list.
    """
    msg = []
    # if args.reference is None and args.org is None:
    #     argparse.ArgumentParser.error('--reference or --org must be provided')
    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        msg.append('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        msg.append('--minGeneIdentity should be a value between [0, 100]')

    if len(msg) > 0:
        # The error is reported on behalf of this (reads) subcommand.
        argparse.ArgumentParser(prog='reads subcommand options').error('\n'.join(msg))

    rematch_script = include_rematch_dependencies_path()

    utils.required_programs({'rematch.py': ['--version', '>=', '4.0.1']})

    # Only the paths of the already opened fastq files are needed: keep them and close the handles
    # instead of leaking them.
    fastq_handles = args.fastq
    args.fastq = [os.path.abspath(fastq.name) for fastq in fastq_handles]
    for fastq in fastq_handles:
        if fastq is not sys.stdin:
            fastq.close()

    folders_2_remove = []

    references_dir = os.path.join(args.outdir, 'references', '')
    if not os.path.isdir(references_dir):
        os.makedirs(references_dir)
    folders_2_remove.append(references_dir)

    references_headers = {}

    clean_run_rematch = True

    if args.reference is not None:

        clean_run_rematch = False

        args.reference = [os.path.abspath(reference) for reference in args.reference]

        # First pass: make sure that something usable exists for every reference before doing any work.
        reference_files = {}
        for reference in args.reference:
            fasta_file, index_files, pickle_file = check_reference_exist(reference)
            if not fasta_file and len(index_files) == 0 and not pickle_file:
                sys.exit('Missing reference fasta file, Bowtie2 index or pickle file for {}'.format(reference))
            reference_files[reference] = fasta_file, index_files, pickle_file

        references_to_use = []

        for reference in args.reference:

            fasta_found, index_files_found, pickle_found = reference_files[reference]

            reference_file = reference

            if pickle_found:
                # Create symlink to pickle file
                _force_symlink(reference_file + '.pkl',
                               os.path.join(references_dir, os.path.basename(reference_file + '.pkl')))
                # NOTE(review): the pickle comes from a user-provided path; unpickling untrusted files can
                # execute arbitrary code.
                header_gene_list, _ = utils.extractVariableFromPickle(reference_file + '.pkl')
            else:
                # NOTE(review): this is also reached when only Bowtie2 index files exist (no fasta file);
                # confirm that clean_headers_reference_file copes with a missing fasta file.
                new_reference_file, header_gene_list, seq_reference_dict = \
                    run_rematch.clean_headers_reference_file(reference_file=reference_file, outdir=references_dir)
                if new_reference_file != reference_file:
                    utils.Bcolors_print('WARNING: Sequences headers were renamed for {}'.format(reference),
                                        'WARNING')
                    reference_file = new_reference_file

                new_pickle_file = os.path.join(references_dir, os.path.basename(reference_file) + '.pkl')
                utils.saveVariableToPickle((header_gene_list, seq_reference_dict), new_pickle_file)

            # Only when the original reference is used as is (headers not renamed) do its files need to be
            # brought into references_dir.
            if reference_file == reference:

                if fasta_found:
                    # Create symlinks to reference file
                    symlink = os.path.join(references_dir, os.path.basename(reference_file))
                    _force_symlink(reference_file, symlink)
                    reference_file = symlink
                else:
                    # Create reference file if does not exist

                    # From index files
                    if len(index_files_found) > 0:
                        run_successfully, reference_file = \
                            run_rematch.run_bowtie_inspect(index_without_sufix=reference_file, outdir=references_dir)
                        if not run_successfully:
                            # Report the index given by the user: reference_file was just overwritten with
                            # the return value of the failed call.
                            sys.exit('Something went wrong while creating the reference fasta file from Bowtie2'
                                     ' index {}'.format(reference))

                    # From pickle file
                    elif pickle_found:
                        _, seq_reference_dict = utils.extractVariableFromPickle(reference_file + '.pkl')
                        reference_file = os.path.join(references_dir, os.path.basename(reference_file))
                        write_seq_from_sequence_dict(seq_reference_dict, reference_file)

                # Create symlinks to index files
                for index_file in index_files_found:
                    _force_symlink(index_file, os.path.join(references_dir, os.path.basename(index_file)))

            references_to_use.append(reference_file)
            references_headers[reference_file] = dict(header_gene_list)

        args.reference = references_to_use
    else:
        args.reference, config = get_fasta_config(args.org)

        references_to_use = []

        for reference in args.reference:
            new_reference_file, header_gene_list, seq_reference_dict = \
                run_rematch.clean_headers_reference_file(reference_file=reference, outdir=references_dir)

            if new_reference_file != reference:
                utils.Bcolors_print('WARNING: Sequences headers were renamed for {}'.format(reference), 'WARNING')
                used_reference = new_reference_file
            else:
                used_reference = os.path.join(references_dir, os.path.basename(reference))
                _force_symlink(reference, used_reference)

            references_to_use.append(used_reference)
            references_headers[used_reference] = dict(header_gene_list)
            utils.saveVariableToPickle((header_gene_list, seq_reference_dict),
                                       os.path.join(references_dir, os.path.basename(used_reference) + '.pkl'))

        args.reference = references_to_use

        # The settings of the organism config file replace the corresponding command-line values.
        config = parse_config(config)
        args.extraSeq = config['length_extra_seq']
        args.minCovPresence = config['minimum_depth_presence']
        args.minCovCall = config['minimum_depth_call']
        args.minGeneCoverage = config['minimum_gene_coverage']
        args.typeSeparator = '_'

        print('\n'
              'Settings that will be used:\n'
              '    reference: {reference}\n'
              '    extraSeq: {extraSeq}\n'
              '    minCovPresence: {minCovPresence}\n'
              '    minCovCall: {minCovCall}\n'
              '    minGeneCoverage: {minGeneCoverage}\n'
              '    Type separator character: {typeSeparator}'
              '\n'.format(reference=args.reference,
                          extraSeq=args.extraSeq,
                          minCovPresence=args.minCovPresence,
                          minCovCall=args.minCovCall,
                          minGeneCoverage=args.minGeneCoverage,
                          typeSeparator=args.typeSeparator))

    pickles_folder = os.path.join(args.outdir, 'pickles', '')

    # Run ReMatCh
    pickle_file = os.path.join(pickles_folder, 'rematch_module.pkl')
    if args.resume and os.path.isfile(pickle_file):
        # Reuse the results stored by a previous run instead of running ReMatCh again.
        print('ReMatCh module already run')
        references_results, module_dir = utils.extractVariableFromPickle(pickle_file)
        folders_2_remove.append(module_dir)
    else:
        _, references_results, module_dir = run_rematch.run_rematch(
            rematch_script,
            args.outdir,
            args.reference,
            args.fastq,
            args.threads,
            args.extraSeq,
            args.minCovPresence,
            args.minCovCall,
            args.minFrequencyDominantAllele,
            args.minGeneCoverage,
            args.minGeneIdentity,
            args.debug,
            args.doNotRemoveConsensus,
            args.bowtieAlgo,
            clean_run_rematch=clean_run_rematch)
        folders_2_remove.append(module_dir)
        utils.saveVariableToPickle([references_results, module_dir], pickle_file)

    return folders_2_remove, references_results, args.reference, references_headers