예제 #1
0
파일: fa2wes.py 프로젝트: zorrodong/PSiTE
def write_sample_normal(fout, rlen, args, normal_gsize, target_size):
    """Write the sample entries of the two normal haplotypes to ``fout``.

    The total number of reads is derived from ``args.normal_rdepth`` when it
    is positive (depth * target_size / (rlen * ontarget_ratio)); otherwise
    ``args.normal_rnum`` is used as is. The reads are distributed between
    ``normal.parental_0.fa`` and ``normal.parental_1.fa`` (looked up under
    ``args.normal``) in proportion to their genome sizes. A haplotype whose
    read number exceeds the per-entry cap is written as several numbered
    splits, each with its own random seed.

    Args:
        fout: writable text handle receiving the YAML-formatted entries.
        rlen: read length used for the depth -> read number conversion.
        args: parsed command line arguments; reads ``normal_rdepth``,
            ``normal_rnum``, ``ontarget_ratio`` and ``normal``.
        normal_gsize: size of the normal genome used as the denominator of
            each haplotype's proportion.
        target_size: size of the target region.

    Returns:
        The number of entries (splits) written.
    """
    if args.normal_rdepth > 0:
        total_rnum = int(
            (args.normal_rdepth * target_size) / (rlen * args.ontarget_ratio))
    else:
        total_rnum = args.normal_rnum
    logging.info(' Total number of reads to simulate for normal sample: %d',
                 total_rnum)
    # int() truncation can yield 0 for a small total_rnum (whenever
    # total_rnum * MAX_READFRAC < 1), which used to trigger a
    # ZeroDivisionError below. Allow at least one read per entry.
    max_readnum = max(1, int(total_rnum * MAX_READFRAC))

    total_num_splits = 0
    # two normal cell haplotypes
    for parental in 0, 1:
        ref = '{}/normal.parental_{}.fa'.format(args.normal, parental)
        proportion = genomesize(fasta=ref) / normal_gsize
        readnum = int(proportion * total_rnum)
        if readnum > max_readnum:
            num_splits = int(numpy.ceil(readnum / max_readnum))
            total_num_splits += num_splits
            # Identical for every split of this haplotype.
            split_proportion = proportion / num_splits
            split_readnum = int(numpy.ceil(readnum / num_splits))
            for split in range(1, num_splits + 1):
                fout.write('  normal_normal.parental_{}_{}:\n'.format(
                    parental, str(split)))
                fout.write('    gid: normal.parental_{}\n'.format(parental))
                fout.write('    proportion: {}\n'.format(
                    str(split_proportion)))
                fout.write('    split: {}\n'.format(str(split)))
                fout.write('    readnum: {}\n'.format(str(split_readnum)))
                # One seed is drawn per split.
                seed = random_int()
                fout.write('    seed: {}\n'.format(str(seed)))
        else:
            total_num_splits += 1
            fout.write('  normal_normal.parental_{}:\n'.format(parental))
            fout.write('    gid: normal.parental_{}\n'.format(parental))
            fout.write('    proportion: {}\n'.format(str(proportion)))
            fout.write('    readnum: {}\n'.format(str(readnum)))
            seed = random_int()
            fout.write('    seed: {}\n'.format(str(seed)))

    return total_num_splits
예제 #2
0
def main(progname=None):
    """Run the all-in-one pipeline by chaining the individual modules.

    Parses the command line, validates the option combination, creates (or
    re-uses, when ``--start`` > 1) the output directory, changes into it and
    then launches the modules vcf2fa, phylovar, chain2fa and fa2wgs/fa2wes as
    subprocesses of ``sys.argv[0]``.

    All user supplied input paths are converted to absolute paths *before*
    the working directory is changed, so that relative paths given on the
    command line are still valid for the sub-commands.

    Args:
        progname: program name shown in the usage message; defaults to
            ``sys.argv[0]``.

    Raises:
        argparse.ArgumentTypeError: on an invalid combination of options.
        OutputExistsError: if the output directory already exists when
            starting from step 1.
        subprocess.CalledProcessError: if one of the modules fails.
    """
    parser = argparse.ArgumentParser(
        description=
        'an all-in-one wrapper for NGS reads simulation for tumor samples',
        prog=progname if progname else sys.argv[0])
    group0 = parser.add_argument_group('Global arguments')
    group1 = parser.add_argument_group('Module vcf2fa arguments')
    group2 = parser.add_argument_group('Module phylovar arguments')
    group0.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        metavar='DIR',
                        help='output directory')
    group1.add_argument('-v',
                        '--vcf',
                        type=check_vcf,
                        required=True,
                        metavar='FILE',
                        help='a vcf file contains germline variants')
    group1.add_argument('-r',
                        '--reference',
                        type=check_file,
                        required=True,
                        metavar='FILE',
                        help='a fasta file of the reference genome')
    group2.add_argument('-t',
                        '--tree',
                        type=check_file,
                        required=True,
                        metavar='FILE',
                        help='a newick file contains ONE tree')
    group2.add_argument(
        '-c',
        '--config',
        type=check_file,
        required=True,
        metavar='FILE',
        help=
        'a YAML file which contains the configuration of somatic variant simulation'
    )
    group1.add_argument(
        '-a',
        '--autosomes',
        type=check_autosomes,
        required=True,
        metavar='STR',
        help='autosomes of the genome (e.g. 1,2,3,4,5 or 1..4,5)')
    default = None
    group2.add_argument(
        '--affiliation',
        type=check_file,
        default=default,
        metavar='FILE',
        help=
        'a file containing sector affiliation of the cells in the sample [{}]'.
        format(default))
    default = None
    group2.add_argument(
        '--cnvl_dist',
        type=check_file,
        default=default,
        metavar='FILE',
        help="a file containing the distribution profile of CNVs' length [{}]".
        format(default))
    default = 'WGS'
    group0.add_argument(
        '--type',
        type=str,
        default=default,
        choices=['WGS', 'WES', 'BOTH'],
        help='sequencing type to simulate [{}]'.format(default))
    default = 1
    group0.add_argument(
        '--cores',
        type=int,
        default=default,
        metavar='INT',
        help='number of cores used to run the program [{}]'.format(default))
    default = None
    group0.add_argument(
        '--random_seed',
        type=check_seed,
        default=default,
        metavar='INT',
        help=
        'the seed for random number generator (an integer between 0 and 2**31-1) [{}]'
        .format(default))
    default = 'allinone.log'
    group0.add_argument(
        '--log',
        type=str,
        default=default,
        metavar='FILE',
        help='the log file to save the settings of each command [{}]'.format(
            default))
    default = 1
    group0.add_argument(
        '--start',
        type=int,
        default=default,
        choices=[1, 2, 3, 4],
        help='the serial number of the module from which to start. \
            1: vcf2fa; 2: phylovar; 3: chain2fa; 4: fa2wgs/fa2wes [{}]'.format(
            default))
    default = None
    group1.add_argument(
        '-s',
        '--sex_chr',
        type=check_sex,
        default=default,
        metavar='STR',
        help='sex chromosomes of the genome (separated by comma) [{}]'.format(
            default))
    default = 0.05
    group2.add_argument(
        '-x',
        '--prune',
        type=check_prune,
        default=default,
        metavar='FLOAT',
        help=
        'trim all the children of the nodes with equal or less than this proportion of total leaves [{}]'
        .format(default))
    default = None
    group2.add_argument(
        '--trunk_vars',
        type=str,
        default=default,
        metavar='FILE',
        help='a file containing truncal variants predefined by user [{}]'.
        format(default))
    default = 0
    group2.add_argument('--trunk_length',
                        type=float,
                        default=default,
                        metavar='FLOAT',
                        help='the length of the trunk [{}]'.format(default))
    group3 = parser.add_argument_group('Arguments for module fa2wgs/fa2wes')
    default = 0.6
    group3.add_argument(
        '-p',
        '--purity',
        type=check_purity,
        default=default,
        metavar='FLOAT',
        help='the proportion of tumor cells in simulated tumor sample [{}]'.
        format(default))
    default = None
    group3.add_argument(
        '--sectors',
        type=check_file,
        default=default,
        metavar='FILE',
        help='the file contains purity and depth profile of each tumor sector. \
              After this setting, -d/-p will be ignored. [{}]'.format(default))
    default = 150
    group3.add_argument(
        '--rlen',
        type=int,
        default=default,
        metavar='INT',
        help="the length of reads to simulate [{}]".format(default))
    group3.add_argument('--separate',
                        action="store_true",
                        help="keep each tip node's NGS reads file separately")
    group3.add_argument(
        '--single',
        action="store_true",
        help=
        "single cell mode. After this setting, the value of --tumor_depth/--tumor_rdepth \
            is the depth of each tumor cell (not the total depth of tumor sample anymore)"
    )
    group4 = parser.add_argument_group('Module fa2wgs arguments')
    default = 50
    group4.add_argument(
        '-d',
        '--tumor_depth',
        type=check_depth,
        default=default,
        metavar='FLOAT',
        help=
        'the mean depth of tumor sample for fa2wgs to simulate NGS reads [{}]'.
        format(default))
    default = 0
    group4.add_argument(
        '-D',
        '--normal_depth',
        type=check_depth,
        default=default,
        metavar='FLOAT',
        help=
        'the mean depth of normal sample for fa2wgs to simulate NGS reads [{}]'
        .format(default))
    default = 'art_illumina --noALN --quiet --paired --mflen 500 --sdev 20'
    group4.add_argument(
        '--art',
        type=str,
        default=default,
        metavar='STR',
        help="the parameters for ART program ['{}']".format(default))
    group5 = parser.add_argument_group('Module fa2wes arguments')
    default = None
    group5.add_argument(
        '--probe',
        metavar='FILE',
        type=check_file,
        default=default,
        help='The file containing the probe sequences (FASTA format) [{}]'.
        format(default))
    default = None
    group5.add_argument(
        '--target',
        metavar='FILE',
        type=str,
        default=default,
        help='The Target file containing the target regions (BED format)')
    default = 0
    group5sub1 = group5.add_mutually_exclusive_group()
    group5sub2 = group5.add_mutually_exclusive_group()
    group5sub1.add_argument(
        '--tumor_rdepth',
        type=check_depth,
        default=default,
        metavar='FLOAT',
        help=
        'the mean depth of tumor sample for fa2wes to simulate NGS reads [{}]'.
        format(default))
    default = 0
    group5sub1.add_argument(
        '--tumor_rnum',
        metavar='INT',
        type=int,
        default=default,
        help='The number of short reads to simulate for tumor sample [{}]'.
        format(default))
    default = 0
    group5sub2.add_argument(
        '--normal_rdepth',
        type=check_depth,
        default=default,
        metavar='FLOAT',
        help=
        'The mean depth of normal sample for fa2wes to simulate NGS reads [{}]'
        .format(default))
    default = 0
    group5sub2.add_argument(
        '--normal_rnum',
        metavar='INT',
        type=int,
        default=default,
        help='The number of short reads to simulate for normal sample [{}]'.
        format(default))
    default = 'wessim'
    group5.add_argument(
        '--simulator',
        default=default,
        choices=['wessim', 'capgem'],
        action=TargetAction,
        help=
        'The whole-exome sequencing simulator used for simulating short reads [{}]'
        .format(default))
    default = RATIO_WESSIM
    group5.add_argument(
        '--ontarget_ratio',
        metavar='FLOAT',
        type=float,
        default=default,
        help=
        'The percentage that simulated reads are expected to be from the target regions. \
            It is dependent on the simulator. The default value is {} for wessim and {} for \
            capgem [{}]'.format(RATIO_WESSIM, RATIO_CAPGEM, default))
    default = None
    group5.add_argument(
        '--error_model',
        metavar='FILE',
        type=check_file,
        default=default,
        help=
        'The file containing the empirical error model for NGS reads generated by GemErr \
            (It must be provided when capgem or wessim is used for simulation) [{}]'
        .format(default))
    default = "snakemake --rerun-incomplete -k --latency-wait 120"
    group5.add_argument(
        '--snakemake',
        metavar='STR',
        type=check_snakemake,
        default=default,
        help=
        "The snakemake command used for calling a whole-exome sequencing simulator. \
            The Snakefile for a simulator is under the directory 'wes/config' of the source code. \
            Additional parameters for a simulator can be adjusted in the Snakefile ['{}']"
        .format(default))
    default = 2
    group5.add_argument(
        '--out_level',
        type=int,
        choices=[0, 1, 2],
        default=default,
        help=
        "The level used to indicate how many intermediate output files are kept. \
            Level 0: keep all the files. \
            Level 1: keep files that are necessary for rerunning simulation \
                     ('config', 'genome_index', 'mapping', 'merged', and 'separate'). \
            Level 2: keep only final results ('merged' and 'separate') [{}]".
        format(default))

    args = parser.parse_args()
    if args.prune and args.single:
        raise argparse.ArgumentTypeError(
            "Can not prune the tree in single cell mode! Set '--prune 0' if you want to simulate single cell data."
        )
    # The parsed YAML is only needed for validation; the sub-commands get the
    # path of the file (see `config` below).
    with open(args.config, 'r') as configfile:
        config_data = yaml.safe_load(configfile)
    check_config_file(config=config_data)
    if args.type in ['WES', 'BOTH']:
        if args.probe is None:
            raise argparse.ArgumentTypeError(
                "--probe is required to simulate WES data!")
        if args.target is None:
            raise argparse.ArgumentTypeError(
                "--target is required to simulate WES data!")
        if args.tumor_rdepth != 0 and args.tumor_rnum != 0:
            raise argparse.ArgumentTypeError(
                "--tumor_rdepth is not allowed to use together with --tumor_rnum!"
            )
        if args.normal_rdepth != 0 and args.normal_rnum != 0:
            raise argparse.ArgumentTypeError(
                "--normal_rdepth is not allowed to use together with --normal_rnum!"
            )
        check_program(args.simulator)

    # Get absolute paths for the input files. This must happen before the
    # os.chdir() below, otherwise relative paths from the command line would
    # be resolved against the output directory by the sub-commands.
    reference = os.path.abspath(args.reference)
    vcf = os.path.abspath(args.vcf)
    tree = os.path.abspath(args.tree)
    config = os.path.abspath(args.config)
    if args.trunk_vars:
        trunk_vars = os.path.abspath(args.trunk_vars)
    if args.affiliation:
        affiliation = os.path.abspath(args.affiliation)
    if args.cnvl_dist:
        cnvl_dist = os.path.abspath(args.cnvl_dist)
    if args.sectors:
        sectors = os.path.abspath(args.sectors)
    # The fa2wes inputs were previously forwarded as typed, which broke
    # relative paths once the working directory had changed.
    probe = os.path.abspath(args.probe) if args.probe else None
    target = os.path.abspath(args.target) if args.target else None
    error_model = (os.path.abspath(args.error_model)
                   if args.error_model else None)
    outdir = args.output
    if args.start == 1:
        try:
            os.mkdir(outdir, mode=0o755)
        except FileExistsError as e:
            raise OutputExistsError(
                "'{}' already exists. Try another directory to output! (-o/--output)"
                .format(outdir)) from e
    else:
        assert os.path.isdir(outdir),"Couldn't start from step {}, ".format(args.start)+\
            "because I can not find the directory of previous results: '{}'.".format(outdir)
    os.chdir(outdir)

    ###### logging and random seed setting
    logging.basicConfig(filename=args.log if args.start == 1 else args.log +
                        '.start' + str(args.start),
                        filemode='w',
                        format='[%(asctime)s] %(levelname)s: %(message)s',
                        datefmt='%m-%d %H:%M:%S',
                        level='INFO')
    # Quote the multi-word option values so that the logged command can be
    # copied back into a shell.
    argv_copy = sys.argv[:]
    if '--art' in argv_copy:
        art_index = argv_copy.index('--art')
        argv_copy[art_index + 1] = "'{}'".format(argv_copy[art_index + 1])
    if '--snakemake' in argv_copy:
        snakemake_index = argv_copy.index('--snakemake')
        argv_copy[snakemake_index + 1] = "'{}'".format(
            argv_copy[snakemake_index + 1])
    argv_copy.insert(1, 'allinone')
    logging.info(' Command: %s', ' '.join(argv_copy))

    if args.random_seed is None:
        seed = random_int()
    else:
        seed = args.random_seed
    logging.info(' Random seed: %s', seed)
    numpy.random.seed(seed)

    #subfolders
    normal_fa = 'normal_fa'
    tumor_fa = 'tumor_fa'
    tumor_chain = 'tumor_chain'
    #map file
    map_dir = 'map'

    #vcf2fa
    if args.start < 2:
        cmd_params = [
            sys.argv[0], 'vcf2fa', '--vcf', vcf, '--reference', reference,
            '--output', normal_fa, '--autosomes', args.autosomes
        ]
        if args.sex_chr:
            cmd_params.extend(['--sex_chr', args.sex_chr])
        logging.info(' Command: %s', ' '.join(cmd_params))
        subprocess.run(args=cmd_params, check=True)

    #phylovar
    #I place random_int() here as I do not want to skip it in any situation.
    #Without this, you can not replicate the result with different --start setting.
    random_n = random_int()
    if args.start < 3:
        # Remove the results of a previous run before regenerating them.
        if os.path.isdir(tumor_chain):
            shutil.rmtree(tumor_chain)
        elif os.path.isfile(tumor_chain):
            os.remove(tumor_chain)
        cmd_params = [
            sys.argv[0], 'phylovar', '--tree', tree, '--config', config,
            '--purity',
            str(args.purity), '--prune',
            str(args.prune), '--random_seed',
            str(random_n), '--map', map_dir, '--chain', tumor_chain
        ]
        if args.sex_chr:
            cmd_params.extend(['--sex_chr', args.sex_chr])
        if args.trunk_vars:
            cmd_params.extend(['--trunk_vars', trunk_vars])
        if args.affiliation:
            cmd_params.extend(['--affiliation', affiliation])
        if args.cnvl_dist:
            cmd_params.extend(['--cnvl_dist', cnvl_dist])
        if args.trunk_length:
            cmd_params.extend(['--trunk_length', str(args.trunk_length)])
        logging.info(' Command: %s', ' '.join(cmd_params))
        subprocess.run(args=cmd_params, check=True)

    #chain2fa
    if args.start < 4:
        if os.path.isdir(tumor_fa):
            shutil.rmtree(tumor_fa)
        elif os.path.isfile(tumor_fa):
            os.remove(tumor_fa)

        cmd_params = [
            sys.argv[0], 'chain2fa', '--chain', tumor_chain, '--normal',
            ','.join([
                os.path.join(normal_fa, 'normal.parental_{}.fa'.format(x))
                for x in (0, 1)
            ]), '--cores',
            str(args.cores), '--output', tumor_fa
        ]
        logging.info(' Command: %s', ' '.join(cmd_params))
        subprocess.run(args=cmd_params, check=True)

    #fa2wgs
    # Drawn unconditionally, like the one above, so that the random stream
    # does not depend on --type.
    random_n = random_int()
    if args.type in ['WGS', 'BOTH']:
        reads_dir = 'wgs_reads'
        if os.path.isdir(reads_dir):
            shutil.rmtree(reads_dir)
        elif os.path.isfile(reads_dir):
            os.remove(reads_dir)
        cmd_params = [
            sys.argv[0], 'fa2wgs', '--normal', normal_fa, '--tumor', tumor_fa,
            '--map', map_dir, '--normal_depth',
            str(args.normal_depth), '--output', reads_dir, '--random_seed',
            str(random_n), '--cores',
            str(args.cores), '--rlen',
            str(args.rlen), '--art', args.art
        ]
        if args.sectors:
            cmd_params.extend(['--sectors', sectors])
        else:
            cmd_params.extend(['--tumor_depth', str(args.tumor_depth)])
            cmd_params.extend(['--purity', str(args.purity)])
        if args.single:
            cmd_params.extend(['--single'])
        # Only the logged copy gets the quotes; subprocess receives the
        # raw value as a single argument.
        cmd_params_copy = cmd_params[:]
        art_index = cmd_params_copy.index('--art')
        cmd_params_copy[art_index + 1] = "'{}'".format(
            cmd_params_copy[art_index + 1])
        logging.info(' Command: %s', ' '.join(cmd_params_copy))
        subprocess.run(args=cmd_params, check=True)

    #fa2wes
    random_n = random_int()
    if args.type in ['WES', 'BOTH']:
        reads_dir = 'wes_reads'
        cmd_params = [
            sys.argv[0], 'fa2wes', '--normal', normal_fa, '--tumor', tumor_fa,
            '--map', map_dir, '--probe', probe, '--target', target,
            '--simulator', args.simulator, '--ontarget_ratio',
            str(args.ontarget_ratio), '--rlen',
            str(args.rlen), '--purity',
            str(args.purity), '--output', reads_dir, '--random_seed',
            str(random_n), '--cores',
            str(args.cores), '--out_level',
            str(args.out_level), '--snakemake', args.snakemake
        ]
        if args.sectors:
            cmd_params.extend(['--sectors', sectors])
        if args.tumor_rdepth:
            cmd_params.extend(['--tumor_rdepth', str(args.tumor_rdepth)])
        elif args.tumor_rnum:
            cmd_params.extend(['--tumor_rnum', str(args.tumor_rnum)])
        if args.normal_rdepth:
            cmd_params.extend(['--normal_rdepth', str(args.normal_rdepth)])
        elif args.normal_rnum:
            cmd_params.extend(['--normal_rnum', str(args.normal_rnum)])
        if args.error_model:
            cmd_params.extend(['--error_model', error_model])
        if args.separate:
            cmd_params.extend(['--separate'])
        if args.single:
            cmd_params.extend(['--single'])
        cmd_params_copy = cmd_params[:]
        snakemake_index = cmd_params_copy.index('--snakemake')
        snakemake_str = cmd_params_copy[snakemake_index + 1]
        # The logged value is wrapped in single quotes, so inner single
        # quotes are swapped for double quotes first.
        if "'" in snakemake_str:
            snakemake_str = snakemake_str.replace("'", '"')
        cmd_params_copy[snakemake_index + 1] = "'{}'".format(snakemake_str)
        logging.info(' Command: %s', ' '.join(cmd_params_copy))
        subprocess.run(args=cmd_params, check=True)
예제 #3
0
파일: fa2wes.py 프로젝트: zorrodong/PSiTE
def main(progname=None):
    """Entry point of fa2wes: simulate targeted capture sequencing reads.

    Parses the command line, sets up logging and the random seed, creates the
    output folder, selects the Snakefile of the requested simulator and then
    simulates the tumor and/or normal sample, depending on which of
    ``--tumor_rdepth``/``--tumor_rnum`` and ``--normal_rdepth``/
    ``--normal_rnum`` are positive. When neither sample is requested only a
    log message is emitted.

    Args:
        progname: program name shown in the usage message and in the final
            timing line; defaults to ``sys.argv[0]``.

    Raises:
        OutputExistsError: if the output path exists and is not a directory.
        argparse.ArgumentTypeError: if capgem is selected but the program
            'capsim' cannot be found on PATH.
        AssertionError: if the Snakefile of the simulator is missing.
    """
    t0 = time.time()
    prog = progname if progname else sys.argv[0]
    parser = argparse.ArgumentParser(
        description=
        'a wrapper of simulating targeted capture sequencing from reference genome files',
        prog=prog)

    group1 = parser.add_argument_group('Input arguments')
    group1.add_argument(
        '-n',
        '--normal',
        metavar='DIR',
        type=check_folder,
        required=True,
        help='The directory of the fasta files of normal genomes')
    group1.add_argument(
        '-t',
        '--tumor',
        metavar='DIR',
        type=check_folder,
        required=True,
        help='The directory of the fasta files of tumor genomes')
    group1.add_argument(
        '-m',
        '--map',
        type=check_folder,
        required=True,
        metavar='DIR',
        help=
        'The directory of map files, which contains the relationship between tip nodes and samples'
    )
    default = None
    group1.add_argument(
        '-s',
        '--sectors',
        type=check_file,
        default=default,
        metavar='FILE',
        help=
        'The file containing purity and depth profile of each tumor sector. \
              After this setting, -d/-D/-p will be ignored [{}]'.format(
            default))
    group1.add_argument(
        '--probe',
        metavar='FILE',
        type=check_file,
        required=True,
        help='The Probe file containing the probe sequences (FASTA format)')
    group1.add_argument(
        '--target',
        metavar='FILE',
        type=check_file,
        required=True,
        help='The Target file containing the target regions (BED format)')
    default = None
    group1.add_argument(
        '--error_model',
        metavar='FILE',
        type=check_file,
        help=
        'The file containing the empirical error model for NGS reads generated by GemErr (It must be provided when capgem or wessim is used for simulation) [{}]'
        .format(default))

    group2 = parser.add_argument_group('Arguments for simulation')
    default = 0.6
    group2.add_argument(
        '-p',
        '--purity',
        metavar='FLOAT',
        type=check_purity,
        default=default,
        help='The proportion of tumor cells in simulated sample [{}]'.format(
            default))
    default = 150
    group2.add_argument('--rlen',
                        metavar='INT',
                        type=int,
                        default=default,
                        help='Illumina: read length [{}]'.format(default))
    group2.add_argument('--single_end',
                        action='store_true',
                        help='Simulating single-end reads')
    group = group2.add_mutually_exclusive_group()
    default = 0
    group.add_argument(
        '-d',
        '--tumor_rdepth',
        metavar='FLOAT',
        type=check_depth,
        default=default,
        help='The mean depth of tumor sample for simulating short reads [{}]'.
        format(default))
    default = 0
    group.add_argument(
        '-r',
        '--tumor_rnum',
        metavar='INT',
        type=int,
        default=default,
        help='The number of short reads to simulate for tumor sample [{}]'.
        format(default))
    group = group2.add_mutually_exclusive_group()
    default = 0
    group.add_argument(
        '-D',
        '--normal_rdepth',
        metavar='FLOAT',
        type=check_depth,
        default=default,
        help='The mean depth of normal sample for simulating short reads [{}]'.
        format(default))
    default = 0
    group.add_argument(
        '-R',
        '--normal_rnum',
        metavar='INT',
        type=int,
        default=default,
        help='The number of short reads to simulate for normal sample [{}]'.
        format(default))
    default = None
    group2.add_argument(
        '--random_seed',
        metavar='INT',
        type=check_seed,
        help='The seed for random number generator [{}]'.format(default))
    default = 'wessim'
    group2.add_argument(
        '--simulator',
        default=default,
        choices=['wessim', 'capgem'],
        action=TargetAction,
        type=check_program,
        help=
        'The whole-exome sequencing simulator used for simulating short reads [{}]'
        .format(default))
    default = RATIO_WESSIM
    group2.add_argument(
        '--ontarget_ratio',
        metavar='FLOAT',
        type=float,
        default=default,
        help=
        'The percentage that simulated reads are expected to be from the target regions. It is dependent on the simulator. The default value is {} for wessim and {} for capgem [{}]'
        .format(RATIO_WESSIM, RATIO_CAPGEM, default))
    group2.add_argument(
        '--single',
        action='store_true',
        help=
        'single cell mode. After this setting, -p will be ignored and the value of --tumor_rdepth and --tumor_rnum are for each tumor cell (not the whole tumor sample anymore)'
    )
    default = "snakemake --rerun-incomplete -k --latency-wait 120"
    group2.add_argument(
        '--snakemake',
        metavar='STR',
        type=check_snakemake,
        default=default,
        help=
        "The snakemake command used for calling a whole-exome sequencing simulator. The Snakefile for a simulator is under the directory 'wes/config' of the source code. Additional parameters for a simulator can be adjusted in the Snakefile ['{}']"
        .format(default))
    default = 1
    group2.add_argument(
        '--cores',
        type=int,
        default=default,
        metavar='INT',
        help=
        "The number of cores used to run the program (including snakemake). If '--cores' or '--jobs' or '-j' is specified in the options of snakemake, the value specified by '--cores' here will be ignored when snakemake is called [{}]"
        .format(default))

    group3 = parser.add_argument_group('Output arguments')
    default = 'wes_reads'
    group3.add_argument('-o',
                        '--output',
                        metavar='DIR',
                        type=str,
                        default=default,
                        help='The output directory [{}]'.format(default))
    default = 'fa2wes.log'
    group3.add_argument(
        '-g',
        '--log',
        metavar='FILE',
        type=str,
        default=default,
        help='The log file to save the settings of each command [{}]'.format(
            default))
    default = 2
    group3.add_argument(
        '--out_level',
        type=int,
        choices=[0, 1, 2],
        default=default,
        help=
        "The level used to indicate how many intermediate output files are kept. \
                       Level 0: keep all the files.\
                       Level 1: keep files that are necessary for rerunning simulation ('config', 'genome_index', 'mapping', 'merged', and 'separate'). \
                       Level 2: keep only final results ('merged' and 'separate') [{}]"
        .format(default))
    group3.add_argument('--separate',
                        action='store_true',
                        help='Output the reads of each genome separately')

    args = parser.parse_args()
    check_normal_fa(args.normal)

    # logging and random seed setting
    logging.basicConfig(filename=args.log,
                        filemode='w',
                        format='[%(asctime)s] %(levelname)s: %(message)s',
                        datefmt='%m-%d %H:%M:%S',
                        level='INFO')
    argv_copy = sys.argv[:]
    try:
        snakemake_index = argv_copy.index('--snakemake')
        # Single quotes are required for the snakemake command
        snakemake_str = argv_copy[snakemake_index + 1]
        if "'" in snakemake_str:
            snakemake_str = snakemake_str.replace("'", '"')
        argv_copy[snakemake_index + 1] = "'{}'".format(snakemake_str)
    except ValueError:
        # '--snakemake' was not given on the command line; nothing to quote.
        pass
    argv_copy.insert(1, 'fa2wes')
    logging.info(' Command: %s', ' '.join(argv_copy))

    if args.random_seed is None:
        seed = random_int()
    else:
        seed = args.random_seed
    logging.info(' Ontarget ratio: %s', str(args.ontarget_ratio))
    logging.info(' Random seed: %d', seed)
    numpy.random.seed(seed)

    # Create output folders
    if os.path.exists(args.output):
        if not os.path.isdir(args.output):
            raise OutputExistsError(
                "A file in the name of '{}' exists.\nDelete it or try another name as output folder."
                .format(args.output))
    else:
        os.makedirs(args.output, mode=0o755)

    # For paired-end reads the length used in the depth calculation is the
    # length of both mates together.
    if args.single_end:
        rlen = args.rlen
    else:
        rlen = args.rlen * 2

    wes_dir = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                           'wes')
    # Add path variables
    if args.simulator == 'capsim':  # Not exposed to user for simplicity
        snake_file = os.path.join(wes_dir, 'config/Snakefile_capsim')
    elif args.simulator == 'wessim':
        snake_file = os.path.join(wes_dir, 'config/Snakefile_wessim')
        wessim_dir = os.path.join(wes_dir, 'wessim')
        os.environ['PATH'] += os.pathsep + wessim_dir
    else:  # capgem
        snake_file = os.path.join(wes_dir, 'config/Snakefile_capgem')
        capgem_dir = os.path.join(wes_dir, 'capgem')
        if os.path.exists(os.path.join(capgem_dir, 'bin')):
            os.environ['PATH'] += os.pathsep + os.path.join(capgem_dir, 'bin')
        os.environ['PATH'] += os.pathsep + os.path.join(capgem_dir, 'src')
        # Ensure that capsim is installed. A dedicated name is used here so
        # that `prog` keeps the program name for the timing report below.
        required_prog = 'capsim'
        if shutil.which(required_prog) is None:
            raise argparse.ArgumentTypeError(
                "Cannot find program '{}'. Please ensure that you have installed it!"
                .format(required_prog))
    assert os.path.isfile(
        snake_file
    ), 'Cannot find Snakefile {} under the program directory:\n'.format(
        snake_file)

    normal_gsize = compute_normal_gsize(args.normal)
    target_size = compute_target_size(args.target)
    logging.info(' Size of target region: %s bp', str(target_size))

    simulate_tumor = args.tumor_rdepth > 0 or args.tumor_rnum > 0
    simulate_normal = args.normal_rdepth > 0 or args.normal_rnum > 0

    # Simulate normal and tumor sample at the same time
    if simulate_tumor and simulate_normal:
        sectors = parse_sectors(args)
        check_tumor_fa(args.tumor, sectors, args.simulator)

        outdir = os.path.abspath(args.output)
        configdir = os.path.join(outdir, 'config')
        os.makedirs(configdir, exist_ok=True)

        sample_file = os.path.join(outdir, 'config/sample.yaml')
        total_num_splits = prepare_yaml_all(sample_file, rlen, args, sectors,
                                            normal_gsize, target_size)
        logging.info(' Number of splits in simulation: %d', total_num_splits)

        run_snakemake(outdir, args, sample_file, snake_file)
        merge_normal_sample(args, outdir)
        merge_tumor_sample(args, sectors, outdir)
        clean_output(args.out_level, outdir)

    # Separate the simulation of tumor and normal samples
    elif simulate_tumor:
        sectors = parse_sectors(args)
        check_tumor_fa(args.tumor, sectors, args.simulator)

        outdir = os.path.join(os.path.abspath(args.output), "tumor")
        configdir = os.path.join(outdir, 'config')
        # Creates `outdir` as well when it is missing.
        os.makedirs(configdir, exist_ok=True)

        sample_file = os.path.join(outdir, 'config/sample.yaml')
        total_num_splits = prepare_yaml_tumor(sample_file, rlen, args, sectors,
                                              normal_gsize, target_size)
        logging.info(' Number of splits in simulation: %d', total_num_splits)

        run_snakemake(outdir, args, sample_file, snake_file)
        merge_tumor_sample(args, sectors, outdir)
        clean_output(args.out_level, outdir)

    elif simulate_normal:
        outdir = os.path.join(os.path.abspath(args.output), 'normal')
        configdir = os.path.join(outdir, 'config')
        # Creates `outdir` as well when it is missing.
        os.makedirs(configdir, exist_ok=True)

        sample_file = os.path.join(outdir, 'config/sample.yaml')
        total_num_splits = prepare_yaml_normal(sample_file, rlen, args,
                                               normal_gsize, target_size)
        logging.info(' Number of splits in simulation: %d', total_num_splits)

        run_snakemake(outdir, args, sample_file, snake_file)
        merge_normal_sample(args, outdir)
        clean_output(args.out_level, outdir)
    else:
        logging.info('Please specify sequencing depth!')

    t1 = time.time()
    print("Total time running {}: {} seconds".format(prog, str(t1 - t0)))
예제 #4
0
파일: fa2wes.py 프로젝트: zorrodong/PSiTE
def _write_genome_entries(fout, entry_name, gid, cell_proportion, proportion,
                          readnum, max_readnum):
    """Write the sample-config entries for one haplotype genome.

    When ``readnum`` exceeds ``max_readnum`` the genome is divided into
    ``ceil(readnum / max_readnum)`` entries named ``<entry_name>_<split>``
    (1-based), each carrying an equal share of ``proportion`` and
    ``ceil(readnum / num_splits)`` reads.  Otherwise a single entry named
    ``entry_name`` is written.  Every entry gets its own seed drawn from
    ``random_int()``, one draw per entry in writing order.

    Args:
        fout: object with a ``write(str)`` method receiving the YAML text.
        entry_name: key of the entry (without the split suffix).
        gid: genome identifier written under ``gid``.
        cell_proportion: value written under ``cell_proportion`` (never
            divided among splits).
        proportion: share of the reads assigned to this genome.
        readnum: number of reads assigned to this genome.
        max_readnum: largest read count allowed in a single entry.

    Returns:
        The number of entries written (1 when no split was needed).
    """
    if readnum > max_readnum:
        num_splits = int(numpy.ceil(readnum / max_readnum))
        split_readnum = int(numpy.ceil(readnum / num_splits))
        for split in range(1, num_splits + 1):
            fout.write('  {}_{}:\n'.format(entry_name, split))
            fout.write('    gid: {}\n'.format(gid))
            fout.write('    cell_proportion: {}\n'.format(cell_proportion))
            fout.write('    proportion: {}\n'.format(proportion / num_splits))
            fout.write('    split: {}\n'.format(split))
            fout.write('    readnum: {}\n'.format(split_readnum))
            fout.write('    seed: {}\n'.format(random_int()))
        return num_splits

    fout.write('  {}:\n'.format(entry_name))
    fout.write('    gid: {}\n'.format(gid))
    fout.write('    cell_proportion: {}\n'.format(cell_proportion))
    fout.write('    proportion: {}\n'.format(proportion))
    fout.write('    readnum: {}\n'.format(readnum))
    fout.write('    seed: {}\n'.format(random_int()))
    return 1


def write_sample_tumor(fout, rlen, args, sectors, normal_gsize, target_size):
    """Write the per-genome entries of every tumor sector to ``fout``.

    For each sector (in sorted name order) the total read number is derived
    from the sector depth (or taken from ``args.tumor_rnum`` when the depth
    is not positive) and distributed over the genomes of the sample: the two
    normal haplotypes (skipped when ``args.single`` is set) followed by the
    two haplotypes of every tip node in sorted order.  A genome whose read
    number exceeds ``total_rnum * MAX_READFRAC`` is split into several
    entries.

    Args:
        fout: object with a ``write(str)`` method receiving the YAML text.
        rlen: read length used to convert depth into a read number.
        args: parsed options; ``single``, ``tumor``, ``normal``,
            ``ontarget_ratio`` and ``tumor_rnum`` are read here.
        sectors: dict mapping a sector name to a dict with the keys
            ``'composition'`` (tip node -> number of leaves), ``'purity'``
            and ``'depth'``.
        normal_gsize: genome size used to normalise the size of each normal
            haplotype.
        target_size: size of the target region used in the read-number
            formula.

    Returns:
        The total number of entries written over all sectors.
    """
    total_num_splits = 0
    for sector in sorted(sectors.keys()):
        tipnode_leaves = sectors[sector]['composition']
        if not args.single:
            # Purity is the fraction of tumor cells, so the total cell count
            # follows from the number of tumor cells in the sector.
            tumor_cells = sum(tipnode_leaves.values())
            total_cells = tumor_cells / sectors[sector]['purity']
            normal_cells = total_cells - tumor_cells
            logging.info(' Number of total cells in tumor sample %s: %.2f',
                         sector, total_cells)
            logging.info(' Number of normal cells in tumor sample %s: %.2f',
                         sector, normal_cells)

        # Only the per-tipnode genome sizes are needed here; the second
        # returned value is not used by this function.
        tipnode_gsize, _ = compute_tumor_dna(args.tumor, tipnode_leaves)

        depth = sectors[sector]['depth']
        if depth > 0:
            total_rnum = int(
                (depth * target_size) / (rlen * args.ontarget_ratio))
        else:
            total_rnum = args.tumor_rnum
        logging.info(
            ' Total number of reads to simulate for tumor sample %s: %s',
            sector, total_rnum)
        max_readnum = int(total_rnum * MAX_READFRAC)

        # two normal cell haplotypes, only generated under non-single mode
        if not args.single:
            cell_proportion = normal_cells / total_cells
            for parental in 0, 1:
                ref = '{}/normal.parental_{}.fa'.format(args.normal, parental)
                proportion = cell_proportion * \
                    genomesize(fasta=ref) / normal_gsize
                total_num_splits += _write_genome_entries(
                    fout,
                    '{}_normal.parental_{}'.format(sector, parental),
                    'normal.parental_{}'.format(parental),
                    cell_proportion,
                    proportion,
                    int(proportion * total_rnum),
                    max_readnum)

        # tumor cells haplotypes
        for tipnode in sorted(tipnode_leaves.keys()):
            if args.single:
                cell_proportion = 1
            else:
                cell_proportion = tipnode_leaves[tipnode] / total_cells
            for parental in 0, 1:
                # tipnode_gsize[tipnode] is indexed as [parental 0,
                # parental 1, 2]; index 2 is used as the normaliser.
                proportion = cell_proportion * \
                    tipnode_gsize[tipnode][parental] / \
                    tipnode_gsize[tipnode][2]
                total_num_splits += _write_genome_entries(
                    fout,
                    '{}_{}.parental_{}'.format(sector, tipnode, parental),
                    '{}.parental_{}'.format(tipnode, parental),
                    cell_proportion,
                    proportion,
                    int(proportion * total_rnum),
                    max_readnum)
    return total_num_splits
예제 #5
0
파일: fa2wgs.py 프로젝트: zorrodong/PSiTE
def _run_in_pool(cores,func,arg_list):
    """Call ``func(*args)`` for every item of ``arg_list`` on a process pool.

    All jobs are submitted in order, the pool is closed and joined, and then
    every result is fetched so that an exception raised inside a worker is
    re-raised in the caller.  The pool is always torn down, even when a
    submission or a worker fails.
    """
    with multiprocessing.Pool(processes=cores) as pool:
        results=[pool.apply_async(func,args=tuple(x)) for x in arg_list]
        pool.close()
        pool.join()
        for result in results:
            result.get()

def main(progname=None):
    """Command-line entry point: simulate WGS reads for normal/tumor samples.

    Steps, in order:
      1. parse the command line and configure logging and the numpy seed;
      2. build the ``sectors`` dict (purity, depth and tip-node composition
         of every tumor sector) from the sectors file or from the map files;
      3. run ``build_fai`` on every normal/tumor fasta;
      4. create the output folders;
      5. collect one simulation config per haplotype fasta;
      6. cut the configs into blocks of at most 2% of the total simulated
         bases and run ``generate_fq`` on them in parallel;
      7. merge the per-block fastq files with ``merge_fq``.

    Args:
        progname: program name shown in the usage text and in the timing
            message; ``sys.argv[0]`` is used when it is empty.

    Raises:
        SystemExit: when the depth of the normal sample and of every sector
            is 0.
        OutputExistsError: when the output path is a file, or when a
            per-sample output folder already exists.
        AssertionError: when a map file or fasta is missing, when single
            mode is used with tip nodes representing more than one cell, or
            when nothing is left to simulate.
    """
    t0 = time.time()
    prog = progname if progname else sys.argv[0]
    parser=argparse.ArgumentParser(
        description='A wrapper of simulating WGS reads from normal and tumor genome fasta',
        prog=prog)
    group1 = parser.add_argument_group('Input arguments')
    group1.add_argument('-n','--normal',type=check_folder,required=True,metavar='DIR',
        help='the directory of the normal fasta')
    group1.add_argument('-t','--tumor',type=check_folder,required=True,metavar='DIR',
        help='the directory of the tumor fasta')
    group1.add_argument('-m','--map',type=check_folder,required=True,metavar='DIR',
        help='the directory of map files, which contains the relationship between tip nodes and samples')
    default=None
    group1.add_argument('-s','--sectors',type=check_file,default=default,metavar='FILE',
        help='the file containing purity and depth profile of each tumor sector. \
              After this setting, -d/-p will be ignored [{}]'.format(default))
    group2 = parser.add_argument_group('Arguments for simulation')
    default=50
    group2.add_argument('-d','--tumor_depth',type=check_depth,default=default,metavar='FLOAT',
        help='the mean depth of tumor sample for ART to simulate WGS reads [{}]'.format(default))
    default=0
    group2.add_argument('-D','--normal_depth',type=check_depth,default=default,metavar='FLOAT',
        help='the mean depth of normal sample for ART to simulate WGS reads [{}]'.format(default))
    default=0.6
    group2.add_argument('-p','--purity',type=check_purity,default=default,metavar='FLOAT',
        help='the proportion of tumor cells in simulated tumor sample [{}]'.format(default))
    default=None
    group2.add_argument('--random_seed',type=check_seed,metavar='INT',
        help='the seed for random number generator [{}]'.format(default))
    default=150
    group2.add_argument('--rlen',type=int,default=default,metavar='INT',
        help="the length of reads to simulate [{}]".format(default))
    default='art_illumina --noALN --quiet --paired --mflen 500 --sdev 20'
    group2.add_argument('--art',type=str,default=default,metavar='STR',
        help="the parameters for ART program ['{}']".format(default))
    default=1
    group2.add_argument('--cores',type=int,default=default,metavar='INT',
        help='number of cores used to run the program [{}]'.format(default))
    group2.add_argument('--separate',action="store_true",
        help="keep each tip node's WGS reads file separately")
    group2.add_argument('--single',action="store_true",
        help="single cell mode. "+\
        "After this setting,  -p will be ignored and the value of --tumor_depth is the depth of each tumor cell "+\
        "(not the total depth of tumor sample anymore).")
    group3 = parser.add_argument_group('Output arguments')
    default='art_reads'
    group3.add_argument('-o','--output',type=str,default=default,metavar='DIR',
        help='output directory [{}]'.format(default))
    default='fa2wgs.log'
    group3.add_argument('-g','--log',type=str,default=default,metavar='FILE',
        help='the log file to save the settings of each command [{}]'.format(default))
    args=parser.parse_args()

    # always compress the simulated fastq files
    compress=True

    # logging and random seed setting
    logging.basicConfig(filename=args.log,
        filemode='w',format='[%(asctime)s] %(levelname)s: %(message)s',
        datefmt='%m-%d %H:%M:%S',level='INFO')
    # Log the command line in a copy-pastable form: the value of --art
    # contains spaces, so it is quoted again.
    argv_copy=sys.argv[:]
    if '--art' in argv_copy:
        art_index=argv_copy.index('--art')
        argv_copy[art_index+1]="'{}'".format(argv_copy[art_index+1])
    argv_copy.insert(1,'fa2wgs')
    logging.info(' Command: %s',' '.join(argv_copy))
    if args.random_seed is None:
        seed=random_int()
    else:
        seed=args.random_seed
    logging.info(' Random seed: %s',seed)
    numpy.random.seed(seed)

    # construct the sectors dictionary to store the meta information of all tumor sectors
    sectors={}
    if args.sectors is not None:
        sectors=read_sectors_file(f=args.sectors)
        for sector in sectors:
            mapfile=os.path.join(args.map,'{}.tipnode.map'.format(sector))
            assert os.path.isfile(mapfile),\
                "Couldn't find the map file ({}.tipnode.map) for sector '{}' ".format(sector,sector)+\
                "under the map directory ({}).".format(os.path.abspath(args.map))
    else:
        # No sectors file: every '<sector>.tipnode.map' found in the map
        # directory defines a sector using the command-line purity/depth.
        mapfiles=glob.glob(os.path.join(args.map,'*.tipnode.map'))
        infered_sectors=['.'.join(os.path.basename(x).split('.')[:-2]) for x in mapfiles]
        for sector in infered_sectors:
            sectors[sector]={'purity':args.purity,'depth':args.tumor_depth}
    for sector in sectors:
        mapfile=os.path.join(args.map,'{}.tipnode.map'.format(sector))
        sectors[sector]['composition']=tipnode_leaves_counting(f=mapfile)

    # exit the program if you do NOT want to simulate any reads for normal and tumor samples
    if args.normal_depth==0 and not any(sectors[sector]['depth']!=0 for sector in sectors):
        sys.exit('Do nothing as the depth for each sample is 0!')

    # single cell mode or bulk tumor mode
    if args.single:
        for sector in sectors:
            for tipnode,leaves_n in sectors[sector]['composition'].items():
                assert leaves_n==1,\
                    'In single mode, each tip node should represent only one cell.\n'+\
                    'But {} leaves are found underneath tipnode {} in one of your map files!'.format(leaves_n,tipnode)

    # create index file (.fai) for each fasta
    # All fasta files are checked before any indexing job is started, so a
    # missing file does not leave a half-used pool behind.
    tipnodes=set()
    for sector in sectors:
        tipnodes=tipnodes.union(set(sectors[sector]['composition'].keys()))
    fai_jobs=[]
    for parental in 0,1:
        fasta=os.path.join(args.normal,'normal.parental_{}.fa'.format(parental))
        assert os.path.isfile(fasta),\
            "Couldn't find {} under the normal directory: {}".format(fasta,args.normal)
        fai_jobs.append((fasta,))
        for tipnode in tipnodes:
            fasta=os.path.join(args.tumor,'{}.parental_{}.fa'.format(tipnode,parental))
            assert os.path.isfile(fasta),\
                "Couldn't find {} under the tumor directory: {}".format(fasta,args.tumor)
            fai_jobs.append((fasta,))
    _run_in_pool(args.cores,build_fai,fai_jobs)

    # create output folders
    if not os.path.exists(args.output):
        os.mkdir(args.output,mode=0o755)
    elif not os.path.isdir(args.output):
        raise OutputExistsError("A FILE in the name of '{}' exists.\nDelete it or try another name as output folder.".format(args.output))
    normal_dir=os.path.join(args.output,'normal')
    if args.normal_depth>0:
        try:
            os.mkdir(normal_dir,mode=0o755)
        except FileExistsError as e:
            raise OutputExistsError("'{}' exists already! \nCan not use it as the output folder of normal WGS reads.".format(normal_dir)+
                '\nDelete it or use another folder as output folder.') from e

    # collect simulation parameters first
    params_matrix=[]
    total_sim_bases=0
    art_params=args.art

    # collect genome size for each genome
    normal_gsize=0
    for parental in 0,1:
        normal_gsize+=genomesize(fasta=os.path.join(args.normal,'normal.parental_{}.fa'.format(parental)))
    tipnode_gsize={}
    for tipnode in tipnodes:
        # The value of tipnode_gsize[tipnode] is a list of three elements:
        # 0) genomesize of parental 0
        # 1) genomesize of parental 1
        # 2) the sum of parental 0 and 1
        tipnode_gsize[tipnode]=[]
        for parental in 0,1:
            tipnode_gsize[tipnode].append(genomesize(fasta=os.path.join(args.tumor,'{}.parental_{}.fa'.format(tipnode,parental))))
        tipnode_gsize[tipnode].append(tipnode_gsize[tipnode][0]+tipnode_gsize[tipnode][1])

    # simulation for normal sample
    if args.normal_depth>0:
        for parental in 0,1:
            prefix=os.path.join(normal_dir,'normal.parental_{}.'.format(parental))
            # each haplotype receives half of the requested depth
            fcov=args.normal_depth/2
            ref=os.path.join(args.normal,'normal.parental_{}.fa'.format(parental))
            sim_cfg={
                'gsize':normal_gsize/2,
                'base_cmd':art_params,
                'rlen':args.rlen,
                'fcov':fcov,
                'in':ref,
                'out':prefix,
                'id':'nm_prt{}'.format(parental)}
            params_matrix.append(sim_cfg)
            total_sim_bases+=sim_cfg['gsize']*sim_cfg['fcov']

    # simulation for tumor sample
    for sector in sorted(sectors.keys()):
        if sectors[sector]['depth']>0:
            # compute coverage and run ART
            sector_dir=os.path.join(args.output,sector)
            try:
                os.mkdir(sector_dir,mode=0o755)
            except FileExistsError as e:
                raise OutputExistsError("'{}' exists already! \nCan not use it as the output folder of tumor WGS reads.".format(sector_dir)+
                    '\nDelete it or use another folder as output folder.') from e

            tipnode_leaves=sectors[sector]['composition']
            sector_sim_bases=normal_gsize/2*sectors[sector]['depth']

            # Purity and the cell mixture only matter in bulk mode; in single
            # mode they are skipped so that the (ignored) purity value can
            # not break the run.
            if not args.single:
                tumor_cells=sum(tipnode_leaves.values())
                total_cells=tumor_cells/sectors[sector]['purity']
                normal_cells=total_cells-tumor_cells
                normal_dna=normal_gsize*normal_cells
                tumor_dna=0
                for tipnode,leaves_n in tipnode_leaves.items():
                    tumor_dna+=tipnode_gsize[tipnode][2]*leaves_n
                mean_depth_per_base=sector_sim_bases/(normal_dna+tumor_dna)

                # two normal cell haplotypes
                for parental in 0,1:
                    prefix=os.path.join(sector_dir,'normal.parental_{}.'.format(parental))
                    fcov=normal_cells*mean_depth_per_base
                    ref=os.path.join(args.normal,'normal.parental_{}.fa'.format(parental))
                    sim_cfg={
                        'gsize':normal_gsize/2,
                        'base_cmd':art_params,
                        'rlen':args.rlen,
                        'fcov':fcov,
                        'in':ref,
                        'out':prefix,
                        'id':'nm_prt{}'.format(parental)}
                    params_matrix.append(sim_cfg)
                    total_sim_bases+=sim_cfg['gsize']*sim_cfg['fcov']

            # tumor cells haplotypes
            for tipnode in sorted(tipnode_leaves.keys()):
                if args.single:
                    fcov=sector_sim_bases/tipnode_gsize[tipnode][2]
                else:
                    fcov=tipnode_leaves[tipnode]*mean_depth_per_base
                for parental in 0,1:
                    ref=os.path.join(args.tumor,'{}.parental_{}.fa'.format(tipnode,parental))
                    prefix=os.path.join(sector_dir,'{}.parental_{}.'.format(tipnode,parental))
                    sim_cfg={
                        'gsize':tipnode_gsize[tipnode][parental],
                        'base_cmd':art_params,
                        'rlen':args.rlen,
                        'fcov':fcov,
                        'in':ref,
                        'out':prefix,
                        'id':'{}_prt{}'.format(tipnode,parental)}
                    params_matrix.append(sim_cfg)
                    total_sim_bases+=sim_cfg['gsize']*sim_cfg['fcov']

    # generate fastq and compress them parallelly
    # every job will generate at most 2 percent of the total data you want to simulate
    # In order to let users replicate the results (with same random seed) even using
    # different number of cores, a fixed block size is used to parallelize the program.
    assert total_sim_bases>0,'The genome sizes of all cells in the sample is 0!'
    sizeBlock=total_sim_bases*0.02
    final_params_matrix=[]
    for cfg in params_matrix:
        n=math.ceil(cfg['gsize']*cfg['fcov']/sizeBlock)
        if n==0:
            continue
        # the coverage of the genome is shared equally by its n blocks
        cfg['fcov']=round(cfg['fcov']/n,6)
        for i in range(n):
            block_cfg=cfg.copy()
            block_cfg['out']=cfg['out']+'{:03d}.'.format(i)
            block_cfg['id']=cfg['id']+'_{:03d}-'.format(i)
            # one seed per block, drawn in a fixed order
            block_cfg['rndSeed']=str(random_int())
            final_params_matrix.append(block_cfg)
    _run_in_pool(args.cores,generate_fq,[(x,compress) for x in final_params_matrix])

    # merge small fastq files into one fastq for normal/tumor sample
    # each item of sample_fq_files is [target file, sorted list of source files]
    sample_fq_files=[]
    suffixes=['fq','1.fq','2.fq']
    if compress:
        suffixes=[x+'.gz' for x in suffixes]
    if args.normal_depth>0:
        for suffix in suffixes:
            prefix=os.path.join(normal_dir,'normal.parental_[01].[0-9][0-9][0-9].')
            source=glob.glob(prefix+suffix)
            if source:
                target=os.path.join(normal_dir,'normal.{}'.format(suffix))
                source.sort()
                sample_fq_files.append([target,source])
    for sector in sorted(sectors.keys()):
        if sectors[sector]['depth']>0:
            sector_dir=os.path.join(args.output,sector)
            tipnode_leaves=sectors[sector]['composition']
            for suffix in suffixes:
                if args.single or args.separate:
                    # one merged file per genome (normal and every tip node)
                    for tipnode in ['normal']+sorted(tipnode_leaves.keys()):
                        prefix=os.path.join(sector_dir,'{}.parental_[01].[0-9][0-9][0-9].'.format(tipnode))
                        source=glob.glob(prefix+suffix)
                        if source:
                            target=os.path.join(sector_dir,'{}.{}'.format(tipnode,suffix))
                            source.sort()
                            sample_fq_files.append([target,source])
                else:
                    # one merged file for the whole sector
                    prefix=os.path.join(sector_dir,'*.parental_[01].[0-9][0-9][0-9].')
                    source=glob.glob(prefix+suffix)
                    if source:
                        target=os.path.join(sector_dir,'{}.{}'.format(sector,suffix))
                        source.sort()
                        sample_fq_files.append([target,source])
    _run_in_pool(args.cores,merge_fq,sample_fq_files)

    t1 = time.time()
    print ("Total time running {}: {} seconds".format
       (prog, str(t1-t0)))