def write_sample_normal(fout, rlen, args, normal_gsize, target_size):
    """Write the sample-file entries for the two normal haplotypes.

    The total number of reads is derived from ``args.normal_rdepth`` when it
    is positive; otherwise ``args.normal_rnum`` is used as is.  Each parental
    genome receives a share of the reads proportional to its size, and a
    genome whose share exceeds ``MAX_READFRAC`` of the total is divided into
    several entries, each with its own random seed.

    Args:
        fout: Writable text handle of the sample file.
        rlen: Read length used to convert a depth into a number of reads.
        args: Parsed command-line arguments; ``normal``, ``normal_rdepth``,
            ``normal_rnum`` and ``ontarget_ratio`` are read.
        normal_gsize: Denominator of the per-haplotype proportion
            (presumably the combined size of both normal haplotypes --
            TODO confirm against the caller).
        target_size: Size of the target region.

    Returns:
        The number of entries written to ``fout``.
    """
    if args.normal_rdepth > 0:
        total_rnum = int((args.normal_rdepth * target_size) /
                         (rlen * args.ontarget_ratio))
    else:
        total_rnum = args.normal_rnum
    logging.info(' Total number of reads to simulate for normal sample: %d',
                 total_rnum)
    # Clamp to 1: for a small total the cap rounds down to 0, and the
    # ceil(readnum / cap) below would then divide by zero.
    max_readnum = max(1, int(total_rnum * MAX_READFRAC))

    # NOTE(review): the leading whitespace of the literals written below is
    # reproduced as found; confirm the nesting expected by the reader of the
    # sample file.
    total_num_splits = 0
    # two normal cell haplotypes
    for parental in 0, 1:
        gid = 'normal.parental_{}'.format(parental)
        ref = '{}/{}.fa'.format(args.normal, gid)
        proportion = genomesize(fasta=ref) / normal_gsize
        readnum = int(proportion * total_rnum)
        if readnum > max_readnum:
            num_splits = int(numpy.ceil(readnum / max_readnum))
            split_readnum = int(numpy.ceil(readnum / num_splits))
            total_num_splits += num_splits
            for split in range(1, num_splits + 1):
                fout.write(' normal_{}_{}:\n'.format(gid, str(split)))
                fout.write(' gid: {}\n'.format(gid))
                fout.write(' proportion: {}\n'.format(
                    str(proportion / num_splits)))
                fout.write(' split: {}\n'.format(str(split)))
                fout.write(' readnum: {}\n'.format(str(split_readnum)))
                # One seed per entry, drawn in writing order so that a fixed
                # global seed reproduces the same file.
                seed = random_int()
                fout.write(' seed: {}\n'.format(str(seed)))
        else:
            total_num_splits += 1
            fout.write(' normal_{}:\n'.format(gid))
            fout.write(' gid: {}\n'.format(gid))
            fout.write(' proportion: {}\n'.format(str(proportion)))
            fout.write(' readnum: {}\n'.format(str(readnum)))
            seed = random_int()
            fout.write(' seed: {}\n'.format(str(seed)))
    return total_num_splits
def main(progname=None):
    """Run the complete simulation pipeline from a single command line.

    The modules are launched one after another as subprocesses of the form
    ``<program> <module> ...`` from inside the output directory:

    1. ``vcf2fa``   -- build the normal genome from the reference and VCF
    2. ``phylovar`` -- simulate somatic variants along the tree
    3. ``chain2fa`` -- build the tumor genomes
    4. ``fa2wgs`` and/or ``fa2wes`` -- simulate the reads (``--type``)

    ``--start`` selects the first module to run; earlier results are then
    expected to exist in the output directory already.

    Args:
        progname: Program name shown in the usage message; defaults to
            ``sys.argv[0]``.

    Raises:
        argparse.ArgumentTypeError: For inconsistent option combinations
            (pruning in single-cell mode, missing ``--probe``/``--target``
            for WES, depth and read number given together).
        OutputExistsError: If the output directory already exists when
            starting from step 1.
        FileNotFoundError: If the output directory of a previous run is
            missing when starting from a later step.
        subprocess.CalledProcessError: If one of the modules fails.
    """
    parser = argparse.ArgumentParser(
        description=
        'an all-in-one wrapper for NGS reads simulation for tumor samples',
        prog=progname if progname else sys.argv[0])
    group0 = parser.add_argument_group('Global arguments')
    group1 = parser.add_argument_group('Module vcf2fa arguments')
    group2 = parser.add_argument_group('Module phylovar arguments')
    group0.add_argument('-o', '--output', type=str, required=True,
                        metavar='DIR', help='output directory')
    group1.add_argument('-v', '--vcf', type=check_vcf, required=True,
                        metavar='FILE',
                        help='a vcf file contains germline variants')
    group1.add_argument('-r', '--reference', type=check_file, required=True,
                        metavar='FILE',
                        help='a fasta file of the reference genome')
    group2.add_argument('-t', '--tree', type=check_file, required=True,
                        metavar='FILE',
                        help='a newick file contains ONE tree')
    group2.add_argument(
        '-c', '--config', type=check_file, required=True, metavar='FILE',
        help='a YAML file which contains the configuration of somatic '
        'variant simulation')
    group1.add_argument(
        '-a', '--autosomes', type=check_autosomes, required=True,
        metavar='STR',
        help='autosomes of the genome (e.g. 1,2,3,4,5 or 1..4,5)')
    default = None
    group2.add_argument(
        '--affiliation', type=check_file, default=default, metavar='FILE',
        help=('a file containing sector affiliation of the cells in the '
              'sample [{}]').format(default))
    default = None
    group2.add_argument(
        '--cnvl_dist', type=check_file, default=default, metavar='FILE',
        help=("a file containing the distribution profile of CNVs' length "
              "[{}]").format(default))
    default = 'WGS'
    group0.add_argument(
        '--type', type=str, default=default, choices=['WGS', 'WES', 'BOTH'],
        help='sequencing type to simulate [{}]'.format(default))
    default = 1
    group0.add_argument(
        '--cores', type=int, default=default, metavar='INT',
        help='number of cores used to run the program [{}]'.format(default))
    default = None
    group0.add_argument(
        '--random_seed', type=check_seed, default=default, metavar='INT',
        help=('the seed for random number generator (an integer between 0 '
              'and 2**31-1) [{}]').format(default))
    default = 'allinone.log'
    group0.add_argument(
        '--log', type=str, default=default, metavar='FILE',
        help='the log file to save the settings of each command [{}]'.format(
            default))
    # The multi-sentence help texts below are spelled with implicit string
    # concatenation; argparse collapses runs of whitespace when it renders
    # help, so the displayed text is the same as with line continuations.
    default = 1
    group0.add_argument(
        '--start', type=int, default=default, choices=[1, 2, 3, 4],
        help=('the serial number of the module from which to start. '
              '1: vcf2fa; 2: phylovar; 3: chain2fa; 4: fa2wgs/fa2wes [{}]'
              ).format(default))
    default = None
    group1.add_argument(
        '-s', '--sex_chr', type=check_sex, default=default, metavar='STR',
        help='sex chromosomes of the genome (separated by comma) [{}]'.format(
            default))
    default = 0.05
    group2.add_argument(
        '-x', '--prune', type=check_prune, default=default, metavar='FLOAT',
        help=('trim all the children of the nodes with equal or less than '
              'this proportion of total leaves [{}]').format(default))
    default = None
    group2.add_argument(
        '--trunk_vars', type=str, default=default, metavar='FILE',
        help=('a file containing truncal variants predefined by user [{}]'
              ).format(default))
    default = 0
    group2.add_argument('--trunk_length', type=float, default=default,
                        metavar='FLOAT',
                        help='the length of the trunk [{}]'.format(default))
    group3 = parser.add_argument_group('Arguments for module fa2wgs/fa2wes')
    default = 0.6
    group3.add_argument(
        '-p', '--purity', type=check_purity, default=default,
        metavar='FLOAT',
        help=('the proportion of tumor cells in simulated tumor sample [{}]'
              ).format(default))
    default = None
    group3.add_argument(
        '--sectors', type=check_file, default=default, metavar='FILE',
        help=('the file contains purity and depth profile of each tumor '
              'sector. After this setting, -d/-p will be ignored. [{}]'
              ).format(default))
    default = 150
    group3.add_argument(
        '--rlen', type=int, default=default, metavar='INT',
        help="the length of reads to simulate [{}]".format(default))
    group3.add_argument('--separate', action="store_true",
                        help="keep each tip node's NGS reads file separately")
    group3.add_argument(
        '--single', action="store_true",
        help="single cell mode. After this setting, the value of "
        "--tumor_depth/--tumor_rdepth is the depth of each tumor cell "
        "(not the total depth of tumor sample anymore)")
    group4 = parser.add_argument_group('Module fa2wgs arguments')
    default = 50
    group4.add_argument(
        '-d', '--tumor_depth', type=check_depth, default=default,
        metavar='FLOAT',
        help=('the mean depth of tumor sample for fa2wgs to simulate NGS '
              'reads [{}]').format(default))
    default = 0
    group4.add_argument(
        '-D', '--normal_depth', type=check_depth, default=default,
        metavar='FLOAT',
        help=('the mean depth of normal sample for fa2wgs to simulate NGS '
              'reads [{}]').format(default))
    default = 'art_illumina --noALN --quiet --paired --mflen 500 --sdev 20'
    group4.add_argument(
        '--art', type=str, default=default, metavar='STR',
        help="the parameters for ART program ['{}']".format(default))
    group5 = parser.add_argument_group('Module fa2wes arguments')
    default = None
    group5.add_argument(
        '--probe', metavar='FILE', type=check_file, default=default,
        help=('The file containing the probe sequences (FASTA format) [{}]'
              ).format(default))
    default = None
    group5.add_argument(
        '--target', metavar='FILE', type=str, default=default,
        help='The Target file containing the target regions (BED format)')
    default = 0
    group5sub1 = group5.add_mutually_exclusive_group()
    group5sub2 = group5.add_mutually_exclusive_group()
    group5sub1.add_argument(
        '--tumor_rdepth', type=check_depth, default=default,
        metavar='FLOAT',
        help=('the mean depth of tumor sample for fa2wes to simulate NGS '
              'reads [{}]').format(default))
    default = 0
    group5sub1.add_argument(
        '--tumor_rnum', metavar='INT', type=int, default=default,
        help=('The number of short reads to simulate for tumor sample [{}]'
              ).format(default))
    default = 0
    group5sub2.add_argument(
        '--normal_rdepth', type=check_depth, default=default,
        metavar='FLOAT',
        help=('The mean depth of normal sample for fa2wes to simulate NGS '
              'reads [{}]').format(default))
    default = 0
    group5sub2.add_argument(
        '--normal_rnum', metavar='INT', type=int, default=default,
        help=('The number of short reads to simulate for normal sample [{}]'
              ).format(default))
    default = 'wessim'
    group5.add_argument(
        '--simulator', default=default, choices=['wessim', 'capgem'],
        action=TargetAction,
        help=('The whole-exome sequencing simulator used for simulating '
              'short reads [{}]').format(default))
    default = RATIO_WESSIM
    group5.add_argument(
        '--ontarget_ratio', metavar='FLOAT', type=float, default=default,
        help=('The percentage that simulated reads are expected to be from '
              'the target regions. It is dependent on the simulator. '
              'The default value is {} for wessim and {} for capgem [{}]'
              ).format(RATIO_WESSIM, RATIO_CAPGEM, default))
    default = None
    group5.add_argument(
        '--error_model', metavar='FILE', type=check_file, default=default,
        help=('The file containing the empirical error model for NGS reads '
              'generated by GemErr (It must be provided when capgem or '
              'wessim is used for simulation) [{}]').format(default))
    default = "snakemake --rerun-incomplete -k --latency-wait 120"
    group5.add_argument(
        '--snakemake', metavar='STR', type=check_snakemake, default=default,
        help=("The snakemake command used for calling a whole-exome "
              "sequencing simulator. The Snakefile for a simulator is under "
              "the directory 'wes/config' of the source code. Additional "
              "parameters for a simulator can be adjusted in the Snakefile "
              "['{}']").format(default))
    default = 2
    group5.add_argument(
        '--out_level', type=int, choices=[0, 1, 2], default=default,
        help=("The level used to indicate how many intermediate output "
              "files are kept. Level 0: keep all the files. "
              "Level 1: keep files that are necessary for rerunning "
              "simulation ('config', 'genome_index', 'mapping', 'merged', "
              "and 'separate'). Level 2: keep only final results ('merged' "
              "and 'separate') [{}]").format(default))
    args = parser.parse_args()

    if args.prune and args.single:
        raise argparse.ArgumentTypeError(
            "Can not prune the tree in single cell mode! Set '--prune 0' if you want to simulate single cell data."
        )
    with open(args.config, 'r') as configfile:
        config = yaml.safe_load(configfile)
    check_config_file(config=config)
    if args.type in ['WES', 'BOTH']:
        if args.probe is None:
            raise argparse.ArgumentTypeError(
                "--probe is required to simulate WES data!")
        if args.target is None:
            raise argparse.ArgumentTypeError(
                "--target is required to simulate WES data!")
        if args.tumor_rdepth != 0 and args.tumor_rnum != 0:
            raise argparse.ArgumentTypeError(
                "--tumor_rdepth is not allowed to use together with --tumor_rnum!"
            )
        if args.normal_rdepth != 0 and args.normal_rnum != 0:
            raise argparse.ArgumentTypeError(
                "--normal_rdepth is not allowed to use together with --normal_rnum!"
            )
        check_program(args.simulator)

    # Everything that names a file must be made absolute *before* the
    # os.chdir() below, otherwise relative paths given by the user no longer
    # resolve.  This includes the WES inputs and the program itself.
    reference = os.path.abspath(args.reference)
    vcf = os.path.abspath(args.vcf)
    tree = os.path.abspath(args.tree)
    config_path = os.path.abspath(args.config)
    trunk_vars = os.path.abspath(args.trunk_vars) if args.trunk_vars else None
    affiliation = (os.path.abspath(args.affiliation)
                   if args.affiliation else None)
    cnvl_dist = os.path.abspath(args.cnvl_dist) if args.cnvl_dist else None
    sectors = os.path.abspath(args.sectors) if args.sectors else None
    probe = os.path.abspath(args.probe) if args.probe is not None else None
    target = os.path.abspath(args.target) if args.target is not None else None
    error_model = (os.path.abspath(args.error_model)
                   if args.error_model else None)
    # A program name with a directory component is resolved relative to the
    # working directory; a bare name is looked up on PATH and is left alone.
    program = sys.argv[0]
    if os.path.dirname(program):
        program = os.path.abspath(program)

    outdir = args.output
    if args.start == 1:
        try:
            os.mkdir(outdir, mode=0o755)
        except FileExistsError as e:
            raise OutputExistsError(
                "'{}' already exists. Try another directory to output! (-o/--output)"
                .format(outdir)) from e
    elif not os.path.isdir(outdir):
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        raise FileNotFoundError(
            "Couldn't start from step {}, ".format(args.start) +
            "because I can not find the directory of previous results: '{}'."
            .format(outdir))
    os.chdir(outdir)

    ###### logging and random seed setting
    logging.basicConfig(filename=args.log if args.start == 1 else args.log +
                        '.start' + str(args.start),
                        filemode='w',
                        format='[%(asctime)s] %(levelname)s: %(message)s',
                        datefmt='%m-%d %H:%M:%S',
                        level='INFO')
    # Log the original command line, quoting the multi-word option values so
    # that the logged command can be pasted back into a shell.
    argv_copy = sys.argv[:]
    for flag in ('--art', '--snakemake'):
        if flag in argv_copy:
            value_index = argv_copy.index(flag) + 1
            argv_copy[value_index] = "'{}'".format(argv_copy[value_index])
    argv_copy.insert(1, 'allinone')
    logging.info(' Command: %s', ' '.join(argv_copy))
    seed = random_int() if args.random_seed is None else args.random_seed
    logging.info(' Random seed: %s', seed)
    numpy.random.seed(seed)

    def _remove_stale(path):
        """Delete a directory or file left over from a previous run."""
        if os.path.isdir(path):
            shutil.rmtree(path)
        elif os.path.isfile(path):
            os.remove(path)

    def _run(cmd, shown=None):
        """Log a command (optionally a display variant) and execute it."""
        logging.info(' Command: %s', ' '.join(cmd if shown is None else shown))
        subprocess.run(args=cmd, check=True)

    #subfolders
    normal_fa = 'normal_fa'
    tumor_fa = 'tumor_fa'
    tumor_chain = 'tumor_chain'
    #map file
    map_dir = 'map'

    #vcf2fa
    if args.start < 2:
        cmd_params = [
            program, 'vcf2fa', '--vcf', vcf, '--reference', reference,
            '--output', normal_fa, '--autosomes', args.autosomes
        ]
        if args.sex_chr:
            cmd_params.extend(['--sex_chr', args.sex_chr])
        _run(cmd_params)

    #phylovar
    # random_int() is called unconditionally: skipping it for some --start
    # values would shift the random stream and make results irreproducible
    # across different --start settings.
    random_n = random_int()
    if args.start < 3:
        _remove_stale(tumor_chain)
        cmd_params = [
            program, 'phylovar', '--tree', tree, '--config', config_path,
            '--purity', str(args.purity), '--prune', str(args.prune),
            '--random_seed', str(random_n), '--map', map_dir, '--chain',
            tumor_chain
        ]
        if args.sex_chr:
            cmd_params.extend(['--sex_chr', args.sex_chr])
        if args.trunk_vars:
            cmd_params.extend(['--trunk_vars', trunk_vars])
        if args.affiliation:
            cmd_params.extend(['--affiliation', affiliation])
        if args.cnvl_dist:
            cmd_params.extend(['--cnvl_dist', cnvl_dist])
        if args.trunk_length:
            cmd_params.extend(['--trunk_length', str(args.trunk_length)])
        _run(cmd_params)

    #chain2fa
    if args.start < 4:
        _remove_stale(tumor_fa)
        cmd_params = [
            program, 'chain2fa', '--chain', tumor_chain, '--normal',
            ','.join([
                os.path.join(normal_fa, 'normal.parental_{}.fa'.format(x))
                for x in (0, 1)
            ]), '--cores', str(args.cores), '--output', tumor_fa
        ]
        _run(cmd_params)

    #fa2wgs
    random_n = random_int()
    if args.type in ['WGS', 'BOTH']:
        reads_dir = 'wgs_reads'
        _remove_stale(reads_dir)
        cmd_params = [
            program, 'fa2wgs', '--normal', normal_fa, '--tumor', tumor_fa,
            '--map', map_dir, '--normal_depth', str(args.normal_depth),
            '--output', reads_dir, '--random_seed', str(random_n),
            '--cores', str(args.cores), '--rlen', str(args.rlen), '--art',
            args.art
        ]
        if args.sectors:
            cmd_params.extend(['--sectors', sectors])
        else:
            cmd_params.extend(['--tumor_depth', str(args.tumor_depth)])
            cmd_params.extend(['--purity', str(args.purity)])
        if args.single:
            cmd_params.extend(['--single'])
        cmd_params_copy = cmd_params[:]
        art_index = cmd_params_copy.index('--art')
        cmd_params_copy[art_index + 1] = "'{}'".format(
            cmd_params_copy[art_index + 1])
        _run(cmd_params, shown=cmd_params_copy)

    #fa2wes
    random_n = random_int()
    if args.type in ['WES', 'BOTH']:
        reads_dir = 'wes_reads'
        cmd_params = [
            program, 'fa2wes', '--normal', normal_fa, '--tumor', tumor_fa,
            '--map', map_dir, '--probe', probe, '--target', target,
            '--simulator', args.simulator, '--ontarget_ratio',
            str(args.ontarget_ratio), '--rlen', str(args.rlen), '--purity',
            str(args.purity), '--output', reads_dir, '--random_seed',
            str(random_n), '--cores', str(args.cores), '--out_level',
            str(args.out_level), '--snakemake', args.snakemake
        ]
        if args.sectors:
            cmd_params.extend(['--sectors', sectors])
        if args.tumor_rdepth:
            cmd_params.extend(['--tumor_rdepth', str(args.tumor_rdepth)])
        elif args.tumor_rnum:
            cmd_params.extend(['--tumor_rnum', str(args.tumor_rnum)])
        if args.normal_rdepth:
            cmd_params.extend(['--normal_rdepth', str(args.normal_rdepth)])
        elif args.normal_rnum:
            cmd_params.extend(['--normal_rnum', str(args.normal_rnum)])
        if args.error_model:
            cmd_params.extend(['--error_model', error_model])
        if args.separate:
            cmd_params.extend(['--separate'])
        if args.single:
            cmd_params.extend(['--single'])
        cmd_params_copy = cmd_params[:]
        snakemake_index = cmd_params_copy.index('--snakemake')
        snakemake_str = cmd_params_copy[snakemake_index + 1]
        # The logged value is wrapped in single quotes, so inner single
        # quotes are turned into double quotes first.
        if "'" in snakemake_str:
            snakemake_str = snakemake_str.replace("'", '"')
        cmd_params_copy[snakemake_index + 1] = "'{}'".format(snakemake_str)
        _run(cmd_params, shown=cmd_params_copy)
def main(progname=None):
    """Simulate targeted-capture (WES) reads for normal and/or tumor samples.

    Parses the command line, prepares the output directory and the
    ``config/sample.yaml`` file, runs the Snakefile of the selected
    simulator and merges the results.  Depending on which depths / read
    numbers are given, the normal and tumor samples are simulated together
    (directly under the output directory) or separately (under its
    ``tumor`` or ``normal`` sub-directory).  When neither is given, nothing
    is simulated and a message is logged.

    Args:
        progname: Program name shown in the usage message and in the final
            timing message; defaults to ``sys.argv[0]``.

    Raises:
        OutputExistsError: If the output path exists and is not a directory.
        argparse.ArgumentTypeError: If ``capsim`` cannot be found when the
            capgem simulator is selected.
        FileNotFoundError: If the Snakefile of the simulator is missing.
    """
    t0 = time.time()
    prog = progname if progname else sys.argv[0]
    parser = argparse.ArgumentParser(
        description=
        'a wrapper of simulating targeted capture sequencing from reference genome files',
        prog=prog)
    group1 = parser.add_argument_group('Input arguments')
    group1.add_argument(
        '-n', '--normal', metavar='DIR', type=check_folder, required=True,
        help='The directory of the fasta files of normal genomes')
    group1.add_argument(
        '-t', '--tumor', metavar='DIR', type=check_folder, required=True,
        help='The directory of the fasta files of tumor genomes')
    group1.add_argument(
        '-m', '--map', type=check_folder, required=True, metavar='DIR',
        help='The directory of map files, which contains the relationship '
        'between tip nodes and samples')
    # Multi-sentence help texts use implicit string concatenation; argparse
    # collapses runs of whitespace when rendering help, so the displayed
    # text is the same as with line continuations.
    default = None
    group1.add_argument(
        '-s', '--sectors', type=check_file, default=default, metavar='FILE',
        help=('The file containing purity and depth profile of each tumor '
              'sector. After this setting, -d/-D/-p will be ignored [{}]'
              ).format(default))
    group1.add_argument(
        '--probe', metavar='FILE', type=check_file, required=True,
        help='The Probe file containing the probe sequences (FASTA format)')
    group1.add_argument(
        '--target', metavar='FILE', type=check_file, required=True,
        help='The Target file containing the target regions (BED format)')
    default = None
    group1.add_argument(
        '--error_model', metavar='FILE', type=check_file,
        help=('The file containing the empirical error model for NGS reads '
              'generated by GemErr (It must be provided when capgem or '
              'wessim is used for simulation) [{}]').format(default))
    group2 = parser.add_argument_group('Arguments for simulation')
    default = 0.6
    group2.add_argument(
        '-p', '--purity', metavar='FLOAT', type=check_purity,
        default=default,
        help='The proportion of tumor cells in simulated sample [{}]'.format(
            default))
    default = 150
    group2.add_argument('--rlen', metavar='INT', type=int, default=default,
                        help='Illumina: read length [{}]'.format(default))
    group2.add_argument('--single_end', action='store_true',
                        help='Simulating single-end reads')
    group = group2.add_mutually_exclusive_group()
    default = 0
    group.add_argument(
        '-d', '--tumor_rdepth', metavar='FLOAT', type=check_depth,
        default=default,
        help=('The mean depth of tumor sample for simulating short reads '
              '[{}]').format(default))
    default = 0
    group.add_argument(
        '-r', '--tumor_rnum', metavar='INT', type=int, default=default,
        help=('The number of short reads to simulate for tumor sample [{}]'
              ).format(default))
    group = group2.add_mutually_exclusive_group()
    default = 0
    group.add_argument(
        '-D', '--normal_rdepth', metavar='FLOAT', type=check_depth,
        default=default,
        help=('The mean depth of normal sample for simulating short reads '
              '[{}]').format(default))
    default = 0
    group.add_argument(
        '-R', '--normal_rnum', metavar='INT', type=int, default=default,
        help=('The number of short reads to simulate for normal sample [{}]'
              ).format(default))
    default = None
    group2.add_argument(
        '--random_seed', metavar='INT', type=check_seed,
        help='The seed for random number generator [{}]'.format(default))
    default = 'wessim'
    group2.add_argument(
        '--simulator', default=default, choices=['wessim', 'capgem'],
        action=TargetAction, type=check_program,
        help=('The whole-exome sequencing simulator used for simulating '
              'short reads [{}]').format(default))
    default = RATIO_WESSIM
    group2.add_argument(
        '--ontarget_ratio', metavar='FLOAT', type=float, default=default,
        help=('The percentage that simulated reads are expected to be from '
              'the target regions. It is dependent on the simulator. '
              'The default value is {} for wessim and {} for capgem [{}]'
              ).format(RATIO_WESSIM, RATIO_CAPGEM, default))
    group2.add_argument(
        '--single', action='store_true',
        help='single cell mode. After this setting, -p will be ignored and '
        'the value of --tumor_rdepth and --tumor_rnum are for each tumor '
        'cell (not the whole tumor sample anymore)')
    default = "snakemake --rerun-incomplete -k --latency-wait 120"
    group2.add_argument(
        '--snakemake', metavar='STR', type=check_snakemake, default=default,
        help=("The snakemake command used for calling a whole-exome "
              "sequencing simulator. The Snakefile for a simulator is under "
              "the directory 'wes/config' of the source code. Additional "
              "parameters for a simulator can be adjusted in the Snakefile "
              "['{}']").format(default))
    default = 1
    group2.add_argument(
        '--cores', type=int, default=default, metavar='INT',
        help=("The number of cores used to run the program (including "
              "snakemake). If '--cores' or '--jobs' or '-j' is specified in "
              "the options of snakemake, the value specified by '--cores' "
              "here will be ignored when snakemake is called [{}]"
              ).format(default))
    group3 = parser.add_argument_group('Output arguments')
    default = 'wes_reads'
    group3.add_argument('-o', '--output', metavar='DIR', type=str,
                        default=default,
                        help='The output directory [{}]'.format(default))
    default = 'fa2wes.log'
    group3.add_argument(
        '-g', '--log', metavar='FILE', type=str, default=default,
        help='The log file to save the settings of each command [{}]'.format(
            default))
    default = 2
    group3.add_argument(
        '--out_level', type=int, choices=[0, 1, 2], default=default,
        help=("The level used to indicate how many intermediate output "
              "files are kept. Level 0: keep all the files. "
              "Level 1: keep files that are necessary for rerunning "
              "simulation ('config', 'genome_index', 'mapping', 'merged', "
              "and 'separate'). Level 2: keep only final results ('merged' "
              "and 'separate') [{}]").format(default))
    group3.add_argument('--separate', action='store_true',
                        help='Output the reads of each genome separately')
    args = parser.parse_args()
    check_normal_fa(args.normal)

    # logging and random seed setting
    logging.basicConfig(filename=args.log,
                        filemode='w',
                        format='[%(asctime)s] %(levelname)s: %(message)s',
                        datefmt='%m-%d %H:%M:%S',
                        level='INFO')
    argv_copy = sys.argv[:]
    if '--snakemake' in argv_copy:
        snakemake_index = argv_copy.index('--snakemake')
        # Single quotes are required for the snakemake command
        snakemake_str = argv_copy[snakemake_index + 1]
        if "'" in snakemake_str:
            snakemake_str = snakemake_str.replace("'", '"')
        argv_copy[snakemake_index + 1] = "'{}'".format(snakemake_str)
    argv_copy.insert(1, 'fa2wes')
    logging.info(' Command: %s', ' '.join(argv_copy))
    seed = random_int() if args.random_seed is None else args.random_seed
    logging.info(' Ontarget ratio: %s', str(args.ontarget_ratio))
    logging.info(' Random seed: %d', seed)
    numpy.random.seed(seed)

    # Create output folders
    if not os.path.exists(args.output):
        os.makedirs(args.output, mode=0o755)
    elif not os.path.isdir(args.output):
        raise OutputExistsError(
            "A file in the name of '{}' exists.\nDelete it or try another name as output folder."
            .format(args.output))

    # Paired-end reads (the default) count twice the single read length.
    rlen = args.rlen if args.single_end else args.rlen * 2

    wes_dir = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                           'wes')
    # Add path variables
    if args.simulator == 'capsim':  # Not exposed to user for simplicity
        snake_file = os.path.join(wes_dir, 'config/Snakefile_capsim')
    elif args.simulator == 'wessim':
        snake_file = os.path.join(wes_dir, 'config/Snakefile_wessim')
        wessim_dir = os.path.join(wes_dir, 'wessim')
        os.environ['PATH'] += os.pathsep + wessim_dir
    else:  # capgem
        snake_file = os.path.join(wes_dir, 'config/Snakefile_capgem')
        capgem_dir = os.path.join(wes_dir, 'capgem')
        if os.path.exists(os.path.join(capgem_dir, 'bin')):
            os.environ['PATH'] += os.pathsep + os.path.join(capgem_dir, 'bin')
        os.environ['PATH'] += os.pathsep + os.path.join(capgem_dir, 'src')
        # Ensure that capsim is installed.  A dedicated name is used so that
        # `prog` keeps the program name for the timing message at the end.
        required_prog = 'capsim'
        if shutil.which(required_prog) is None:
            raise argparse.ArgumentTypeError(
                "Cannot find program '{}'. Please ensure that you have installed it!"
                .format(required_prog))
    # Explicit raise instead of assert: asserts vanish under `python -O`.
    if not os.path.isfile(snake_file):
        raise FileNotFoundError(
            'Cannot find Snakefile {} under the program directory:\n'.format(
                snake_file))

    normal_gsize = compute_normal_gsize(args.normal)
    target_size = compute_target_size(args.target)
    logging.info(' Size of target region: %s bp', str(target_size))

    def _prepare_outdir(outdir):
        """Create outdir and its config folder; return the sample file path."""
        os.makedirs(os.path.join(outdir, 'config'), exist_ok=True)
        return os.path.join(outdir, 'config/sample.yaml')

    simulate_tumor = args.tumor_rdepth > 0 or args.tumor_rnum > 0
    simulate_normal = args.normal_rdepth > 0 or args.normal_rnum > 0

    # Simulate normal and tumor sample at the same time
    if simulate_tumor and simulate_normal:
        sectors = parse_sectors(args)
        check_tumor_fa(args.tumor, sectors, args.simulator)
        outdir = os.path.abspath(args.output)
        sample_file = _prepare_outdir(outdir)
        total_num_splits = prepare_yaml_all(sample_file, rlen, args, sectors,
                                            normal_gsize, target_size)
        logging.info(' Number of splits in simulation: %d', total_num_splits)
        run_snakemake(outdir, args, sample_file, snake_file)
        merge_normal_sample(args, outdir)
        merge_tumor_sample(args, sectors, outdir)
        clean_output(args.out_level, outdir)
    # Separate the simulation of tumor and normal samples
    elif simulate_tumor:
        sectors = parse_sectors(args)
        check_tumor_fa(args.tumor, sectors, args.simulator)
        outdir = os.path.join(os.path.abspath(args.output), "tumor")
        sample_file = _prepare_outdir(outdir)
        total_num_splits = prepare_yaml_tumor(sample_file, rlen, args,
                                              sectors, normal_gsize,
                                              target_size)
        logging.info(' Number of splits in simulation: %d', total_num_splits)
        run_snakemake(outdir, args, sample_file, snake_file)
        merge_tumor_sample(args, sectors, outdir)
        clean_output(args.out_level, outdir)
    elif simulate_normal:
        outdir = os.path.join(os.path.abspath(args.output), 'normal')
        sample_file = _prepare_outdir(outdir)
        total_num_splits = prepare_yaml_normal(sample_file, rlen, args,
                                               normal_gsize, target_size)
        logging.info(' Number of splits in simulation: %d', total_num_splits)
        run_snakemake(outdir, args, sample_file, snake_file)
        merge_normal_sample(args, outdir)
        clean_output(args.out_level, outdir)
    else:
        logging.info('Please specify sequencing depth!')

    t1 = time.time()
    print("Total time running {}: {} seconds".format(prog, str(t1 - t0)))
def write_sample_tumor(fout, rlen, args, sectors, normal_gsize, target_size):
    """Write the sample-file entries of every tumor sector.

    For each sector (in sorted order) the total number of reads is derived
    from the sector's ``depth`` when it is positive, otherwise
    ``args.tumor_rnum`` is used.  Outside single-cell mode the two normal
    haplotypes are written first, weighted by the fraction of normal cells
    implied by the sector's purity; then both haplotypes of every tip node
    are written.  A genome whose share of reads exceeds ``MAX_READFRAC`` of
    the total is divided into several entries, each with its own seed.

    Args:
        fout: Writable text handle of the sample file.
        rlen: Read length used to convert a depth into a number of reads.
        args: Parsed command-line arguments; ``single``, ``tumor``,
            ``normal``, ``tumor_rnum`` and ``ontarget_ratio`` are read.
        sectors: Mapping of sector name to a dict with the keys
            ``composition`` (tip node -> number of cells), ``purity`` and
            ``depth``.
        normal_gsize: Denominator of the normal per-haplotype proportion
            (presumably the combined size of both normal haplotypes --
            TODO confirm against the caller).
        target_size: Size of the target region.

    Returns:
        The number of entries written to ``fout``.
    """
    total_num_splits = 0
    for sector in sorted(sectors):
        tipnode_leaves = sectors[sector]['composition']
        if not args.single:
            tumor_cells = sum(tipnode_leaves.values())
            purity = sectors[sector]['purity']
            total_cells = tumor_cells / purity
            logging.info(' Number of total cells in tumor sample %s: %.2f',
                         sector, total_cells)
            normal_cells = total_cells - tumor_cells
            logging.info(' Number of normal cells in tumor sample %s: %.2f',
                         sector, normal_cells)
        # Only the per-tip-node genome sizes are needed here.
        tipnode_gsize, _ = compute_tumor_dna(args.tumor, tipnode_leaves)
        depth = sectors[sector]['depth']
        if depth > 0:
            total_rnum = int(
                (depth * target_size) / (rlen * args.ontarget_ratio))
        else:
            total_rnum = args.tumor_rnum
        logging.info(
            ' Total number of reads to simulate for tumor sample %s: %s',
            sector, total_rnum)
        # Clamp to 1: for a small total the cap rounds down to 0, and the
        # ceil(readnum / cap) used for splitting would divide by zero.
        max_readnum = max(1, int(total_rnum * MAX_READFRAC))

        # two normal cell haplotypes, only generated under non-single mode
        if not args.single:
            cell_proportion = normal_cells / total_cells
            for parental in 0, 1:
                gid = 'normal.parental_{}'.format(parental)
                ref = '{}/{}.fa'.format(args.normal, gid)
                proportion = cell_proportion * \
                    genomesize(fasta=ref) / normal_gsize
                total_num_splits += _write_tumor_sample_entries(
                    fout, '{}_{}'.format(sector, gid), gid, cell_proportion,
                    proportion, int(proportion * total_rnum), max_readnum)

        # tumor cells haplotypes
        for tipnode in sorted(tipnode_leaves):
            if args.single:
                cell_proportion = 1
            else:
                cell_proportion = tipnode_leaves[tipnode] / total_cells
            for parental in 0, 1:
                gid = '{}.parental_{}'.format(tipnode, parental)
                # Index 2 of the size record is used as the denominator
                # (presumably the size of both haplotypes together -- TODO
                # confirm against compute_tumor_dna).
                proportion = cell_proportion * \
                    tipnode_gsize[tipnode][parental] / \
                    tipnode_gsize[tipnode][2]
                total_num_splits += _write_tumor_sample_entries(
                    fout, '{}_{}'.format(sector, gid), gid, cell_proportion,
                    proportion, int(proportion * total_rnum), max_readnum)
    return total_num_splits


def _write_tumor_sample_entries(fout, name, gid, cell_proportion, proportion,
                                readnum, max_readnum):
    """Write one genome's entry, or several when it exceeds max_readnum.

    Every entry carries ``cell_proportion``, including the split ones, so
    that all entries of the file expose the same keys.  Returns the number
    of entries written.
    """
    # NOTE(review): the leading whitespace of the literals written below is
    # reproduced as found; confirm the nesting expected by the reader of the
    # sample file.
    if readnum > max_readnum:
        num_splits = int(numpy.ceil(readnum / max_readnum))
        split_readnum = int(numpy.ceil(readnum / num_splits))
        for split in range(1, num_splits + 1):
            fout.write(' {}_{}:\n'.format(name, str(split)))
            fout.write(' gid: {}\n'.format(gid))
            fout.write(' cell_proportion: {}\n'.format(str(cell_proportion)))
            fout.write(' proportion: {}\n'.format(
                str(proportion / num_splits)))
            fout.write(' split: {}\n'.format(str(split)))
            fout.write(' readnum: {}\n'.format(str(split_readnum)))
            # One seed per entry, drawn in writing order so that a fixed
            # global seed reproduces the same file.
            seed = random_int()
            fout.write(' seed: {}\n'.format(str(seed)))
        return num_splits
    fout.write(' {}:\n'.format(name))
    fout.write(' gid: {}\n'.format(gid))
    fout.write(' cell_proportion: {}\n'.format(str(cell_proportion)))
    fout.write(' proportion: {}\n'.format(str(proportion)))
    fout.write(' readnum: {}\n'.format(str(readnum)))
    seed = random_int()
    fout.write(' seed: {}\n'.format(str(seed)))
    return 1
def _run_in_pool(func, arg_tuples, cores):
    """Run ``func(*args)`` for every tuple of *arg_tuples* on a worker pool.

    Tasks are submitted in order and the pool is drained before any result
    is fetched; fetching every result re-raises an exception that occurred in
    a worker.  The ``with`` block tears the workers down even when submitting
    or waiting is interrupted.
    """
    with multiprocessing.Pool(processes=cores) as pool:
        results = [pool.apply_async(func, args=arg_tuple)
                   for arg_tuple in arg_tuples]
        pool.close()
        pool.join()
    for result in results:
        result.get()


def _fq_merge_job(directory, stem, target_stem, suffix):
    """Return ``(target, sorted_sources)`` for one merged fastq file.

    The sources are the files in *directory* named
    ``<stem>.parental_[01].NNN.<suffix>``; *stem* may itself be a glob
    pattern.  ``None`` is returned when no such file exists.
    """
    pattern = os.path.join(
        directory, '{}.parental_[01].[0-9][0-9][0-9].'.format(stem)) + suffix
    source = sorted(glob.glob(pattern))
    if not source:
        return None
    target = os.path.join(directory, '{}.{}'.format(target_stem, suffix))
    return target, source


def main(progname=None):
    """Command-line entry point: simulate WGS reads for normal/tumor samples.

    Steps, in order: parse the options, set up logging and the numpy random
    seed, collect the sectors and their tip-node composition, index every
    fasta with ``build_fai``, create the output folders, compute the fold
    coverage of every genome, cut the work into fixed-size blocks that are
    simulated with ``generate_fq`` on a process pool, and finally merge the
    pieces of each sample with ``merge_fq``.

    Args:
        progname: program name shown in the usage and in the final timing
            message; ``sys.argv[0]`` is used when it is empty.

    Raises:
        AssertionError: a map file or fasta file is missing, a tip node holds
            more than one cell in single-cell mode, or nothing is left to
            simulate.
        OutputExistsError: an output location already exists and cannot be
            used.
        SystemExit: every requested depth is 0 (also raised by argparse on
            invalid options).
    """
    t0 = time.time()
    prog = progname if progname else sys.argv[0]
    parser = argparse.ArgumentParser(
        description='A wrapper of simulating WGS reads from normal and tumor genome fasta',
        prog=prog)

    group1 = parser.add_argument_group('Input arguments')
    group1.add_argument(
        '-n', '--normal', type=check_folder, required=True, metavar='DIR',
        help='the directory of the normal fasta')
    group1.add_argument(
        '-t', '--tumor', type=check_folder, required=True, metavar='DIR',
        help='the directory of the tumor fasta')
    group1.add_argument(
        '-m', '--map', type=check_folder, required=True, metavar='DIR',
        help='the directory of map files, which contains the relationship between tip nodes and samples')
    default = None
    # Written as adjacent literals instead of a backslash continuation inside
    # the string, so no source indentation leaks into the help text.
    group1.add_argument(
        '-s', '--sectors', type=check_file, default=default, metavar='FILE',
        help='the file containing purity and depth profile of each tumor sector. '
             'After this setting, -d/-p will be ignored [{}]'.format(default))

    group2 = parser.add_argument_group('Arguments for simulation')
    default = 50
    group2.add_argument(
        '-d', '--tumor_depth', type=check_depth, default=default, metavar='FLOAT',
        help='the mean depth of tumor sample for ART to simulate WGS reads [{}]'.format(default))
    default = 0
    group2.add_argument(
        '-D', '--normal_depth', type=check_depth, default=default, metavar='FLOAT',
        help='the mean depth of normal sample for ART to simulate WGS reads [{}]'.format(default))
    default = 0.6
    group2.add_argument(
        '-p', '--purity', type=check_purity, default=default, metavar='FLOAT',
        help='the proportion of tumor cells in simulated tumor sample [{}]'.format(default))
    default = None
    group2.add_argument(
        '--random_seed', type=check_seed, metavar='INT',
        help='the seed for random number generator [{}]'.format(default))
    default = 150
    group2.add_argument(
        '--rlen', type=int, default=default, metavar='INT',
        help="the length of reads to simulate [{}]".format(default))
    default = 'art_illumina --noALN --quiet --paired --mflen 500 --sdev 20'
    group2.add_argument(
        '--art', type=str, default=default, metavar='STR',
        help="the parameters for ART program ['{}']".format(default))
    default = 1
    group2.add_argument(
        '--cores', type=int, default=default, metavar='INT',
        help='number of cores used to run the program [{}]'.format(default))
    group2.add_argument(
        '--separate', action="store_true",
        help="keep each tip node's WGS reads file separately")
    group2.add_argument(
        '--single', action="store_true",
        help="single cell mode. "
             "After this setting, -p will be ignored and the value of --tumor_depth is the depth of each tumor cell "
             "(not the total depth of tumor sample anymore).")

    group3 = parser.add_argument_group('Output arguments')
    default = 'art_reads'
    group3.add_argument(
        '-o', '--output', type=str, default=default, metavar='DIR',
        help='output directory [{}]'.format(default))
    default = 'fa2wgs.log'
    group3.add_argument(
        '-g', '--log', type=str, default=default, metavar='FILE',
        help='the log file to save the settings of each command [{}]'.format(default))
    args = parser.parse_args()

    # always compress the simulated fastq files
    compress = True

    # logging and random seed setting
    logging.basicConfig(
        filename=args.log, filemode='w',
        format='[%(asctime)s] %(levelname)s: %(message)s',
        datefmt='%m-%d %H:%M:%S', level='INFO')
    argv_copy = sys.argv[:]
    if '--art' in argv_copy:
        # Quote the ART parameter string so the logged command can be
        # pasted back into a shell.
        art_index = argv_copy.index('--art')
        argv_copy[art_index + 1] = "'{}'".format(argv_copy[art_index + 1])
    argv_copy.insert(1, 'fa2wgs')
    logging.info(' Command: %s', ' '.join(argv_copy))
    seed = random_int() if args.random_seed is None else args.random_seed
    logging.info(' Random seed: %s', seed)
    numpy.random.seed(seed)

    # construct the sectors dictionary to store the meta information of all
    # tumor sectors
    # NOTE: the input checks below are assert statements and are therefore
    # skipped when Python runs with -O; they are kept so that the exception
    # type seen by callers stays the same.
    sectors = {}
    if args.sectors is not None:
        sectors = read_sectors_file(f=args.sectors)
        for sector in sectors:
            mapfile = os.path.join(args.map, '{}.tipnode.map'.format(sector))
            assert os.path.isfile(mapfile), \
                "Couldn't find the map file ({}.tipnode.map) for sector '{}' ".format(sector, sector) + \
                "under the map directory ({}).".format(os.path.abspath(args.map))
    else:
        # Without a sectors file every '<sector>.tipnode.map' found in the
        # map directory defines one sector using the global purity/depth.
        mapfiles = glob.glob(os.path.join(args.map, '*.tipnode.map'))
        infered_sectors = ['.'.join(os.path.basename(x).split('.')[:-2])
                           for x in mapfiles]
        for sector in infered_sectors:
            sectors[sector] = {'purity': args.purity,
                               'depth': args.tumor_depth}
    for sector in sectors:
        mapfile = os.path.join(args.map, '{}.tipnode.map'.format(sector))
        sectors[sector]['composition'] = tipnode_leaves_counting(f=mapfile)

    # exit the program if you do NOT want to simulate any reads for normal
    # and tumor samples
    if args.normal_depth == 0 and \
            all(sectors[sector]['depth'] == 0 for sector in sectors):
        sys.exit('Do nothing as the depth for each sample is 0!')

    # single cell mode or bulk tumor mode
    if args.single:
        for sector in sectors:
            for tipnode, leaves_n in sectors[sector]['composition'].items():
                assert leaves_n == 1, \
                    'In single mode, each tip node should represent only one cell.\n' + \
                    'But {} leaves are found underneath tipnode {} in one of your map files!'.format(leaves_n, tipnode)

    # create index file (.fai) for each fasta
    tipnodes = set()
    for sector in sectors:
        tipnodes.update(sectors[sector]['composition'])
    # Check every fasta before any worker process is started, so that a
    # missing file does not leave a running pool behind.
    fai_jobs = []
    for parental in 0, 1:
        fasta = os.path.join(args.normal,
                             'normal.parental_{}.fa'.format(parental))
        assert os.path.isfile(fasta), \
            "Couldn't find {} under the normal directory: {}".format(fasta, args.normal)
        fai_jobs.append((fasta,))
        for tipnode in tipnodes:
            fasta = os.path.join(
                args.tumor, '{}.parental_{}.fa'.format(tipnode, parental))
            assert os.path.isfile(fasta), \
                "Couldn't find {} under the tumor directory: {}".format(fasta, args.tumor)
            fai_jobs.append((fasta,))
    _run_in_pool(build_fai, fai_jobs, args.cores)

    # create output folders
    try:
        os.mkdir(args.output, mode=0o755)
    except FileExistsError as e:
        # An existing directory is reused; anything else is an error.
        if not os.path.isdir(args.output):
            raise OutputExistsError("A FILE in the name of '{}' exists.\nDelete it or try another name as output folder.".format(args.output)) from e
    normal_dir = os.path.join(args.output, 'normal')
    if args.normal_depth > 0:
        try:
            os.mkdir(normal_dir, mode=0o755)
        except FileExistsError as e:
            raise OutputExistsError(
                "'{}' exists already! \nCan not use it as the output folder of normal WGS reads.".format(normal_dir) +
                '\nDelete it or use another folder as output folder.') from e

    # collect simulation parameters first
    params_matrix = []
    total_sim_bases = 0
    art_params = args.art

    # collect genome size for each genome
    normal_gsize = 0
    for parental in 0, 1:
        normal_gsize += genomesize(fasta=os.path.join(
            args.normal, 'normal.parental_{}.fa'.format(parental)))
    tipnode_gsize = {}
    for tipnode in tipnodes:
        # The value of tipnode_gsize[tipnode] is a list of three elements:
        # 0) genomesize of parental 0
        # 1) genomesize of parental 1
        # 2) the sum of parental 0 and 1
        sizes = [
            genomesize(fasta=os.path.join(
                args.tumor, '{}.parental_{}.fa'.format(tipnode, parental)))
            for parental in (0, 1)
        ]
        sizes.append(sizes[0] + sizes[1])
        tipnode_gsize[tipnode] = sizes

    # simulation for normal sample
    if args.normal_depth > 0:
        for parental in 0, 1:
            sim_cfg = {
                'gsize': normal_gsize / 2,
                'base_cmd': art_params,
                'rlen': args.rlen,
                'fcov': args.normal_depth / 2,
                'in': os.path.join(args.normal,
                                   'normal.parental_{}.fa'.format(parental)),
                'out': os.path.join(normal_dir,
                                    'normal.parental_{}.'.format(parental)),
                'id': 'nm_prt{}'.format(parental)}
            params_matrix.append(sim_cfg)
            total_sim_bases += sim_cfg['gsize'] * sim_cfg['fcov']

    # simulation for tumor sample
    for sector in sorted(sectors):
        if not sectors[sector]['depth'] > 0:
            continue
        # compute coverage and run ART
        sector_dir = os.path.join(args.output, sector)
        try:
            os.mkdir(sector_dir, mode=0o755)
        except FileExistsError as e:
            raise OutputExistsError(
                "'{}' exists already! \nCan not use it as the output folder of tumor WGS reads.".format(sector_dir) +
                '\nDelete it or use another folder as output folder.') from e
        tipnode_leaves = sectors[sector]['composition']
        sector_sim_bases = normal_gsize / 2 * sectors[sector]['depth']
        tumor_cells = sum(tipnode_leaves.values())
        total_cells = tumor_cells / sectors[sector]['purity']
        normal_cells = total_cells - tumor_cells
        normal_dna = normal_gsize * normal_cells
        tumor_dna = 0
        for tipnode, leaves_n in tipnode_leaves.items():
            tumor_dna += tipnode_gsize[tipnode][2] * leaves_n
        mean_depth_per_base = sector_sim_bases / (normal_dna + tumor_dna)

        # two normal cell haplotypes
        if not args.single:
            for parental in 0, 1:
                sim_cfg = {
                    'gsize': normal_gsize / 2,
                    'base_cmd': art_params,
                    'rlen': args.rlen,
                    'fcov': normal_cells * mean_depth_per_base,
                    'in': os.path.join(
                        args.normal,
                        'normal.parental_{}.fa'.format(parental)),
                    'out': os.path.join(
                        sector_dir, 'normal.parental_{}.'.format(parental)),
                    'id': 'nm_prt{}'.format(parental)}
                params_matrix.append(sim_cfg)
                total_sim_bases += sim_cfg['gsize'] * sim_cfg['fcov']

        # tumor cells haplotypes
        for tipnode in sorted(tipnode_leaves):
            if args.single:
                fcov = sector_sim_bases / tipnode_gsize[tipnode][2]
            else:
                fcov = tipnode_leaves[tipnode] * mean_depth_per_base
            for parental in 0, 1:
                sim_cfg = {
                    'gsize': tipnode_gsize[tipnode][parental],
                    'base_cmd': art_params,
                    'rlen': args.rlen,
                    'fcov': fcov,
                    'in': os.path.join(
                        args.tumor,
                        '{}.parental_{}.fa'.format(tipnode, parental)),
                    'out': os.path.join(
                        sector_dir,
                        '{}.parental_{}.'.format(tipnode, parental)),
                    'id': '{}_prt{}'.format(tipnode, parental)}
                params_matrix.append(sim_cfg)
                total_sim_bases += sim_cfg['gsize'] * sim_cfg['fcov']

    # Generate the fastq files and compress them in parallel.
    # Every task generates at most 2 percent of the total data to simulate.
    # A fixed block size (instead of one derived from the number of cores)
    # lets users replicate the results with the same random seed whatever
    # number of cores they use.
    assert total_sim_bases > 0, 'The genome sizes of all cells in the sample is 0!'
    sizeBlock = total_sim_bases * 0.02
    final_params_matrix = []
    for cfg in params_matrix:
        n = math.ceil(cfg['gsize'] * cfg['fcov'] / sizeBlock)
        if n == 0:
            continue
        cfg['fcov'] = round(cfg['fcov'] / n, 6)
        for i in range(n):
            block_cfg = cfg.copy()
            block_cfg['out'] = cfg['out'] + '{:03d}.'.format(i)
            block_cfg['id'] = cfg['id'] + '_{:03d}-'.format(i)
            # one seed per block, drawn in a fixed order
            block_cfg['rndSeed'] = str(random_int())
            final_params_matrix.append(block_cfg)
    _run_in_pool(generate_fq,
                 [(cfg, compress) for cfg in final_params_matrix],
                 args.cores)

    # merge small fastq files into one fastq for normal/tumor sample
    sample_fq_files = []
    suffixes = ['fq', '1.fq', '2.fq']
    if compress:
        suffixes = [x + '.gz' for x in suffixes]
    if args.normal_depth > 0:
        for suffix in suffixes:
            job = _fq_merge_job(normal_dir, 'normal', 'normal', suffix)
            if job is not None:
                sample_fq_files.append(job)
    for sector in sorted(sectors):
        if not sectors[sector]['depth'] > 0:
            continue
        sector_dir = os.path.join(args.output, sector)
        tipnode_leaves = sectors[sector]['composition']
        for suffix in suffixes:
            if args.single or args.separate:
                # one merged file per genome (normal and every tip node)
                for tipnode in ['normal'] + sorted(tipnode_leaves):
                    job = _fq_merge_job(sector_dir, tipnode, tipnode, suffix)
                    if job is not None:
                        sample_fq_files.append(job)
            else:
                # one merged file for the whole sector
                job = _fq_merge_job(sector_dir, '*', sector, suffix)
                if job is not None:
                    sample_fq_files.append(job)
    _run_in_pool(merge_fq, sample_fq_files, args.cores)

    t1 = time.time()
    print("Total time running {}: {} seconds".format(prog, str(t1 - t0)))