def main():
    """Phase haploid STR calls that have been paired into pseudodiploids.

    Command-line driver. The steps, in order:

    1. Parse the arguments and extract the tree at ``--pos`` from the SMC file.
    2. Read the haploid STR genotypes from ``--vcf`` (optionally restricted to
       the samples listed in ``--samps``) and pair them.
    3. Build the mutation model and precompute its transition probabilities.
    4. Write the pair data and the factor graph to temporary files and run the
       ``str-imputer`` executable located next to this script on them.
    5. Print the accuracy string and write the phased alleles to
       ``<out>_strs.vcf``.

    Exits with an error message if a sample is missing from the tree, if the
    tree has an odd number of leaves, or if an observed allele falls outside
    the mutation model's allele range.

    NOTE(review): a later ``def main()`` in this file rebinds the same name;
    confirm which of the two definitions is meant to be the entry point.
    """
    # Function-scope import: `sys.exit` is used instead of the `site`-provided
    # `exit` helper, which is not guaranteed to exist (e.g. `python -S`).
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--smc", required=True, dest="smc", type=str,
                        help="SMC file for the region of interest")
    parser.add_argument("--pos", required=True, dest="pos", type=int,
                        help="Position of STR in region")
    parser.add_argument("--mu", required=True, dest="mu", type=float,
                        help="Mutation rate for mutation model")
    parser.add_argument("--beta", required=True, dest="beta", type=float,
                        help="Length constraint for mutation model")
    parser.add_argument("--pgeom", required=True, dest="pgeom", type=float,
                        help="Geometric parameter for mutation model")
    parser.add_argument("--out", required=True, dest="out", type=str,
                        help="Output path prefix for phased VCF (+ _strs.vcf)")
    parser.add_argument("--vcf", required=True, dest="vcf", type=str,
                        help="Input path for VCF containing haploid STR calls")
    parser.add_argument("--samps", required=False, dest="samps", type=str,
                        help="File containing list of samples to consider")
    parser.add_argument("--thresh", required=False, dest="thresh", type=float,
                        help="Posterior probability threshold required to report phasing",
                        default=0.5)

    # Scaling factor from edge length to # of generations
    parser.add_argument("--gen_per_len", required=False, dest="gen_per_len",
                        type=float, default=1.0)

    # Maximum TMRCA in generations
    parser.add_argument("--max_tmrca", required=False, dest="max_tmrca",
                        type=int, default=25000)
    args = parser.parse_args()

    tree, leaf_names, leaf_indices = extract_newick_tree_from_smc(args.smc, args.pos)
    samples = read_sample_list(args.samps) if args.samps is not None else None

    # Read haploid STR genotypes
    nrepeats_dict, median_allele = read_haploid_str_gts(args.vcf, sample_set=samples)

    # Ensure that all of the sample nodes are contained within the tree
    for key in nrepeats_dict:
        if key not in leaf_indices:
            sys.exit("ERROR: Sample %s not present in provided tree"%(key))

    # Randomly pair haploid to construct pseudodiploids
    # (node_id_1, node_id_2, num_repeat_a, num_repeat_b)
    pair_data = pair_gts(nrepeats_dict, leaf_indices, pairs=None)

    # Only deal with tree for diploid individuals
    if len(tree.leaf_nodes()) % 2 != 0:
        sys.exit("ERROR: Tree contains an odd number of leaves")

    # Construct the mutation model
    print("Constructing the mutation model")
    allele_range, max_step = determine_allele_range(args.max_tmrca, args.mu,
                                                    args.beta, args.pgeom, 0, 0)
    min_allele = -allele_range - max_step
    max_allele = allele_range + max_step
    mut_model = OUGeomSTRMutationModel(args.pgeom, args.mu, args.beta,
                                       allele_range, max_step=max_step)
    print("Min allele = %d, Max allele = %d"%(min_allele, max_allele))

    # Ensure that the observed genotypes are within the allele range
    if len(pair_data) != 0:
        observed = [allele for pair in pair_data for allele in (pair[2], pair[3])]
        min_obs_allele = min(observed)
        max_obs_allele = max(observed)
        if min_obs_allele < min_allele or max_obs_allele > max_allele:
            sys.exit("ERROR: Observed allele not within mutation model's allele range: (%d, %d)"
                     %(min_obs_allele, max_obs_allele))

    # Precompute the transition probabilities
    print("Calculating the transition probabilities")
    optimizer = MATRIX_OPTIMIZER(mut_model.trans_matrix, mut_model.min_n)
    optimizer.precompute_results()

    # Every temporary path is registered here as soon as it exists so that the
    # `finally` clause removes it even if a later step raises or exits.
    temp_files = []
    try:
        # Write out fixed paired information. Alleles are shifted by
        # min_allele, i.e. written relative to the bottom of the model's range.
        # mkstemp returns an already-open descriptor; wrap it rather than
        # discarding it, which would leak the descriptor.
        pairs_fd, pairs_file = tempfile.mkstemp()
        temp_files.append(pairs_file)
        with os.fdopen(pairs_fd, "w") as output:
            for pair in pair_data:
                output.write("%d\t%d\t%d\t%d\n"%(pair[0], pair[1],
                                                 pair[2]-min_allele,
                                                 pair[3]-min_allele))

        # Write out the factor graph file. write_factor_graph takes a path, so
        # the descriptor from mkstemp is closed straight away.
        graph_fd, graph_file = tempfile.mkstemp()
        temp_files.append(graph_file)
        os.close(graph_fd)
        write_factor_graph(tree, optimizer, pair_data, min_allele, max_allele,
                           args.gen_per_len, graph_file)

        # Run c++ package and collect its output
        # TO DO: Utilize stderr messages to ensure convergence
        phase_cmd_path = os.path.dirname(os.path.realpath(__file__)) + "/str-imputer"
        phase_cmd = [phase_cmd_path, graph_file, pairs_file]
        proc = subprocess.Popen(phase_cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, _ = proc.communicate()
        res = stdout.strip()
    finally:
        for path in temp_files:
            try:
                os.remove(path)
            except OSError:
                # Same tolerance as `rm -f`: an already-missing file is fine.
                pass

    # Assess accuracy, output the statistics and determine the new genotypes
    # associated with each sample
    phased_repeat_dict, accuracy_string = process_phasing_result(
        res, leaf_names, min_allele, min_confidence=args.thresh)
    print(accuracy_string)

    # Construct a new VCF containing the phased alleles
    write_vcf(args.vcf, phased_repeat_dict, median_allele, args.out + "_strs.vcf")
def main():
    """Phase or impute STR genotypes on a tree extracted from an SMC file.

    Command-line driver. Exactly one of ``--phase`` or ``--impute`` must be
    given. The steps, in order:

    1. Parse the arguments and extract the tree at ``--pos`` from the SMC file.
    2. Read the STR genotypes for ``--chrom``/``--pos`` from ``--vcf`` with the
       diploid reader when ``--diploid`` is set, else the haploid reader.
    3. Pair the genotypes: ``<sample>_1`` with ``<sample>_2`` for diploid
       input, otherwise whatever ``pair_gts`` chooses when ``pairs=None``.
    4. Build the mutation model and precompute its transition probabilities.
    5. Write the pair data, the ids of tree leaves absent from the VCF, and
       the factor graph to temporary files, then run the ``Phaser`` executable
       located next to this script.
    6. Write the result to ``<out>_phased_strs.vcf`` (phasing) or
       ``<out>_imputed_strs.vcf`` (imputation, with posteriors and dosages).

    Exits with an error message if the mode flags are invalid, if a sample is
    missing from the tree, if the tree has an odd number of leaves, or if an
    observed allele falls outside the mutation model's allele range.
    """
    # Function-scope import: `sys.exit` is used instead of the `site`-provided
    # `exit` helper, which is not guaranteed to exist (e.g. `python -S`).
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--smc", required=True, dest="smc", type=str,
                        help="SMC file for the region of interest")
    parser.add_argument("--chrom", required=True, dest="chrom", type=str,
                        help="STR's Chromosome")
    parser.add_argument("--pos", required=True, dest="pos", type=int,
                        help="Position of STR in region")
    parser.add_argument("--mu", required=True, dest="mu", type=float,
                        help="Mutation rate for mutation model")
    parser.add_argument("--beta", required=True, dest="beta", type=float,
                        help="Length constraint for mutation model")
    parser.add_argument("--pgeom", required=True, dest="pgeom", type=float,
                        help="Geometric parameter for mutation model")
    parser.add_argument("--out", required=True, dest="out", type=str,
                        help="Output path prefix for imputed/phased VCF")
    parser.add_argument("--vcf", required=True, dest="vcf", type=str,
                        help="VCF containing STR calls")
    parser.add_argument("--samps", required=False, dest="samps", type=str,
                        help="File containing list of samples to consider")
    parser.add_argument("--thresh", required=False, dest="thresh", type=float,
                        help="Posterior probability threshold required to report phasing",
                        default=0.5)
    parser.add_argument("--phase", required=False, dest="phase", action="store_true",
                        help="Output phasing statistics", default=False)
    parser.add_argument("--impute", required=False, dest="impute", action="store_true",
                        help="Output imputation statistics", default=False)
    parser.add_argument("--diploid", required=False, dest="diploid", action="store_true",
                        help="VCF contains diploid STR calls (instead of haploid calls)",
                        default=False)

    # Scaling factor from edge length to # of generations
    parser.add_argument("--gen_per_len", required=False, dest="gen_per_len",
                        type=float, default=1.0)

    # Maximum TMRCA in generations
    parser.add_argument("--max_tmrca", required=False, dest="max_tmrca",
                        type=int, default=25000)
    args = parser.parse_args()

    # Both flags are booleans, so they are equal exactly when neither or both
    # were supplied -- the two invalid combinations.
    if args.phase == args.impute:
        sys.exit("ERROR: Exactly one of --phase or --impute must be specified. Exiting...")

    tree, leaf_names, leaf_indices = extract_newick_tree_from_smc(args.smc, args.pos)
    samples = read_sample_list(args.samps) if args.samps is not None else None

    # Read STR genotypes
    if args.diploid:
        nrepeats_dict, median_allele = read_diploid_str_gts(
            args.vcf, args.chrom, args.pos, sample_set=samples)
    else:
        nrepeats_dict, median_allele = read_haploid_str_gts(
            args.vcf, args.chrom, args.pos, sample_set=samples)

    # Ensure that all of the sample nodes are contained within the tree
    for key in nrepeats_dict:
        if key not in leaf_indices:
            sys.exit("ERROR: Sample %s not present in provided tree"%(key))

    if args.diploid:
        # Pair _1 and _2 nodes together. A real list (not `map`) is built so
        # the pairs can be traversed more than once on Python 3 as well.
        sample_names = list(set(key.split("_")[0] for key in nrepeats_dict))
        pairs = [(name + "_1", name + "_2") for name in sample_names]
        pair_data = pair_gts(nrepeats_dict, leaf_indices, pairs=pairs)
    else:
        # Randomly pair haploid to construct pseudodiploids
        # (node_id_1, node_id_2, num_repeat_a, num_repeat_b)
        pair_data = pair_gts(nrepeats_dict, leaf_indices, pairs=None)

    # Only deal with tree for even number of chromosomes
    if len(tree.leaf_nodes()) % 2 != 0:
        sys.exit("ERROR: Tree contains an odd number of leaves")

    # Construct the mutation model
    print("Constructing the mutation model")
    allele_range, max_step = determine_allele_range(args.max_tmrca, args.mu,
                                                    args.beta, args.pgeom, 0, 0)
    min_allele = -allele_range - max_step
    max_allele = allele_range + max_step
    mut_model = OUGeomSTRMutationModel(args.pgeom, args.mu, args.beta,
                                       allele_range, max_step=max_step)
    print("Min allele = %d, Max allele = %d"%(min_allele, max_allele))

    # Ensure that the observed genotypes are within the allele range
    if len(pair_data) != 0:
        observed = [allele for pair in pair_data for allele in (pair[2], pair[3])]
        min_obs_allele = min(observed)
        max_obs_allele = max(observed)
        if min_obs_allele < min_allele or max_obs_allele > max_allele:
            sys.exit("ERROR: Observed allele not within mutation model's allele range: (%d, %d)"
                     %(min_obs_allele, max_obs_allele))

    # Precompute the transition probabilities
    print("Calculating the transition probabilities")
    optimizer = MATRIX_OPTIMIZER(mut_model.trans_matrix, mut_model.min_n)
    optimizer.precompute_results()

    # Every temporary path is registered here as soon as it exists so that the
    # `finally` clause removes it even if a later step raises or exits.
    temp_files = []
    try:
        # Write out paired information for known diploid GTs. Alleles are
        # shifted by min_allele, i.e. written relative to the bottom of the
        # model's range. mkstemp returns an already-open descriptor; wrap it
        # rather than discarding it, which would leak the descriptor.
        pairs_fd, pairs_file = tempfile.mkstemp()
        temp_files.append(pairs_file)
        with os.fdopen(pairs_fd, "w") as output:
            for pair in pair_data:
                output.write("%d\t%d\t%d\t%d\n"%(pair[0], pair[1],
                                                 pair[2]-min_allele,
                                                 pair[3]-min_allele))

        # Write out ids for any leaves not included in the VCF
        # (for potential imputation queries)
        ids_fd, ids_file = tempfile.mkstemp()
        temp_files.append(ids_file)
        with os.fdopen(ids_fd, "w") as output:
            for leaf_name, leaf_id in leaf_indices.items():
                if leaf_name not in nrepeats_dict:
                    output.write("%d\n"%(leaf_id))

        # Write out the factor graph file. write_factor_graph takes a path, so
        # the descriptor from mkstemp is closed straight away.
        graph_fd, graph_file = tempfile.mkstemp()
        temp_files.append(graph_file)
        os.close(graph_fd)
        write_factor_graph(tree, optimizer, pair_data, min_allele, max_allele,
                           args.gen_per_len, graph_file)

        # Run c++ package and collect its output
        # TO DO: Utilize stderr messages to ensure convergence
        cmd_path = os.path.dirname(os.path.realpath(__file__)) + "/Phaser"
        if args.phase:
            cmd = [cmd_path, "--factor-graph", graph_file, "--pair-file", pairs_file]
        else:
            # Validation above guarantees --impute is set here.
            cmd = [cmd_path, "--factor-graph", graph_file, "--id-file", ids_file]

        print("Running message-passing tool")
        # stderr is deliberately not captured, so the tool's diagnostics go
        # straight to this process's stderr.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        res = stdout.strip()
    finally:
        for path in temp_files:
            try:
                os.remove(path)
            except OSError:
                # Same tolerance as `rm -f`: an already-missing file is fine.
                pass

    if args.phase:
        # Assess accuracy and determine the new genotypes associated with each
        # sample. The accuracy string is computed but currently not printed.
        phased_repeat_dict, accuracy_string = process_phasing_result(
            res, leaf_names, min_allele, min_confidence=args.thresh)

        # Construct a new VCF containing the phased alleles
        if args.diploid:
            write_diploid_vcf(args.vcf, args.chrom, args.pos, phased_repeat_dict,
                              median_allele, args.out + "_phased_strs.vcf")
        else:
            write_haploid_vcf(args.vcf, args.chrom, args.pos, phased_repeat_dict,
                              median_allele, args.out + "_phased_strs.vcf")
    else:
        # Determine the most probable posterior genotype for each sample
        imputed_repeat_dict, posterior_dict, dosage_dict = process_imputation_result(
            res, leaf_names, min_allele, max_allele)
        if args.diploid:
            write_diploid_vcf(args.vcf, args.chrom, args.pos, imputed_repeat_dict,
                              median_allele, args.out + "_imputed_strs.vcf",
                              posteriors=posterior_dict, dosages=dosage_dict)
        else:
            write_haploid_vcf(args.vcf, args.chrom, args.pos, imputed_repeat_dict,
                              median_allele, args.out + "_imputed_strs.vcf",
                              posteriors=posterior_dict, dosages=dosage_dict)