Example #1
0
def main(argv):
    """Estimate per-contig strain counts (eta) from coverage data.

    Reads four CSV inputs (SCG coverages, MAP gamma frequencies, contig/gene
    coverages, epsilon transition matrix), restricts them to the samples
    common to all files, runs a KL-divergence based initial assignment
    (KLAssign) followed by a Gibbs sampler (es.Eta_Sampler), and writes eta
    (and optionally tau haplotype) predictions to CSV files prefixed by
    --output_stub.  If a validation genome-composition file is supplied,
    prediction accuracy is logged.

    NOTE(review): ``argv`` is accepted but ``parser.parse_args()`` reads
    ``sys.argv`` directly -- confirm against the caller before changing.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("scg_cov_file", help="input core gene coverages")

    parser.add_argument("gamma_star_file",
                        help="input MAP estimate frequencies")

    parser.add_argument("cov_file", help="mean contig/genes coverages")

    parser.add_argument("epsilon_file", help="predicted transition matrix")

    parser.add_argument(
        '-s', '--random_seed', default=23724839, type=int,
        help="specifies seed for numpy random number generator defaults to 23724839 applied after random filtering")

    parser.add_argument(
        '-e', '--eta_max', default=2, type=int,
        help="maximum contig contig count for sampler defaults to 2")

    parser.add_argument(
        '-i', '--iter_max', default=20, type=int,
        help="number of Gibbs sampling iterations")

    # NOTE(review): default=1e10 is a float ("type" only converts
    # command-line strings); downstream use appears purely numeric.
    parser.add_argument(
        '-m', '--var_max', default=1e10, type=int,
        help="maximum number of variants to user per contig defaults to all (1e10) if unset")

    parser.add_argument(
        '-o', '--output_stub', type=str, default="output",
        help="string specifying output file stubs")

    parser.add_argument(
        '-g', '--genomes',
        help="specify validation file of known genome composition")

    parser.add_argument(
        '-v', '--variant_file',
        help="specify file of called variants on genes if available")

    parser.add_argument('--assign_tau', dest='assign_tau', action='store_true')
    parser.set_defaults(assign_tau=False)
    args = parser.parse_args()

    output_stub = args.output_stub

    # Per-run log file; filemode='w' overwrites any previous run's log.
    log_file_name = output_stub + "_log_file.txt"

    logging.basicConfig(
        filename=log_file_name,
        level=logging.INFO,
        filemode='w',  # Overwrites old log file
        format='%(asctime)s:%(levelname)s:%(name)s:%(message)s')

    # Seed both the numpy RNG and the sampletau extension RNG so runs
    # are reproducible.
    logging.info('Seed random number generators = %d' % (args.random_seed))
    prng = RandomState(args.random_seed)
    sampletau.initRNG()
    sampletau.setRNG(args.random_seed)

    # Read the four mandatory CSV inputs (first column is the index).
    logging.info('Read in SCG coverages from %s' % (args.scg_cov_file))
    scg_cov = p.read_csv(args.scg_cov_file, header=0, index_col=0)
    logging.info('Read gamma from %s' % (args.gamma_star_file))
    gamma_star = p.read_csv(args.gamma_star_file, header=0, index_col=0)
    logging.info('Read gene coverages from %s' % (args.cov_file))
    cov = p.read_csv(args.cov_file, header=0, index_col=0)
    logging.info('Read epsilon from %s' % (args.epsilon_file))
    epsilon = p.read_csv(args.epsilon_file, header=0, index_col=0)
    # .values replaces DataFrame.as_matrix(), which was deprecated and
    # removed in pandas 1.0; the returned ndarray is equivalent.
    epsilon_matrix = epsilon.values

    if args.variant_file is not None:
        logging.info('Read variants from %s' % (args.variant_file))
        variants = p.read_csv(args.variant_file, header=0, index_col=0)
    else:
        variants = None

    # Restrict every input to the samples present in all three files.
    gamma_names = gamma_star.index.values
    scg_names = scg_cov.index.values

    intersect_names1 = sorted(intersect(gamma_names, scg_names))
    intersect_names = sorted(intersect(cov.columns.values, intersect_names1))
    logging.info('Found %d samples common to all 3 input files' %
                 (len(intersect_names)))

    scg_cov = scg_cov.reindex(intersect_names)
    gamma_star = gamma_star.reindex(intersect_names)

    total_mean = scg_cov['mean'].values
    total_sd = scg_cov['sd'].values

    # Renormalise gamma so each sample's strain frequencies sum to one.
    gamma_star_matrix = gamma_star.values
    row_sums = gamma_star_matrix.sum(axis=1)
    gamma_star_matrix = gamma_star_matrix / row_sums[:, np.newaxis]

    # Expected per-strain coverage in each sample.
    delta = np.multiply(gamma_star_matrix, total_mean[:, np.newaxis])

    # Reorder coverage matrix columns to the common sample order.
    cov = cov[intersect_names]
    cov_matrix = cov.values
    logging.info('Perform KL estimation of contig counts')
    klassign = KLAssign(prng, cov_matrix, delta)
    klassign.factorize()

    # BUG FIX: variants_intersect was previously bound only inside the
    # "variants is not None" branch, raising NameError whenever no
    # --variant_file was supplied.  Default to None (assumes Eta_Sampler
    # tolerates variants=None -- TODO confirm).
    variants_intersect = None
    if variants is not None:
        expanded_names = expand_sample_names(intersect_names)
        variants_intersect = variants[expanded_names]

    # Round the continuous KL estimate to integer counts, then refine
    # with the Gaussian Gibbs sampler.
    etaD = np.rint(klassign.eta)

    etaSampler = es.Eta_Sampler(prng,
                                variants_intersect,
                                cov,
                                gamma_star_matrix,
                                delta,
                                total_sd,
                                epsilon_matrix,
                                etaD,
                                max_iter=args.iter_max,
                                max_eta=args.eta_max,
                                max_var=args.var_max)

    # Two sampling rounds, as in the original implementation (intent
    # unclear -- possibly burn-in plus sampling; preserved as-is).
    etaSampler.update()

    etaSampler.update()

    # Optionally assign tau haplotypes given the sampled eta_star.
    contig_names = cov.index.tolist()

    if args.assign_tau is True:

        etaSampler.restoreFullVariants()

        etaSampler.calcTauStar(etaSampler.eta_star)

        (tau_star, tau_mean, pos,
         contig_index) = etaSampler.getTauStar(variants)
        V = tau_star.shape[0]

        # Flatten tau to V rows of G*4 columns and move the Position
        # column to the front before writing.
        tau_res = np.reshape(tau_star, (V, etaSampler.G * 4))
        tau_df = p.DataFrame(tau_res, index=contig_index)
        tau_df['Position'] = pos
        cols = tau_df.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        tau_df = tau_df[cols]
        tau_df.to_csv(output_stub + "_tau_star.csv")
        logging.info("Wrote tau star haplotype predictions")

        tau_mean_res = np.reshape(tau_mean, (V, etaSampler.G * 4))
        tau_mean_df = p.DataFrame(tau_mean_res, index=contig_index)
        tau_mean_df['Position'] = pos
        cols = tau_mean_df.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        tau_mean_df = tau_mean_df[cols]
        tau_mean_df.to_csv(output_stub + "_tau_mean.csv")
        logging.info("Wrote tau mean haplotype predictions")

    # Write the four eta estimates: rounded KL (etaD), sampler MAP
    # (eta_star), sampler posterior mean, and the raw KL solution.
    # NOTE(review): these file names lack the "_" separator used by the
    # tau outputs; preserved for compatibility with downstream tooling.
    etaD_df = p.DataFrame(etaD, index=contig_names)
    etaD_df.to_csv(output_stub + "etaD_df.csv")

    etaS_df = p.DataFrame(etaSampler.eta_star, index=contig_names)
    etaS_df.to_csv(output_stub + "etaS_df.csv")

    etaM_df = p.DataFrame(np.mean(etaSampler.eta_store, axis=0),
                          index=contig_names)
    etaM_df.to_csv(output_stub + "etaM_df.csv")

    eta_df = p.DataFrame(klassign.eta, index=contig_names)
    eta_df.to_csv(output_stub + "eta_df.csv")

    # Optional validation against a known genome composition.
    if args.genomes:
        genomes = p.read_csv(args.genomes, header=0, index_col=0)
        genomes = genomes.loc[contig_names]
        genomes_M = genomes.values
        genomes_D = np.copy(genomes_M)

        (dtotal, dacc, dacc_array) = compGenes(etaD, genomes_D)
        (stotal, sacc, sacc_array) = compGenes(etaSampler.eta_star, genomes_D)

        logging.info('KL accurracy = %f' % (dtotal))
        logging.info('Gibbs sampler accurracy = %f' % (stotal))
Example #2
0
def main(argv):
    """Estimate per-contig strain counts (eta) from coverage data.

    Reads four CSV inputs (SCG coverages, MAP gamma frequencies, contig/gene
    coverages, epsilon transition matrix), restricts them to the samples
    common to all files, runs a KL-divergence based initial assignment
    (KLAssign) followed by a Gibbs sampler (es.Eta_Sampler), and writes eta
    (and optionally tau haplotype) predictions to CSV files prefixed by
    --output_stub.  If a validation genome-composition file is supplied,
    prediction accuracy is logged.

    NOTE(review): ``argv`` is accepted but ``parser.parse_args()`` reads
    ``sys.argv`` directly -- confirm against the caller before changing.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("scg_cov_file", help="input core gene coverages")

    parser.add_argument("gamma_star_file", help="input MAP estimate frequencies")

    parser.add_argument("cov_file", help="mean contig/genes coverages")

    parser.add_argument("epsilon_file", help="predicted transition matrix")

    parser.add_argument('-s', '--random_seed', default=23724839, type=int,
        help="specifies seed for numpy random number generator defaults to 23724839 applied after random filtering")

    parser.add_argument('-e', '--eta_max', default=2, type=int,
        help="maximum contig contig count for sampler defaults to 2")

    parser.add_argument('-i', '--iter_max', default=20, type=int,
        help="number of Gibbs sampling iterations")

    # NOTE(review): default=1e10 is a float ("type" only converts
    # command-line strings); downstream use appears purely numeric.
    parser.add_argument('-m', '--var_max', default=1e10, type=int,
        help="maximum number of variants to user per contig defaults to all (1e10) if unset")

    parser.add_argument('-o', '--output_stub', type=str, default="output",
        help="string specifying output file stubs")

    parser.add_argument('-g', '--genomes',
        help="specify validation file of known genome composition")

    parser.add_argument('-v', '--variant_file',
        help="specify file of called variants on genes if available")

    parser.add_argument('--assign_tau', dest='assign_tau', action='store_true')
    parser.set_defaults(assign_tau=False)
    args = parser.parse_args()

    output_stub = args.output_stub

    # Per-run log file; filemode='w' overwrites any previous run's log.
    log_file_name = output_stub + "_log_file.txt"

    logging.basicConfig(
        filename=log_file_name,
        level=logging.INFO,
        filemode='w',  # Overwrites old log file
        format='%(asctime)s:%(levelname)s:%(name)s:%(message)s')

    # Seed both the numpy RNG and the sampletau extension RNG so runs
    # are reproducible.
    logging.info('Seed random number generators = %d' % (args.random_seed))
    prng = RandomState(args.random_seed)
    sampletau.initRNG()
    sampletau.setRNG(args.random_seed)

    # Read the four mandatory CSV inputs (first column is the index).
    logging.info('Read in SCG coverages from %s' % (args.scg_cov_file))
    scg_cov = p.read_csv(args.scg_cov_file, header=0, index_col=0)
    logging.info('Read gamma from %s' % (args.gamma_star_file))
    gamma_star = p.read_csv(args.gamma_star_file, header=0, index_col=0)
    logging.info('Read gene coverages from %s' % (args.cov_file))
    cov = p.read_csv(args.cov_file, header=0, index_col=0)
    logging.info('Read epsilon from %s' % (args.epsilon_file))
    epsilon = p.read_csv(args.epsilon_file, header=0, index_col=0)
    # .values replaces DataFrame.as_matrix(), which was deprecated and
    # removed in pandas 1.0; the returned ndarray is equivalent.
    epsilon_matrix = epsilon.values

    if args.variant_file is not None:
        logging.info('Read variants from %s' % (args.variant_file))
        variants = p.read_csv(args.variant_file, header=0, index_col=0)
    else:
        variants = None

    # Restrict every input to the samples present in all three files.
    gamma_names = gamma_star.index.values
    scg_names = scg_cov.index.values

    intersect_names1 = sorted(intersect(gamma_names, scg_names))
    intersect_names = sorted(intersect(cov.columns.values, intersect_names1))
    logging.info('Found %d samples common to all 3 input files' % (len(intersect_names)))

    scg_cov = scg_cov.reindex(intersect_names)
    gamma_star = gamma_star.reindex(intersect_names)

    total_mean = scg_cov['mean'].values
    total_sd = scg_cov['sd'].values

    # Renormalise gamma so each sample's strain frequencies sum to one.
    gamma_star_matrix = gamma_star.values
    row_sums = gamma_star_matrix.sum(axis=1)
    gamma_star_matrix = gamma_star_matrix / row_sums[:, np.newaxis]

    # Expected per-strain coverage in each sample.
    delta = np.multiply(gamma_star_matrix, total_mean[:, np.newaxis])

    # Reorder coverage matrix columns to the common sample order.
    cov = cov[intersect_names]
    cov_matrix = cov.values
    logging.info('Perform KL estimation of contig counts')
    klassign = KLAssign(prng, cov_matrix, delta)
    klassign.factorize()

    # BUG FIX: variants_intersect was previously bound only inside the
    # "variants is not None" branch, raising NameError whenever no
    # --variant_file was supplied.  Default to None (assumes Eta_Sampler
    # tolerates variants=None -- TODO confirm).
    variants_intersect = None
    if variants is not None:
        expanded_names = expand_sample_names(intersect_names)
        variants_intersect = variants[expanded_names]

    # Round the continuous KL estimate to integer counts, then refine
    # with the Gaussian Gibbs sampler.
    etaD = np.rint(klassign.eta)

    etaSampler = es.Eta_Sampler(prng, variants_intersect, cov, gamma_star_matrix, delta, total_sd, epsilon_matrix, etaD,
        max_iter=args.iter_max, max_eta=args.eta_max, max_var=args.var_max)

    # Two sampling rounds, as in the original implementation (intent
    # unclear -- possibly burn-in plus sampling; preserved as-is).
    etaSampler.update()

    etaSampler.update()

    # Optionally assign tau haplotypes given the sampled eta_star.
    contig_names = cov.index.tolist()

    if args.assign_tau is True:

        etaSampler.restoreFullVariants()

        etaSampler.calcTauStar(etaSampler.eta_star)

        (tau_star, tau_mean, pos, contig_index) = etaSampler.getTauStar(variants)
        V = tau_star.shape[0]

        # Flatten tau to V rows of G*4 columns and move the Position
        # column to the front before writing.
        tau_res = np.reshape(tau_star, (V, etaSampler.G * 4))
        tau_df = p.DataFrame(tau_res, index=contig_index)
        tau_df['Position'] = pos
        cols = tau_df.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        tau_df = tau_df[cols]
        tau_df.to_csv(output_stub + "_tau_star.csv")
        logging.info("Wrote tau star haplotype predictions")

        tau_mean_res = np.reshape(tau_mean, (V, etaSampler.G * 4))
        tau_mean_df = p.DataFrame(tau_mean_res, index=contig_index)
        tau_mean_df['Position'] = pos
        cols = tau_mean_df.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        tau_mean_df = tau_mean_df[cols]
        tau_mean_df.to_csv(output_stub + "_tau_mean.csv")
        logging.info("Wrote tau mean haplotype predictions")

    # Write the four eta estimates: rounded KL (etaD), sampler MAP
    # (eta_star), sampler posterior mean, and the raw KL solution.
    # NOTE(review): these file names lack the "_" separator used by the
    # tau outputs; preserved for compatibility with downstream tooling.
    etaD_df = p.DataFrame(etaD, index=contig_names)
    etaD_df.to_csv(output_stub + "etaD_df.csv")

    etaS_df = p.DataFrame(etaSampler.eta_star, index=contig_names)
    etaS_df.to_csv(output_stub + "etaS_df.csv")

    etaM_df = p.DataFrame(np.mean(etaSampler.eta_store, axis=0), index=contig_names)
    etaM_df.to_csv(output_stub + "etaM_df.csv")

    eta_df = p.DataFrame(klassign.eta, index=contig_names)
    eta_df.to_csv(output_stub + "eta_df.csv")

    # Optional validation against a known genome composition.
    if args.genomes:
        genomes = p.read_csv(args.genomes, header=0, index_col=0)
        genomes = genomes.loc[contig_names]
        genomes_M = genomes.values
        genomes_D = np.copy(genomes_M)

        (dtotal, dacc, dacc_array) = compGenes(etaD, genomes_D)
        (stotal, sacc, sacc_array) = compGenes(etaSampler.eta_star, genomes_D)

        logging.info('KL accurracy = %f' % (dtotal))
        logging.info('Gibbs sampler accurracy = %f' % (stotal))