def main(): #parse command line arguments parser = argparse.ArgumentParser(description='Performs fetal CNV analysis from maternal plasma and phased parental data.') parser.add_argument('input', type=str, nargs=1, help='path to input file with allele counts in plasma and parental haplotypes') parser.add_argument('target', type=str, nargs=1, help='path to file with background truth - "target"') parser.add_argument('plasma', type=str, nargs=1, help='path to file with plasma sequencing DOC for all chromosomal positions') parser.add_argument('ref', type=str, nargs=1, help='path to file with reference plasma sequencing DOC for all chromosomal positions') parser.add_argument('seq', type=str, nargs=1, help='path to ref. genomic sequence in fasta format') parser.add_argument('param', type=str, nargs=1, help='path to file with method parameters') parser.add_argument('--ff', type=float, help='fetal mixture ratio', default=-1.) parser.add_argument('--useCvrg', help='use coverage flag', action="store_true") parser.add_argument('--trainGrad', type=str, help='train by maxll gradient and output new params to given file', default="") parser.add_argument('--trainMargin', type=str, help='train by max margin and output new params to given file', default="") parser.add_argument('--getObsCounts', help='get observed allele counts', action="store_true") args = parser.parse_args() in_file_name = args.input[0] target_file_name = args.target[0] plasma_doc_file = open(args.plasma[0], "r") ref_doc_file = open(args.ref[0], "r") seq_file = open(args.seq[0], "r") param_file = open(args.param[0], "r") if args.ff > 0: mix = args.ff runGradTraining = False if args.trainGrad != "": runGradTraining = True res_param_file_name = args.trainGrad runMarginTraining = False if args.trainMargin != "": runMarginTraining = True res_param_file_name = args.trainMargin #print input info print "------------------------------------------" print "Running fCNV, input parameters:" print "input:", in_file_name print "target:", target_file_name print "plasma:", plasma_doc_file print "refDOC:", ref_doc_file print "seq:", seq_file print "param:", param_file print "--ff:", args.ff print "--useCvrg:", args.useCvrg print "--trainGrad:", args.trainGrad print "------------------------------------------" os.system("hostname") #read the pre-processed input snp_positions, samples, M, P, MSC, PSC = readInput(in_file_name) #fetch the method parameters from file crfParams = readParams(param_file) print "============ CRF PARAMETERS ==============" for p in sorted(crfParams): print p, "=", crfParams[p] print "==========================================" #get genomic positions on the last lines of the pileup files to estimate the length of the chromosome if args.useCvrg: with open(args.plasma[0], 'rb') as fh: fh.seek(-256, 2) last_pos_plasma = int(fh.readlines()[-1].decode().split(' ')[0]) fh.close() # with open(args.ref[0], 'rb') as fh: # fh.seek(-256, 2) # last_pos_ref = int(fh.readlines()[-1].decode().split(' ')[0]) # fh.close() chr_length = last_pos_plasma + 4742 gc_sum = [0] * chr_length prefix_sum_plasma = [0] * chr_length prefix_count_plasma = [0] * chr_length prefix_sum_ref = [0] * chr_length prefix_count_ref = [0] * chr_length #get GC content prefix sums from the reference gen_pos = 0 keep_reading = True while keep_reading: line = seq_file.readline().strip().upper() if len(line) == 0: break if line[0] == '>': continue for i in range(len(line)): gc_sum[gen_pos] = gc_sum[max(gen_pos - 1, 0)] if line[i] in 'GC': gc_sum[gen_pos] += 1 gen_pos += 1 if gen_pos >= chr_length: keep_reading = False break seq_file.close() last = 0 while True: line = plasma_doc_file.readline() if not line: break row = map(int, line.split(' ')) if row[0] >= chr_length: break prefix_sum_plasma[row[0]] = prefix_sum_plasma[last] + row[1] prefix_count_plasma[row[0]] = prefix_count_plasma[last] + 1 last = row[0] plasma_doc_file.close() last = 0 while True: line = ref_doc_file.readline() if not line: break row = map(int, line.split(' ')) if row[0] >= chr_length: break prefix_sum_ref[row[0]] = prefix_sum_ref[last] + row[1] prefix_count_ref[row[0]] = prefix_count_ref[last] + 1 last = row[0] ref_doc_file.close() #ENDIF #snp_positions = [] ground_truth = [] target_file = open(target_file_name, 'r') for line in target_file.readlines(): line = line.rstrip("\n").split("\t") #snp_positions.append(int(line[0])) ground_truth.append(int(line[-1])) target_file.close() #fcnv = fcnvCRF.FCNV(crfParams, None, None, False) #mix, mix_median, ct = fcnv.estimateMixture(samples, M, P) #del fcnv #print "Est. Mixture: ", mix, mix_median, '(', ct ,')', mix = 0.13 #proportion of fetal genome in plasma if args.ff > 0: mix = args.ff print "used:", mix cnv_prior = None if args.useCvrg: cvrg = cvrgHMM.coverageFCNV(snp_positions, prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum) cvrg_posterior = cvrg.posteriorDecoding(mix) del prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum cnv_prior = [ [0., 0., 0.] for x in range(len(snp_positions)) ] for pos in range(len(cvrg_posterior)): for cp_num_posterior in cvrg_posterior[pos]: cnv_prior[pos][cp_num_posterior[1]] = cp_num_posterior[0] del cvrg, cvrg_posterior fcnv = fcnvCRF.FCNV(crfParams, snp_positions, cnv_prior, args.useCvrg) parameterStats = dict() #get get observed allele counts if args.getObsCounts: parameterStats = fcnv.computeCountsTable(ground_truth, samples, M, P, MSC, PSC, mix, parameterStats) varFileName = target_file_name.split('/')[-1].split('.')[0].replace(':', '-')+'.observedCounts.txt' varFile = open(varFileName, 'w') print >>varFile, len(parameterStats) for expP in parameterStats.keys(): print >>varFile, expP print >>varFile, " ".join(map( lambda x: '/'.join(map(str, x)), parameterStats[expP]) ) varFile.close() return 0 #run gradient training if runGradTraining: #run the training iterations for iterNum in range(1): #print "iterNum: ", iterNum ll, params = fcnv.computeLLandGradient(ground_truth, samples, M, P, MSC, PSC, mix) print ll, params print "------------------------------------------------------------------" #save the trained parameters to the file res_param_file = open(res_param_file_name, "w") for p in sorted(params): if isinstance(params[p], list): print >>res_param_file, p, "=", " ".join(map(str, params[p])) else: print >>res_param_file, p, "=", params[p] res_param_file.close() return 0 #run max margin training if runMarginTraining: #run the training iterations for iterNum in range(5): #change print "iterNum: ", iterNum compute_postloss = True #change + the const C pregts, preps, preloss, params, postgts, postps, postloss = fcnv.computeLLandMaxMarginUpdate(ground_truth, samples, M, P, MSC, PSC, mix, 0.0001, compute_postloss) print preloss, params print "{0} !>= {1}".format(pregts - preps, preloss) if compute_postloss: print "{0} >= {1}".format(postgts - postps, postloss) print "------------------------------------------------------------------\n\n\n" #save the trained parameters to the file res_param_file = open(res_param_file_name, "w") for p in sorted(params): if isinstance(params[p], list): print >>res_param_file, p, "=", " ".join(map(str, params[p])) else: print >>res_param_file, p, "=", params[p] res_param_file.close() return 0 #res_file = open(out_file_name, 'w') file_name_prefix = target_file_name.split('/')[-1].split('.')[0].replace(':', '-') print "------------------ w/o TRAINING -------------------" test(fcnv, snp_positions, samples, M, P, MSC, PSC, mix, ground_truth, file_name_prefix)
def main(): #parse command line arguments parser = argparse.ArgumentParser(description='Performs fetal CNV analysis from maternal plasma and phased parental data.') parser.add_argument('target', type=str, nargs=1, help='path to file with background truth - "target"') parser.add_argument('plasma', type=str, nargs=1, help='path to file with plasma sequencing DOC for all chromosomal positions') parser.add_argument('ref', type=str, nargs=1, help='path to file with reference plasma sequencing DOC for all chromosomal positions') parser.add_argument('seq', type=str, nargs=1, help='path to ref. genomic sequence in fasta format') args = parser.parse_args() target_file_name = args.target[0] plasma_doc_file = open(args.plasma[0], "r") ref_doc_file = open(args.ref[0], "r") seq_file = open(args.seq[0], "r") #get genomic positions on the last lines of the pileup files to estimate the length of the chromosome with open(args.plasma[0], 'rb') as fh: fh.seek(-256, 2) last_pos_plasma = int(fh.readlines()[-1].decode().split(' ')[0]) fh.close() with open(args.ref[0], 'rb') as fh: fh.seek(-256, 2) last_pos_ref = int(fh.readlines()[-1].decode().split(' ')[0]) fh.close() chr_length = max(last_pos_plasma, last_pos_ref) + 4742 gc_sum = [0] * chr_length prefix_sum_plasma = [0] * chr_length prefix_count_plasma = [0] * chr_length prefix_sum_ref = [0] * chr_length prefix_count_ref = [0] * chr_length #get GC content prefix sums from the reference gen_pos = 0 keep_reading = True while keep_reading: line = seq_file.readline().strip().upper() if len(line) == 0: break if line[0] == '>': continue for i in range(len(line)): gc_sum[gen_pos] = gc_sum[max(gen_pos - 1, 0)] if line[i] in 'GC': gc_sum[gen_pos] += 1 gen_pos += 1 if gen_pos >= chr_length: keep_reading = False break seq_file.close() last = 0 for line in plasma_doc_file: row = map(int, line.split(' ')) prefix_sum_plasma[row[0]] = prefix_sum_plasma[last] + row[1] prefix_count_plasma[row[0]] = prefix_count_plasma[last] + 1 last = row[0] plasma_doc_file.close() last = 0 for line in ref_doc_file: row = map(int, line.split(' ')) prefix_sum_ref[row[0]] = prefix_sum_ref[last] + row[1] prefix_count_ref[row[0]] = prefix_count_ref[last] + 1 last = row[0] ref_doc_file.close() snp_positions = [] ground_truth = [] target_file = open(target_file_name, 'r') for line in target_file.readlines(): line = line.rstrip("\n").split("\t") snp_positions.append(int(line[0])) ground_truth.append(int(line[-1])) target_file.close() fcnv = cvrgHMM.coverageFCNV(snp_positions, prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum) mix = 0.13 #proportion of fetal genome in plasma #mix = fcnv.estimateMixture(samples, M, P) print "Est. Mixture: ", mix #res_file = open(out_file_name, 'w') file_name_prefix = target_file_name.split('/')[-1].split('.')[0].replace(':', '-') print "------------------ w/o TRAINING -------------------" test(fcnv, snp_positions, mix, ground_truth, file_name_prefix)
def main(): #parse command line arguments parser = argparse.ArgumentParser( description= 'Performs fetal CNV analysis from maternal plasma and phased parental data.' ) parser.add_argument( 'input', type=str, nargs=1, help= 'path to input file with allele counts in plasma and parental haplotypes' ) parser.add_argument('target', type=str, nargs=1, help='path to file with background truth - "target"') parser.add_argument( 'plasma', type=str, nargs=1, help= 'path to file with plasma sequencing DOC for all chromosomal positions' ) parser.add_argument( 'ref', type=str, nargs=1, help= 'path to file with reference plasma sequencing DOC for all chromosomal positions' ) parser.add_argument('seq', type=str, nargs=1, help='path to ref. genomic sequence in fasta format') parser.add_argument('--ff', type=float, help='fetal mixture ratio', default=-1.) parser.add_argument('--useCvrg', help='use coverage flag', action="store_true") args = parser.parse_args() in_file_name = args.input[0] target_file_name = args.target[0] plasma_doc_file = open(args.plasma[0], "r") ref_doc_file = open(args.ref[0], "r") seq_file = open(args.seq[0], "r") if args.ff > 0: mix = args.ff #print input info print "------------------------------------------" print "Running fCNV, input parameters:" print "input:", in_file_name print "target:", target_file_name print "plasma:", plasma_doc_file print "refDOC:", ref_doc_file print "seq:", seq_file print "--ff:", args.ff print "--useCvrg:", args.useCvrg print "------------------------------------------" os.system("hostname") #read the pre-processed input snp_positions, samples, M, P, MSC, PSC = readInput(in_file_name) #get genomic positions on the last lines of the pileup files to estimate the length of the chromosome with open(args.plasma[0], 'rb') as fh: fh.seek(-256, 2) last_pos_plasma = int(fh.readlines()[-1].decode().split(' ')[0]) fh.close() # with open(args.ref[0], 'rb') as fh: # fh.seek(-256, 2) # last_pos_ref = int(fh.readlines()[-1].decode().split(' ')[0]) # fh.close() chr_length = last_pos_plasma + 4742 gc_sum = [0] * chr_length prefix_sum_plasma = [0] * chr_length prefix_count_plasma = [0] * chr_length prefix_sum_ref = [0] * chr_length prefix_count_ref = [0] * chr_length #get GC content prefix sums from the reference gen_pos = 0 keep_reading = True while keep_reading: line = seq_file.readline().strip().upper() if len(line) == 0: break if line[0] == '>': continue for i in range(len(line)): gc_sum[gen_pos] = gc_sum[max(gen_pos - 1, 0)] if line[i] in 'GC': gc_sum[gen_pos] += 1 gen_pos += 1 if gen_pos >= chr_length: keep_reading = False break seq_file.close() last = 0 while True: line = plasma_doc_file.readline() if not line: break row = map(int, line.split(' ')) if row[0] >= chr_length: break prefix_sum_plasma[row[0]] = prefix_sum_plasma[last] + row[1] prefix_count_plasma[row[0]] = prefix_count_plasma[last] + 1 last = row[0] plasma_doc_file.close() last = 0 while True: line = ref_doc_file.readline() if not line: break row = map(int, line.split(' ')) if row[0] >= chr_length: break prefix_sum_ref[row[0]] = prefix_sum_ref[last] + row[1] prefix_count_ref[row[0]] = prefix_count_ref[last] + 1 last = row[0] ref_doc_file.close() #snp_positions = [] ground_truth = [] target_file = open(target_file_name, 'r') while True: line = target_file.readline() if not line: break line = line.rstrip("\n").split("\t") #snp_positions.append(int(line[0])) ground_truth.append(int(line[-1])) target_file.close() fcnv = fcnvHMM.FCNV(None, None, False) mix, mix_median, ct = fcnv.estimateMixture(samples, M, P) print "Est. Mixture: ", mix, mix_median, '(', ct, ')', #mix = 0.13 #proportion of fetal genome in plasma if args.ff > 0: mix = args.ff print "used:", mix cnv_prior = None if args.useCvrg: cvrg = cvrgHMM.coverageFCNV(snp_positions, prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum) cvrg_posterior = cvrg.posteriorDecoding(mix) # byLL = cvrg.likelihoodDecoding(mix) del prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum cnv_prior = [[0., 0., 0.] for x in range(len(snp_positions))] for pos in range(len(cvrg_posterior)): for cp_num_posterior in cvrg_posterior[pos]: cnv_prior[pos][cp_num_posterior[1]] = cp_num_posterior[0] del cvrg, cvrg_posterior # ll_state = [] # ll_value = [] # for x in byLL[pos]: # ll_state.append(x[1]) # ll_value.append(x[0]) # # ll_str = '' # for j in range(len(ll_state)): # cvrg.logNormalize(ll_value) # ll_str += "%.8f"%math.exp(ll_value[j])+' '+str(ll_state[j])+' | ' # # posterior_str = '' # for x in cvrg_posterior[pos]: # posterior_str += "%.8f"%math.exp(x[0])+' '+str(x[1])+' | ' # posterior_str+='\t' # print snp_positions[pos], 'PP:', posterior_str, 'LL:', ll_str # del cvrg, cvrg_posterior, byLL del fcnv fcnv = fcnvHMM.FCNV(snp_positions, cnv_prior, args.useCvrg) ground_truth = [] target_file = open(target_file_name, 'r') for line in target_file.readlines(): line = line.rstrip("\n").split("\t") ground_truth.append(int(line[-1])) target_file.close() #res_file = open(out_file_name, 'w') file_name_prefix = target_file_name.split('/')[-1].split('.')[0].replace( ':', '-') print "------------------ w/o TRAINING -------------------" test(fcnv, snp_positions, samples, M, P, MSC, PSC, mix, ground_truth, file_name_prefix) #test(fcnv, snp_positions[:1000], samples[:1000], M[:1000], P[:1000], MSC[:1000], PSC[:1000], mix, ground_truth[:1000], file_name_prefix) '''
def main(): #parse command line arguments parser = argparse.ArgumentParser(description='Performs fetal CNV analysis from maternal plasma and phased parental data.') parser.add_argument('input', type=str, nargs=1, help='path to input file with allele counts in plasma and parental haplotypes') parser.add_argument('target', type=str, nargs=1, help='path to file with background truth - "target"') parser.add_argument('plasma', type=str, nargs=1, help='path to file with plasma sequencing DOC for all chromosomal positions') parser.add_argument('ref', type=str, nargs=1, help='path to file with reference plasma sequencing DOC for all chromosomal positions') parser.add_argument('seq', type=str, nargs=1, help='path to ref. genomic sequence in fasta format') parser.add_argument('--ff', type=float, help='fetal mixture ratio', default=-1.) parser.add_argument('--useCvrg', help='use coverage flag', action="store_true") args = parser.parse_args() in_file_name = args.input[0] target_file_name = args.target[0] plasma_doc_file = open(args.plasma[0], "r") ref_doc_file = open(args.ref[0], "r") seq_file = open(args.seq[0], "r") if args.ff > 0: mix = args.ff #print input info print "------------------------------------------" print "Running fCNV, input parameters:" print "input:", in_file_name print "target:", target_file_name print "plasma:", plasma_doc_file print "refDOC:", ref_doc_file print "seq:", seq_file print "--ff:", args.ff print "--useCvrg:", args.useCvrg print "------------------------------------------" os.system("hostname") #read the pre-processed input snp_positions, samples, M, P, MSC, PSC = readInput(in_file_name) #get genomic positions on the last lines of the pileup files to estimate the length of the chromosome with open(args.plasma[0], 'rb') as fh: fh.seek(-256, 2) last_pos_plasma = int(fh.readlines()[-1].decode().split(' ')[0]) fh.close() # with open(args.ref[0], 'rb') as fh: # fh.seek(-256, 2) # last_pos_ref = int(fh.readlines()[-1].decode().split(' ')[0]) # fh.close() chr_length = last_pos_plasma + 4742 gc_sum = [0] * chr_length prefix_sum_plasma = [0] * chr_length prefix_count_plasma = [0] * chr_length prefix_sum_ref = [0] * chr_length prefix_count_ref = [0] * chr_length #get GC content prefix sums from the reference gen_pos = 0 keep_reading = True while keep_reading: line = seq_file.readline().strip().upper() if len(line) == 0: break if line[0] == '>': continue for i in range(len(line)): gc_sum[gen_pos] = gc_sum[max(gen_pos - 1, 0)] if line[i] in 'GC': gc_sum[gen_pos] += 1 gen_pos += 1 if gen_pos >= chr_length: keep_reading = False break seq_file.close() last = 0 while True: line = plasma_doc_file.readline() if not line: break row = map(int, line.split(' ')) if row[0] >= chr_length: break prefix_sum_plasma[row[0]] = prefix_sum_plasma[last] + row[1] prefix_count_plasma[row[0]] = prefix_count_plasma[last] + 1 last = row[0] plasma_doc_file.close() last = 0 while True: line = ref_doc_file.readline() if not line: break row = map(int, line.split(' ')) if row[0] >= chr_length: break prefix_sum_ref[row[0]] = prefix_sum_ref[last] + row[1] prefix_count_ref[row[0]] = prefix_count_ref[last] + 1 last = row[0] ref_doc_file.close() #snp_positions = [] ground_truth = [] target_file = open(target_file_name, 'r') while True: line = target_file.readline() if not line: break line = line.rstrip("\n").split("\t") #snp_positions.append(int(line[0])) ground_truth.append(int(line[-1])) target_file.close() fcnv = fcnvHMM.FCNV(None, None, False) mix, mix_median, ct = fcnv.estimateMixture(samples, M, P) print "Est. Mixture: ", mix, mix_median, '(', ct ,')', #mix = 0.13 #proportion of fetal genome in plasma if args.ff > 0: mix = args.ff print "used:", mix cnv_prior = None if args.useCvrg: cvrg = cvrgHMM.coverageFCNV(snp_positions, prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum) cvrg_posterior = cvrg.posteriorDecoding(mix) # byLL = cvrg.likelihoodDecoding(mix) del prefix_sum_plasma, prefix_count_plasma, prefix_sum_ref, prefix_count_ref, gc_sum cnv_prior = [ [0., 0., 0.] for x in range(len(snp_positions)) ] for pos in range(len(cvrg_posterior)): for cp_num_posterior in cvrg_posterior[pos]: cnv_prior[pos][cp_num_posterior[1]] = cp_num_posterior[0] del cvrg, cvrg_posterior # ll_state = [] # ll_value = [] # for x in byLL[pos]: # ll_state.append(x[1]) # ll_value.append(x[0]) # # ll_str = '' # for j in range(len(ll_state)): # cvrg.logNormalize(ll_value) # ll_str += "%.8f"%math.exp(ll_value[j])+' '+str(ll_state[j])+' | ' # # posterior_str = '' # for x in cvrg_posterior[pos]: # posterior_str += "%.8f"%math.exp(x[0])+' '+str(x[1])+' | ' # posterior_str+='\t' # print snp_positions[pos], 'PP:', posterior_str, 'LL:', ll_str # del cvrg, cvrg_posterior, byLL del fcnv fcnv = fcnvHMM.FCNV(snp_positions, cnv_prior, args.useCvrg) ground_truth = [] target_file = open(target_file_name, 'r') for line in target_file.readlines(): line = line.rstrip("\n").split("\t") ground_truth.append(int(line[-1])) target_file.close() #res_file = open(out_file_name, 'w') file_name_prefix = target_file_name.split('/')[-1].split('.')[0].replace(':', '-') print "------------------ w/o TRAINING -------------------" test(fcnv, snp_positions, samples, M, P, MSC, PSC, mix, ground_truth, file_name_prefix) #test(fcnv, snp_positions[:1000], samples[:1000], M[:1000], P[:1000], MSC[:1000], PSC[:1000], mix, ground_truth[:1000], file_name_prefix) '''