예제 #1
0
def _get_genotype_data_(p_dict):
    """
    Load the SNP genotype data described by the parameter dictionary.

    When p_dict['data_file'] is set (truthy) the genotypes are parsed from
    that file with dataParsers.parse_snp_data; otherwise the data set
    registered under p_dict['call_method_id'] is loaded with
    dataParsers.load_snps_call_method.  In both cases p_dict['data_format']
    and p_dict['debug_filter'] are forwarded to the loader.

    Returns whatever the selected dataParsers loader returns.
    """
    snp_file = p_dict['data_file']
    if not snp_file:
        # No explicit file given: fall back to the call method data set.
        return dataParsers.load_snps_call_method(
            p_dict['call_method_id'],
            data_format=p_dict['data_format'],
            debug_filter=p_dict['debug_filter'])
    return dataParsers.parse_snp_data(
        snp_file,
        format=p_dict['data_format'],
        filter=p_dict['debug_filter'])
def run_gwas(pid, call_method_id, run_id, kinship_method, debug_filter=1):
    """
    Run the local-vs-global mixed model scans and EMMAX step-wise for one phenotype.

    The SNP data for call_method_id is loaded and coordinated with phenotype
    pid read from 'phen_with_swedish_082211.csv'.  The phenotype is
    transformed with 'most_normal' and a kinship matrix is computed from the
    coordinated SNP data.  The following results are then written under
    env.env['results_dir'] (all file names start with run_id):

      * a phenotype histogram annotated with the pseudo-heritability,
      * the lm.chrom_vs_rest_mm results (also printed to stdout),
      * lm.local_vs_global_mm_scan results for a series of window sizes,
      * lm.local_vs_global_gene_mm_scan results for a series of gene radii,
      * lm.emmax_step_wise output, run after sd.filter_mac_snps(15).

    Arguments:
      pid             -- phenotype id (formatted with %d in the file names).
      call_method_id  -- id of the SNP data set to load (formatted with %d).
      run_id          -- string used as prefix of all result file names.
      kinship_method  -- 'ibd' or 'ibs'.
      debug_filter    -- if < 1 it is handed to sd.sample_snps() to
                         down-sample the SNPs.

    Raises ValueError if kinship_method is neither 'ibd' nor 'ibs'.
    """
    # Fail fast: without this check an unsupported method would only surface
    # as a NameError on global_k after all the data had been loaded.
    if kinship_method not in ('ibd', 'ibs'):
        raise ValueError("Unsupported kinship_method %r (expected 'ibd' or 'ibs')"
                         % (kinship_method,))

    # Load the genotypes, optionally down-sampled for debugging.
    sd = dp.load_snps_call_method(call_method_id)
    if debug_filter < 1:
        sd.sample_snps(debug_filter)

    # Load the phenotypes and bring genotypes and phenotypes in line.
    phenotype_file = env.env['phen_dir'] + 'phen_with_swedish_082211.csv'
    phed = pd.parse_phenotype_file(phenotype_file)
    phed.convert_to_averages()
    phen_name = phed.get_name(pid)
    sd.coordinate_w_phenotype_data(phed, pid)
    phed.transform(pid, 'most_normal')
    phen_vals = phed.get_values(pid)

    if kinship_method == 'ibd':
        global_k = sd.get_ibd_kinship_matrix()
    else:
        global_k = sd.get_ibs_kinship_matrix()

    results_dir = env.env['results_dir']

    # Phenotype histogram with the pseudo-heritability.
    p_her = phed.get_pseudo_heritability(pid, global_k)
    hist_file = results_dir + '%s_%s_%d_%d_%s_hist.png' % \
        (run_id, kinship_method, call_method_id, pid, phen_name)
    phed.plot_histogram(pid, p_her=p_her, png_file=hist_file)

    # Chromosomes.
    res_dict = lm.chrom_vs_rest_mm(phen_vals, sd, kinship_method, global_k)
    print(res_dict)
    file_prefix = results_dir + '%s_loc_v_glob_chrom_%s_%d_%d_%s' % \
        (run_id, kinship_method, call_method_id, pid, phen_name)
    _write_res_dict_to_file_2_(file_prefix + '.csv', res_dict)

    # Now 'normal' window sizes; the fifth scan argument is half the window.
    for ws in [3000000, 1000000, 500000, 200000, 100000, 50000, 20000]:
        file_prefix = results_dir + '%s_loc_v_glob_%s_%d_%d_%d_%s' % \
            (run_id, kinship_method, call_method_id, ws, pid, phen_name)
        # Floor division keeps this an integer on Python 3 as well.
        res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix, ws, ws // 2,
                                              kinship_method, global_k)
        _write_res_dict_to_file_(file_prefix + '.csv', res_dict)

    # Now gene-centralized.
    for radius in [20000, 10000, 5000]:
        file_prefix = results_dir + '%s_loc_v_glob_gene_%s_%d_%d_%d_%s' % \
            (run_id, kinship_method, call_method_id, radius, pid, phen_name)
        res_dict = lm.local_vs_global_gene_mm_scan(phen_vals, sd, file_prefix, radius,
                                                   kinship_method, global_k)
        _write_res_dict_to_file_3_(file_prefix + '.csv', res_dict)

    # EMMAX step-wise, run after the MAC filter with threshold 15.
    sd.filter_mac_snps(15)
    file_prefix = results_dir + '%s_emmax_stepwise_%s_%d_%d_%s' % \
        (run_id, kinship_method, call_method_id, pid, phen_name)
    lm.emmax_step_wise(phen_vals, global_k, sd=sd, num_steps=10,
                       file_prefix=file_prefix, save_pvals=True)
def identify_interesting_haplotypes(chrom_pos_list, phenotype_file, pid):
    """
    Collect the SNPs at the given positions and hand them to
    identify_interesting_accessions.

    The call method 76 SNP data is loaded and coordinated with phenotype pid
    (fetched with pd.get_phenotypes_from_db).  Every entry of chrom_pos_list
    is looked up in the data's chromosome/position list; entry[0] is
    recorded as the chromosome and entry[1] as the position.  A freshly
    loaded copy of the call method 76 data is then passed on, together with
    the collected SNPs and the ecotypes of the phenotype.

    The phenotype_file argument is not used.

    Raises Exception('SNP not found') if an entry of chrom_pos_list has no
    exact match in the chromosome/position list.
    """
    import dataParsers as dp
    import bisect

    genotypes = dp.load_snps_call_method(76)  # Full sequence data.
    phed = pd.get_phenotypes_from_db([pid])
    phed.convert_to_averages()
    genotypes.coordinate_w_phenotype_data(phed, pid)
    chrom_pos_index = genotypes.getChrPosList()
    all_snps = genotypes.getSnps()

    snps, snp_chromosomes, snp_positions = [], [], []
    for query in chrom_pos_list:
        # Binary search (assumes the chromosome/position list is sorted):
        # take the last entry that is <= query and require an exact match.
        hit = bisect.bisect_right(chrom_pos_index, query) - 1
        if chrom_pos_index[hit] != query:
            raise Exception('SNP not found')
        snps.append(all_snps[hit])
        snp_chromosomes.append(query[0])
        snp_positions.append(query[1])

    # A second, freshly loaded copy of the data is what gets passed on.
    fresh_genotypes = dp.load_snps_call_method(76)
    identify_interesting_accessions(fresh_genotypes, snps, snp_chromosomes,
                                    snp_positions, phed.get_ecotypes(pid))
예제 #4
0
def create_diploid_dataset(call_method_id=76,
                           file_name='/tmp/test.csv',
                           coding_type='normal'):
    """
    Build a SNP data set of offspring genotypes from pairs of parent accessions.

    The parent pairs are read from 'heterozygous_genotypes.csv' in
    env.env['data_dir'] (first line skipped as header, first two
    comma-separated fields used as the parent accession names).  For every
    pair where both parents occur in the loaded SNP data, an offspring
    genotype is derived per SNP and labelled '<parent1>_<parent2>'.  The
    resulting data set is written to file_name.

    Arguments:
      call_method_id -- id of the SNP data set holding the parent genotypes.
      file_name      -- where the new data set is written.
      coding_type    -- 'normal': offspring = sum of the parent genotypes;
                        'dominant': offspring = bitwise XOR of the parent
                        genotypes.

    Raises ValueError for any other coding_type.
    """
    # Validate first: previously an unknown coding_type left the offspring
    # genotype unbound (NameError) or silently reused the previous pair's.
    if coding_type not in ('normal', 'dominant'):
        raise ValueError("Unsupported coding_type %r (expected 'normal' or 'dominant')"
                         % (coding_type,))

    # Load parent list
    parents = []
    with open(env.env['data_dir'] + 'heterozygous_genotypes.csv') as f:
        next(f)  # Skip the header line.
        for line in f:
            if not line.strip():
                continue  # Tolerate blank lines (e.g. at the end of the file).
            parents.append([field.strip() for field in line.split(',')])

    snpsd = dp.load_snps_call_method(call_method_id)
    # Column index of every accession in the SNP arrays.  If an accession
    # occurs more than once, its last column is used.
    acc_column = dict((acc, col) for col, acc in enumerate(snpsd.accessions))

    sds = []
    for i, chrom_sd in enumerate(snpsd.snpsDataList):
        snps = sp.array(chrom_sd.snps, dtype='int8')
        offspring_gts = []
        offspring_names = []
        for ps in parents:
            father, mother = ps[0], ps[1]
            if father not in acc_column or mother not in acc_column:
                continue  # At least one parent is missing from the SNP data.
            f_gt = snps[:, acc_column[father]].flatten()
            m_gt = snps[:, acc_column[mother]].flatten()
            if coding_type == 'normal':
                o_gt = f_gt + m_gt
            else:
                # NOTE(review): XOR is non-zero only where the parents
                # differ - confirm this is the intended 'dominant' coding.
                o_gt = sp.bitwise_xor(f_gt, m_gt)
            offspring_gts.append(o_gt)
            offspring_names.append('%s_%s' % (father, mother))
        # Transpose from one row per offspring to one row per SNP.
        snp_rows = list(sp.transpose(sp.array(offspring_gts, dtype='int8')))
        sds.append(
            snpsdata.SNPsData(snp_rows,
                              chrom_sd.positions,
                              accessions=offspring_names,
                              chromosome=i + 1))
    # NOTE(review): the chromosome list is hard-coded to five chromosomes,
    # while the per-chromosome data sets above are numbered from 1 upwards.
    dataset = snpsdata.SNPsDataSet(sds, [1, 2, 3, 4, 5])
    dataset.writeToFile(file_name)
def telomere_example_plots(debug_filter=1.0, pid=1365, call_method_id=78, radius=20000, kinship_method='ibs'):
    """
    Run the gene-centred local-vs-global scan for a fixed list of genes.

    The SNP data for call_method_id is loaded (and down-sampled with
    sample_snps when debug_filter < 1) and coordinated with phenotype pid
    from 'phen_with_swedish_082211.csv'.  After the 'most_normal'
    transformation a histogram of the phenotype is saved under
    env.env['results_dir'], and lm.local_vs_global_gene_mm_scan is called
    for the genes AT1G21390 ... AT1G21490 with plot_gene_trees=True.

    Nothing is returned; the result of the scan is discarded.
    """
    genes_of_interest = [
        'AT1G21390', 'AT1G21400', 'AT1G21410', 'AT1G21420',
        'AT1G21430', 'AT1G21440', 'AT1G21450', 'AT1G21460',
        'AT1G21470', 'AT1G21480', 'AT1G21490',
    ]

    # Genotypes, optionally down-sampled for debugging.
    snp_data = dp.load_snps_call_method(call_method_id)
    if debug_filter < 1:
        snp_data.sample_snps(debug_filter)

    # Phenotypes, coordinated with the genotypes and then transformed.
    phen_path = env.env['phen_dir'] + 'phen_with_swedish_082211.csv'
    phen_data = pd.parse_phenotype_file(phen_path)
    phen_data.convert_to_averages()
    phen_name = phen_data.get_name(pid)
    snp_data.coordinate_w_phenotype_data(phen_data, pid)
    phen_data.transform(pid, 'most_normal')

    # Histogram of the transformed phenotype values.
    hist_dir = env.env['results_dir']
    hist_png = hist_dir + 'histogram_%s_hist.png' % phen_data.get_name(pid)
    phen_data.plot_histogram(pid, png_file=hist_png)

    values = phen_data.get_values(pid)
    scan_prefix = env.env['results_dir'] + 'loc_v_glob_gene_%d_%d_%d_%s' % (
        call_method_id, radius, pid, phen_name)
    lm.local_vs_global_gene_mm_scan(values, snp_data, scan_prefix, radius, kinship_method,
                                    tair_ids=genes_of_interest, plot_gene_trees=True,
                                    ets=snp_data.accessions)
예제 #6
0
def _perform_gwas_(phen_id,
                   phenData,
                   analysis_method,
                   transformation,
                   genotype,
                   kinship_type,
                   kinshipFile=None,
                   messenger=None,
                   outputfile=None):
    additional_columns = {}
    messenger.update_status(progress=0.0, task_status='Loading genotype data')
    genotypeData = dataParsers.load_snps_call_method(genotype)
    #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype)
    K = None
    messenger.update_status(step=0.05, task_status='Preparing data')
    n_filtered_snps = _prepare_data_(genotypeData, phenData, phen_id)
    phen_vals = phenData.get_values(phen_id)
    if analysis_method in [
            'emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm', 'amm'
    ]:
        #Load genotype file (in binary format)
        sys.stdout.write("Retrieving the Kinship matrix K.\n")
        sys.stdout.flush()
        if kinshipFile:  #Kinship file was supplied..
            messenger.update_status(
                progress=0.15,
                task_status='Loading supplied kinship file: %s' % kinshipFile)
            print 'Loading supplied kinship file: %s' % kinshipFile
            K = kinship.load_kinship_from_file(kinshipFile,
                                               genotypeData.accessions)
        else:
            messenger.update_status(progress=0.15,
                                    task_status='Loading kinship file')
            print 'Loading kinship file.'
            K = kinship.get_kinship(call_method_id=genotype,
                                    method=kinship_type,
                                    n_removed_snps=n_filtered_snps,
                                    remain_accessions=genotypeData.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

    snps = genotypeData.getSnps()
    positions = genotypeData.getPositions()
    chromosomes = []
    for i, (s, c) in enumerate(
            itertools.izip(genotypeData.snpsDataList,
                           genotypeData.chromosomes)):
        chromosomes.extend([c] * len(s.snps))
        maf_dict = genotypeData.get_mafs()

    if analysis_method in ['kw']:
        messenger.update_status(progress=0.7, task_status='Performing KW')
        res = util.kruskal_wallis(snps, phen_vals)

    elif analysis_method in ['loc_glob_mm']:
        raise NotImplementedError
    elif analysis_method in ['emma']:
        res = lm.emma(snps, phen_vals, K)
    elif analysis_method in ['emmax', 'amm']:
        d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100)
        res = d['res']
        #additional_columns['stats'] = d['stats']
    elif analysis_method in ['lm']:
        d = lm.lin_reg_step(phen_vals, genotypeData, [])
        res = d['res']
        #additional_columns['stats'] = d['stats']
    else:
        raise Exception('analysis method %s not supported' % analysis_method)

    pvals = res['ps']

    #Calculate Benjamini-Hochberg threshold
    bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05)
    #Calculate Median p-value
    med_pval = agr.calc_median(res['ps'])
    #Calculate the Kolmogorov-Smirnov statistic
    ks_res = agr.calc_ks_stats(res['ps'])

    quantiles_dict = _calculate_qqplot_data_(pvals)
    scores = map(lambda x: -math.log10(x), pvals)

    if analysis_method in ['lm', 'emma', 'emmax', 'amm']:
        additional_columns['genotype_var_perc'] = res['var_perc']
        if 'betas' in res:
            betas = map(list, zip(*res['betas']))
            additional_columns['beta0'] = betas[0]
            if len(betas) > 1:
                additional_columns['beta1'] = betas[1]

    #calculate ld
    if outputfile is None:
        outputfile = "%s.hdf5" % phen_id
    messenger.update_status(progress=0.8,
                            task_status='Processing and saving results')
    _save_hdf5_pval_file(outputfile, analysis_method, transformation,
                         chromosomes, positions, scores, maf_dict['marfs'],
                         maf_dict['mafs'], quantiles_dict, ks_res,
                         bh_thres_d['thes_pval'], med_pval, additional_columns)
예제 #7
0
def load_and_plot_info_files(call_method_id=75, temperature=10, mac_threshold=15, debug_filter=1,
			near_const_filter=20, data_format='binary'):
	"""
	Summarise previously pickled per-gene GWAS results and plot the summaries.

	For every phenotype in the phenotype file '<phen_file_prefix>_<temperature>C.csv'
	the function looks for an '_info.pickled' file and the three p-value result files
	(mapping methods 'EX', 'LM' and 'KW') below
	'/srv/lab/data/rna_seq_083011/<temperature>C/cm_<call_method_id>/'.  Phenotypes
	for which all four files exist contribute to:

	  * cis/trans variance summaries (by radius and by upstream TSS distance),
	  * pseudo-heritabilities, transformations and Shapiro-Wilk p-values,
	  * p-value inflation statistics per mapping method,
	  * distance-to-smallest-p-value and Bonferroni significance bin counts,
	  * step-wise cofactor counts,
	  * one trimmed result dictionary per mapping method, which is pickled to
	    '<prefix>results_<method>_mac<mac_threshold>.pickled'.

	Afterwards a series of PNG figures is saved with the prefix
	env['results_dir'] + 'RNAseq_summary_<temperature>C_cm<call_method_id>'.

	Arguments:
	  call_method_id    -- SNP data set id, also part of the input/output paths.
	  temperature       -- part of the phenotype file name and of the paths.
	  mac_threshold     -- only used in the file names.
	  debug_filter      -- each phenotype is skipped when random.random() > debug_filter,
	                       so the default of 1 processes all phenotypes.
	  near_const_filter -- handed to phed.filter_near_const_phens().
	  data_format       -- handed to dp.load_snps_call_method().

	Returns nothing.

	NOTE(review): all averages and fractions below divide by counters that stay at
	zero when no info file contributed to a bin; see the notes at those divisions.
	"""
	import random

	# phen_file_prefix is a module-level name that is not defined in this function.
	phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature)
	phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
	phed.filter_near_const_phens(near_const_filter)
	phed.convert_to_averages()
	num_traits = phed.num_traits()
	pids = phed.phen_ids
	# The SNP loader always gets debug_filter=0.01 here; the debug_filter argument of
	# this function is only used further down to sub-sample the phenotypes.
	sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=0.01)
	indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)  #All phenotypes are ordered the same way, so we pick the first one.
	phed.filter_ecotypes(indices_to_keep, pids=pids)

	print 'Loading the gene annotation dictionary'
	gene_dict = dp.parse_tair_gff_file()
	run_id = 'd081511'
	#run_id = 'rs_%d' % call_method_id


	file_prefix = '/srv/lab/data/rna_seq_083011/%dC/cm_%d/' % (temperature, call_method_id)


	num_genes = 0

	# Accumulators for the cis-vs-trans summaries.  Sums are collected first and
	# turned into averages after the main loop; 'counts' holds the number of
	# contributing genes per radius / TSS distance.
	radii = [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]
	tss_dists = [200000, 100000, 50000, 25000, 10000, 5000, 1000]
	cvt_summary_dict = {'radius':{'avg_cis_trans_var_ratio':[0.0 for r in radii],
					'avg_cis_herit':[0.0 for r in radii],
					'avg_trans_herit':[0.0 for r in radii],
					'counts':[0.0 for td in radii]},
				'radius_herit':{'avg_cis_trans_var_ratio':[0.0 for r in radii],
					'avg_cis_herit':[0.0 for r in radii],
					'avg_trans_herit':[0.0 for r in radii],
					'counts':[0.0 for td in radii]},
				'tss_dist':{'avg_cis_trans_var_ratio':[0.0 for td in tss_dists],
					'avg_cis_herit':[0.0 for td in tss_dists],
					'avg_trans_herit':[0.0 for td in tss_dists],
					'counts':[0.0 for td in tss_dists]}}

	heritabilities = []
	transformations = []
	shapiro_wilk_pvals = []
	tair_ids = []
	pval_infl_dict = {}
	dist_min_pval_dict = {}
	# The bins are tuples and are compared (tuple ordering) against
	# tuple(info_dict[mm]['dist_to_min_pval']) below.
	# NOTE(review): presumably (chromosome-related flag, distance) pairs - confirm
	# against the code that writes 'dist_to_min_pval'.
	distance_bins = [(0, 5000), (0, 10000), (0, 25000), (0, 50000), (0, 100000), (1, -1), (6, -1)]
	radius_bins = [0, 1000, 5000, 10000, 25000, 50000, 100000]
	bonf_sign_bin_dict = {}
	res_dict = {}
	sign_count = {}
	# Per mapping method counters.
	for mm in ['EX', 'LM', 'KW']:
		pval_infl_dict[mm] = {'kolmogorov_smirnov':[], 'median_pvals':[]}
		dist_min_pval_dict[mm] = {}
		for bin in distance_bins:
			dist_min_pval_dict[mm][bin] = 0
		bonf_sign_bin_dict[mm] = {}
		for bin in radius_bins:
			bonf_sign_bin_dict[mm][bin] = {'count':0.0, 'total':0.0}
		sign_count[mm] = 0

	# Per step-wise criterion counters.
	cofactor_count_dict = {}
	for criteria in ['ebics', 'mbonf', 'min_cof_ppa']:
		cofactor_count_dict[criteria] = {'num_cofactor_list':[], 'bin_counts':sp.zeros(9),
						'num_cis_cofactor_list':[], 'num_found':0}

	# Output: one pickled result dictionary per mapping method.
	pickle_file_dict = {}
	for mm in ['EX', 'LM', 'KW']:
		pickle_file_dict[mm] = {}
		pickle_file_dict[mm]['file_name'] = '%sresults_%s_mac%d.pickled' % (file_prefix, mm, mac_threshold)
		pickle_file_dict[mm]['res_dict'] = {}

	pids = phed.get_pids()
	for i, pid in enumerate(pids):
		tair_id = phed.get_name(pid)
		# The chromosome number is taken from the third character of the
		# phenotype name (e.g. a name like 'AT1G21390' gives 1).
		chrom = int(tair_id[2])
		curr_file_prefix = '%schr_%d/rna_seq_%s_%dC_mac%d_pid%d_%s' % \
					(file_prefix, chrom, run_id, temperature, mac_threshold, pid, tair_id)
		info_file_name = '%s_info.pickled' % curr_file_prefix
		# At this point res_dict holds the result file names; they are replaced
		# by gr.Result objects below once the files are known to exist.
		for mm in ['EX', 'LM', 'KW']:
			res_dict[mm] = '%s_%s_.pvals' % (curr_file_prefix, mm)
		# Random sub-sampling of the phenotypes for debugging.
		if random.random() > debug_filter:
			continue
		if os.path.isfile(info_file_name) and os.path.isfile(res_dict['EX'] + ".pickled") \
				and os.path.isfile(res_dict['LM'] + ".pickled") and os.path.isfile(res_dict['KW'] + ".pickled"):
			print 'Loading info file: %s' % info_file_name
			num_genes += 1
			# NOTE(review): the file object is never closed explicitly, and
			# unpickling is only safe for trusted files.
			info_dict = cPickle.load(open(info_file_name)) #Loading the info dict
			for mm in ['EX', 'LM', 'KW']:
				res_dict[mm] = gr.Result(res_dict[mm]) #Loading the result

			#Saving some basic statistics
			transformations.append(info_dict['transformation_type'])
			shapiro_wilk_pvals.append(info_dict['transformation_shapiro_pval'])
			heritabilities.append(info_dict['pseudo_heritability'])

			#cis vs. trans stuff
			# 'perc_var2' is summed as the cis (local) part and 'perc_var1' as the
			# trans (global) part; both are weighted with 'pseudo_heritability1'.
			cvt_dict = info_dict['CVT']
			for r_i, r in enumerate(radii):
				if cvt_dict['radius'][r] != None:
					pvg = cvt_dict['radius'][r]['perc_var1']
					pvl = cvt_dict['radius'][r]['perc_var2']
					herit = cvt_dict['radius'][r]['pseudo_heritability1']
					cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] += pvl / (pvl + pvg)
					cvt_summary_dict['radius']['avg_cis_herit'][r_i] += pvl * herit
					cvt_summary_dict['radius']['avg_trans_herit'][r_i] += pvg * herit
					cvt_summary_dict['radius']['counts'][r_i] += 1.0

			# Same as above, but only for genes with pseudo-heritability > 0.05.
			for r_i, r in enumerate(radii):
				if cvt_dict['radius'][r] != None:
					herit = cvt_dict['radius'][r]['pseudo_heritability1']
					if herit > 0.05:
						pvg = cvt_dict['radius'][r]['perc_var1']
						pvl = cvt_dict['radius'][r]['perc_var2']
						cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] += pvl / (pvl + pvg)
						cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] += pvl * herit
						cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] += pvg * herit
						cvt_summary_dict['radius_herit']['counts'][r_i] += 1.0

			# Same sums, by distance upstream of the TSS.
			for td_i, td in enumerate(tss_dists):
				if cvt_dict['tss_upstream'][td] != None:
					pvg = cvt_dict['tss_upstream'][td]['perc_var1']
					pvl = cvt_dict['tss_upstream'][td]['perc_var2']
					herit = cvt_dict['tss_upstream'][td]['pseudo_heritability1']
					cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] += pvl / (pvl + pvg)
					cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] += pvl * herit
					cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] += pvg * herit
					cvt_summary_dict['tss_dist']['counts'][td_i] += 1.0



			tair_ids.append(tair_id)
			for mm in ['EX', 'LM', 'KW']:
				pval_infl_dict[mm]['kolmogorov_smirnov'].append(info_dict[mm]['kolmogorov_smirnov']['D'])
				pval_infl_dict[mm]['median_pvals'].append(info_dict[mm]['pval_median'])
				dist_min_pval = tuple(info_dict[mm]['dist_to_min_pval'])
				# Threshold 1 / (20 * n) equals 0.05 / n for n scores.
				if res_dict[mm].min_score() < 1 / (20.0 * res_dict[mm].num_scores()):
					sign_count[mm] += 1
					# Count the result in the first bin it fits into.
					for bin in distance_bins:
						if dist_min_pval <= bin:
							dist_min_pval_dict[mm][bin] += 1
							break

				# Same 0.05 / n threshold, applied per radius bin.
				for bin in radius_bins:
					pval = info_dict[mm]['bin_dict'][bin]['min_pval']
					num_snps = info_dict[mm]['bin_dict'][bin]['num_snps']
					if num_snps > 0:
						bonf_sign_bin_dict[mm][bin]['total'] += 1
						if pval < 1.0 / (20 * num_snps):
							bonf_sign_bin_dict[mm][bin]['count'] += 1

			#Stepwise stuff
			for criteria in ['ebics', 'mbonf', 'min_cof_ppa']:
				num_cofactors = len(info_dict['SW'][criteria]['cofactors'])
				cofactor_count_dict[criteria]['num_cofactor_list'].append(num_cofactors)
				if num_cofactors > 0:
					cofactor_count_dict[criteria]['num_found'] += 1
					cofactor_count_dict[criteria]['bin_counts'] += sp.array(info_dict['SW'][criteria]['bin_counts'])
					# NOTE(review): entry 2 of 'bin_counts' is used as the number of
					# cis cofactors - confirm which distance bin index 2 stands for.
					cofactor_count_dict[criteria]['num_cis_cofactor_list'].append(info_dict['SW'][criteria]['bin_counts'][2])


			#Pre-process the results..
			for mm in ['EX', 'LM', 'KW']:
				res = res_dict[mm]
				#Trim results
				res.neg_log_trans()
				# NOTE(review): the thresholds passed are 3 for 'EX' and 4 for the other
				# methods (on the -log10 scale, assuming filter_attr drops smaller
				# scores); the message printed below mentions 10^-5 instead.
				if mm == 'EX':
					res.filter_attr('scores', 3) #Threshold 3 for EX
				else:
					res.filter_attr('scores', 4) #Threshold 4 for LM and KW
				if res.num_scores() == 0:
					print "Skipping file since nothing is below 10^-5"
					continue

				gene_d = gene_dict[tair_id]
				avg_g_pos = (gene_d['start_pos'] + gene_d['end_pos']) / 2.0
				chrom = int(gene_d['chromosome']) #Current gene chromosome


				#Prepare for plotting results.. x,y style, where gene is x, and y is p-values
				chrom_pos_score_dict = res.get_chrom_score_pos_dict()

				# Distance from the gene midpoint to the closest remaining SNP on the
				# gene's chromosome, for increasingly strict thresholds.  The same res
				# object is filtered again in every iteration.
				dist_dict = {}
				for score_threshold in [5, 6, 7]: #negative log10 thresholds.
					if len(res.snp_results['scores']) == 0:
						dist_dict[score_threshold] = -2 #No results
					else:
						res.filter_attr('scores', score_threshold)
						if len(res.snp_results['scores']) == 0:
							dist_dict[score_threshold] = -2 #No results
						else:
							cps_dict = res.get_chrom_score_pos_dict()
							pos_list = cps_dict[chrom]['positions']
							if len(pos_list) > 0:
								distances = sp.absolute(sp.array(pos_list) - avg_g_pos)
								d_i = sp.argmin(distances)
								dist_dict[score_threshold] = distances[d_i] #Min distance.
							else:
								dist_dict[score_threshold] = -1 #Different chromosome

				# The results are keyed by (gene chromosome, gene midpoint).
				pickle_file_dict[mm]['res_dict'][(chrom, avg_g_pos)] = {'tair_id':tair_id,
							'chrom_pos_score':chrom_pos_score_dict, 'dist_dict':dist_dict,
							'pid':pid}
				print dist_dict
		else:
			# res_dict still holds the file names in this branch.
			print "Didn't find file: %s or %s" % (info_file_name, res_dict['EX'] + ".pickled")

	# NOTE(review): the output files are not closed explicitly.
	for mm in ['EX', 'LM', 'KW']:
		cPickle.dump(pickle_file_dict[mm]['res_dict'], open(pickle_file_dict[mm]['file_name'], 'wb'), protocol=2)


	# Turn the sums into averages.
	# NOTE(review): the counts are Python floats that start at 0.0, so each of the
	# three loops below raises ZeroDivisionError if a bin received no data.
	for r_i, r in enumerate(radii):
		r_counts = cvt_summary_dict['radius']['counts'][r_i]
		cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] = \
			cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] / r_counts
		cvt_summary_dict['radius']['avg_cis_herit'][r_i] = \
			cvt_summary_dict['radius']['avg_cis_herit'][r_i] / r_counts
		cvt_summary_dict['radius']['avg_trans_herit'][r_i] = \
			cvt_summary_dict['radius']['avg_trans_herit'][r_i] / r_counts


	for r_i, r in enumerate(radii):
		r_counts = cvt_summary_dict['radius_herit']['counts'][r_i]
		cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] = \
			cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] / r_counts
		cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] = \
			cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] / r_counts
		cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] = \
			cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] / r_counts


	for td_i, td in enumerate(tss_dists):
		td_counts = cvt_summary_dict['tss_dist']['counts'][td_i]
		cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] = \
			cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] / td_counts
		cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] = \
			cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] / td_counts
		cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] = \
			cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] / td_counts



	# NOTE(review): env is indexed directly here, so it must be the settings
	# dictionary itself (not a module holding it) - confirm against the imports.
	# Some file names below are appended with a leading underscore and some
	# without, so the latter run straight on from the call method id.
	results_prefix = env['results_dir'] + 'RNAseq_summary_%dC_cm%d' % (temperature, call_method_id)

	# Average cis share of the genetic variance, by radius.  The tick labels are
	# the radii in kb and are hard-coded to the 8 entries of radii.
	pylab.figure()
	pylab.plot(cvt_summary_dict['radius']['avg_cis_trans_var_ratio'])
	pylab.ylabel('Avg. perc. of cis genetic var.')
	pylab.xlabel('Dist. from gene (kb)')
	pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
	pylab.savefig(results_prefix + '_avg_perc_cis_gen_var_rad.png')
	pylab.clf()

	# Same, by distance upstream of the TSS (7 entries of tss_dists).
	pylab.figure()
	pylab.plot(cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'])
	pylab.ylabel('Avg. perc. of cis genetic var.')
	pylab.xlabel('Dist. upstream from gene TSS (kb)')
	pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
	pylab.savefig(results_prefix + '_avg_perc_cis_gen_var_td.png')
	pylab.clf()

#	pylab.figure()
#	pylab.plot(cvt_summary_dict['tss_dist']['avg_cis_herit'])
#	pylab.ylabel('Avg. cis heritability')
#	pylab.xlabel('Dist. upstream from gene TSS (kb)')
#	pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
#	pylab.savefig(results_prefix + 'avg_cis_herit_td.png')
#	pylab.clf()
#
#
#	pylab.figure()
#	pylab.plot(cvt_summary_dict['tss_dist']['avg_trans_herit'])
#	pylab.ylabel('Avg. remaining heritability')
#	pylab.xlabel('Dist. upstream from gene TSS (kb)')
#	pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
#	pylab.savefig(results_prefix + 'avg_trans_herit_td.png')
#	pylab.clf()


#	pylab.figure()
#	pylab.plot(cvt_summary_dict['radius']['avg_trans_herit'])
#	pylab.ylabel('Avg. remaining heritability')
#	pylab.xlabel('Dist. from gene (kb)')
#	pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
#	pylab.savefig(results_prefix + 'avg_trans_herit_rad.png')
#	pylab.clf()
#
#	pylab.figure()
#	pylab.plot(cvt_summary_dict['radius']['avg_cis_herit'])
#	pylab.ylabel('Avg. cis heritability')
#	pylab.xlabel('Dist. from gene (kb)')
#	pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
#	pylab.savefig(results_prefix + 'avg_cis_herit_rad.png')
#	pylab.clf()

	# Stacked areas, drawn back to front: full height (labelled 'Error'), then
	# cis + trans heritability, then the cis part on top.  All genes, by radius.
	tot_herit = sp.array(cvt_summary_dict['radius']['avg_cis_herit']) + \
		sp.array(cvt_summary_dict['radius']['avg_trans_herit'])
	cis_herit = sp.array(cvt_summary_dict['radius']['avg_cis_herit'])
	pylab.figure(figsize=(10, 6))
	pylab.axes([0.06, 0.08, 0.92, 0.90])
	pylab.fill_between([0, 7], 0, 1, color='#DD3333', alpha=0.8, label='Error')
	pylab.fill_between(sp.arange(8), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance')
	pylab.fill_between(sp.arange(8), 0, cis_herit, color='#2255AA', \
				alpha=0.8, label='Heritable variance (cis)')
	pylab.ylabel('Average partition of variance')
	pylab.xlabel('Dist. from gene (kb)')
	pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
	pylab.legend(loc=1, ncol=3, shadow=True)
	pylab.axis([0, 7, 0, 1])
	pylab.savefig(results_prefix + 'avg_herit_rad.png')

	# Same figure for the genes with pseudo-heritability > 0.05.
	tot_herit = sp.array(cvt_summary_dict['radius_herit']['avg_cis_herit']) + \
		sp.array(cvt_summary_dict['radius_herit']['avg_trans_herit'])
	cis_herit = sp.array(cvt_summary_dict['radius_herit']['avg_cis_herit'])
	pylab.figure(figsize=(10, 6))
	pylab.axes([0.06, 0.08, 0.92, 0.90])
	pylab.fill_between([0, 7], 0, 1, color='#DD3333', alpha=0.8, label='Error')
	pylab.fill_between(sp.arange(8), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance')
	pylab.fill_between(sp.arange(8), 0, cis_herit, color='#2255AA', \
				alpha=0.8, label='Heritable variance (cis)')
	pylab.ylabel('Average partition of variance')
	pylab.xlabel('Dist. from gene (kb)')
	pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
	pylab.legend(loc=1, ncol=3, shadow=True)
	pylab.axis([0, 7, 0, 1])
	pylab.savefig(results_prefix + 'avg_herit_2_rad.png')



	# Same figure, by distance upstream of the TSS.
	tot_herit = sp.array(cvt_summary_dict['tss_dist']['avg_cis_herit']) + \
		sp.array(cvt_summary_dict['tss_dist']['avg_trans_herit'])
	cis_herit = sp.array(cvt_summary_dict['tss_dist']['avg_cis_herit'])
	pylab.figure(figsize=(10, 6))
	pylab.axes([0.06, 0.08, 0.92, 0.90])
	pylab.fill_between([0, 6], 0, 1, color='#DD3333', alpha=0.8, label='Error')
	pylab.fill_between(sp.arange(7), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance')
	pylab.fill_between(sp.arange(7), 0, cis_herit, color='#2255AA', \
				alpha=0.8, label='Heritable variance (cis)')
	pylab.ylabel('Average partition of variance')
	pylab.xlabel('Dist. upstream from gene TSS (kb)')
	pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
	pylab.legend(loc=1, ncol=3, shadow=True)
	pylab.axis([0, 6, 0, 1])
	pylab.savefig(results_prefix + 'avg_herit_td.png')



	# Histogram of the pseudo-heritabilities.
	pylab.figure()
	pylab.hist(heritabilities, bins=20, alpha=0.7)
	pylab.xlabel('Pseudo-heritability')
	pylab.xlim((-0.025, 1.025))
	pylab.savefig(results_prefix + '_herits_hist.png')
	pylab.clf()

	# Box plots of the p-value inflation statistics, one box per mapping method.
	ks_list = []
	pm_list = []
	for mm in ['EX', 'LM', 'KW']:
		ks_list.append(pval_infl_dict[mm]['kolmogorov_smirnov'])
		pm_list.append(pval_infl_dict[mm]['median_pvals'])

	png_file_name = results_prefix + '_kolmogorov_smirnov_boxplot.png'
	pylab.figure()
	pylab.boxplot(ks_list)
	pylab.axhline(0, color='k', alpha=0.6, ls='-.')
	pylab.xticks(range(1, 4), ['EX', 'LM', 'KW'])
	pylab.ylabel('Kolmogorov-Smirnov statistic D.')
	pylab.savefig(png_file_name)
	pylab.clf()


	png_file_name = results_prefix + '_median_pvals_boxplot.png'
	pylab.figure()
	pylab.boxplot(pm_list)
	pylab.axhline(0, color='k', alpha=0.6, ls='-.')
	pylab.xticks(range(1, 4), ['EX', 'LM', 'KW'])
	pylab.ylabel('Median p-value bias')
	pylab.savefig(png_file_name)
	pylab.clf()


	# Grouped bar chart: x_positions is shifted by one bar width per mapping
	# method, which is why the ticks are placed at x_positions - 3 * width / 2.
	# NOTE(review): confirm that the dtype string 'd64' is accepted by the
	# installed scipy/numpy.
	x_positions = sp.arange(len(distance_bins), dtype='d64')
	width = 0.25
	png_file_name = results_prefix + '_dist_min_pval_hist.png'
	pylab.axes([0.08, 0.2, 0.91, 0.75])
	for mm, color in zip(['EX', 'LM', 'KW'], ['b', 'c', 'g']):
		l = [dist_min_pval_dict[mm][bin] for bin in distance_bins]
		# NOTE(review): tot_sum is 0 (ZeroDivisionError) when the method had no
		# significant result.
		tot_sum = sum(l)
		l = map(lambda x: x / float(tot_sum), l)
		pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=mm)
		x_positions += width


	pylab.ylabel('Frequency')
	pylab.xticks(x_positions - 3 * width / 2.0, (r'$d \leq 5$', r'$5< d \leq 10$', r'$10< d \leq 25$', \
						r'$25< d \leq 50$', r'$50< d \leq 100$', r'$d>100$', \
						'Other chrom.'), rotation='45')
	pylab.xlabel('Distance $d$ (kb) to the smallest p-value from the gene.')
	pylab.xlim((-0.25, len(distance_bins)))
	pylab.legend(loc=2)
	pylab.savefig(png_file_name)
	pylab.clf()


	# Fraction of genes with a significant result per radius bin, plus one extra
	# bar for the whole genome (sign_count / num_genes).
	# NOTE(review): 'total' and num_genes can both be zero here.
	x_positions = sp.arange(len(radius_bins) + 1, dtype='d64')
	width = 0.25
	png_file_name = results_prefix + 'bonf_sign_bin_hist.png'
	pylab.axes([0.08, 0.22, 0.91, 0.73])
	for mm, color in zip(['EX', 'LM', 'KW'], ['b', 'c', 'g']):
		l = [bonf_sign_bin_dict[mm][bin]['count'] / bonf_sign_bin_dict[mm][bin]['total'] for bin in radius_bins]
		l.append(sign_count[mm] / float(num_genes))
		pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=mm)
		x_positions += width


	pylab.ylabel('Fraction of sign. results')
	pylab.xticks(x_positions - 3 * width / 2.0, ('Within gene', r'$d \leq 1$', r'$d \leq 5$', \
						r'$d \leq 10$', r'$d \leq 25$', r'$d \leq 50$', \
						r'$d \leq 100$', 'Whole genome'), rotation='45')
	pylab.xlabel(r'Among SNPs with distance $d$ (kb) from gene.')
	pylab.xlim((-0.25, len(radius_bins) + 1))
	pylab.legend(loc=2)
	pylab.savefig(png_file_name)
	pylab.clf()


	# Number of genes per cofactor count, one bar group per step-wise criterion.
	# The counts are padded with zeros to at least six bars (0..5 cofactors).
	png_file_name = results_prefix + 'cofactor_count_hist.png'
	x_positions = sp.arange(6, dtype='d64')
	width = 0.25
	for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']):
		bin_counts = list(sp.bincount(cofactor_count_dict[criteria]['num_cofactor_list']))
		while len(bin_counts) < 6:
			bin_counts.append(0)
		pylab.bar(x_positions, bin_counts, width, color=color, alpha=0.7, label=criteria)
		x_positions += width
	pylab.xlabel('Number of cofactor SNPs')
	pylab.ylabel('Number of genes')
	pylab.xticks(x_positions - 3 * width / 2.0, ('0', '1', '2', '3', '4', '5'))
	pylab.legend(loc=1)
	pylab.xlim((-0.2, 6))
	pylab.savefig(png_file_name)
	pylab.clf()


	# Same for the cis cofactors; width is reused from the previous figure.
	png_file_name = results_prefix + 'cis_cofactor_count_hist.png'
	x_positions = sp.arange(6, dtype='d64')
	for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']):
		bin_counts = list(sp.bincount(cofactor_count_dict[criteria]['num_cis_cofactor_list']))
		while len(bin_counts) < 6:
			bin_counts.append(0)
		pylab.bar(x_positions, bin_counts, width, color=color, alpha=0.7, label=criteria)
		x_positions += width
	pylab.xlabel('Number of cis cofactor SNPs')
	pylab.ylabel('Number of genes')
	pylab.xticks(x_positions - 3 * width / 2.0, ('0', '1', '2', '3', '4', '5'))
	pylab.legend(loc=1)
	pylab.xlim((-0.2, 6))
	pylab.savefig(png_file_name)
	pylab.clf()


	# Cofactor distance bins, normalised by the number of genes with at least
	# one cofactor and plotted in reversed bin order.
	# NOTE(review): 'num_found' is 0 for a criterion that never found a cofactor.
	png_file_name = results_prefix + 'cofactor_bin_count_hist.png'
	x_positions = sp.arange(9, dtype='d64')
	width = 0.25
	pylab.axes([0.08, 0.2, 0.91, 0.75])
	for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']):
		cofactor_count_dict[criteria]['bin_counts'] = \
			cofactor_count_dict[criteria]['bin_counts'] / cofactor_count_dict[criteria]['num_found']
		l = list(cofactor_count_dict[criteria]['bin_counts'])
		l.reverse()
		pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=criteria)
		x_positions += width
	pylab.ylabel('Fraction all genes with cofactors.')
	pylab.xlabel(r'Distance $d$ (kb) to cofactor from gene.')
	pylab.xticks(x_positions - 3 * width / 2.0, ('Within gene', r'$1\geq d$', r'$5\geq d$', r'$10\geq d$', \
						r'$25\geq d$', r'$50\geq d$', r'$100\geq d$', \
						r'$d>100$', 'Other chrom.'), rotation='45')
	pylab.xlim((-0.2, 9))
	pylab.legend(loc=2)
	pylab.savefig(png_file_name)
	pylab.clf()
예제 #8
0
def plot(temperature=10, call_method_id=75, mapping_method='EX', mac_threshold=15, min_score=5,
		near_const_filter=20, data_format='binary', plot_data=True):
	#Load in chromosome dict..

	#file_prefix = '/srv/lab/data/rna_seq_062911/%dC/cm_%d/' % (temperature, call_method_id)
	file_prefix = '/srv/lab/data/rna_seq_083011/%dC/cm_%d/' % (temperature, call_method_id)

	results_dict_file = '%sresults_%s_mac%d.pickled' % (file_prefix, mapping_method, mac_threshold)
	res_dict = cPickle.load(open(results_dict_file))

	phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature)
	phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
	phed.filter_near_const_phens(near_const_filter)
	phed.convert_to_averages()
	num_traits = phed.num_traits()
	pids = phed.phen_ids
	sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=0.01)
	indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)  #All phenotypes are ordered the same way, so we pick the first one.
	phed.filter_ecotypes(indices_to_keep, pids=pids)

	chrom_dict = {}
	for x_chrom in [1, 2, 3, 4, 5]:
		for y_chrom in [1, 2, 3, 4, 5]:
			chrom_dict[(x_chrom, y_chrom)] = {'scores':[], 'x_positions':[], 'y_positions':[],
								'tair_ids':[], 'r2':[], 'mac':[]}
	scores = []
	for x_chrom, x_pos in res_dict:
		d = res_dict[(x_chrom, x_pos)]
		tair_id = d['tair_id']
		for y_chrom in [1, 2, 3, 4, 5]:
			cps_d = d['chrom_pos_score'][y_chrom]
			for i in range(len(cps_d['scores'])):
				s = cps_d['scores'][i]
				if s > min_score:
					if s > 25:
						s = 25
					scores.append(s)
					chrom_dict[(x_chrom, y_chrom)]['scores'].append(s)
					chrom_dict[(x_chrom, y_chrom)]['tair_ids'].append(tair_id)
					chrom_dict[(x_chrom, y_chrom)]['x_positions'].append(x_pos)
					chrom_dict[(x_chrom, y_chrom)]['y_positions'].append(cps_d['positions'][i])

	#Write chrom_dict to file..
	if not plot_data:
		for x_chrom in [1, 2, 3, 4, 5]:
			for y_chrom in [1, 2, 3, 4, 5]:
				file_name = file_prefix + 'result_plots/pvalues_chrom%d_chrom%d_%s_min%d.txt' % (x_chrom, y_chrom, mapping_method, min_score)
				print 'Writing to file:', file_name
				with open(file_name, 'w') as f:
					d = chrom_dict[(x_chrom, y_chrom)]
					f.write('x_position, y_position, score, tair_id\n')
					l = zip(d['x_positions'], d['y_positions'], d['scores'], d['tair_ids'])
					l.sort()
					for t in l:
						f.write('%d,%d,%f,%s\n' % t)



	chrom_sizes = [30425061, 19694800, 23456476, 18578714, 26974904]
	cum_chrom_sizes = [sum(chrom_sizes[:i]) for i in range(5)]
	tot_num_bases = float(sum(chrom_sizes))
	rel_chrom_sizes = map(lambda x: 0.925 * (x / tot_num_bases), chrom_sizes)
	rel_cum_chrom_sizes = map(lambda x: 0.925 * (x / tot_num_bases), cum_chrom_sizes)
	for i in range(5):
		rel_cum_chrom_sizes[i] = rel_cum_chrom_sizes[i] + 0.02 + 0.01 * i

	chromosome_ends = {1:30.425061, 2:19.694800, 3:23.456476, 4:18.578714, 5:26.974904}
	print rel_chrom_sizes, rel_cum_chrom_sizes

	#Filter data..

	#Now plot data!!
	if plot_data:
		alpha = 0.8
		linewidths = 0
		vmin = min_score
		f = pylab.figure(figsize=(40, 35))
		chromosomes = [1, 2, 3, 4, 5]
		plot_file_name = file_prefix + 'result_plots/pvalues_%s_min%d.png' % (mapping_method, min_score)
		label = '$-log_{10}$(p-value)'
		vmax = max(scores)

		for yi, chr2 in enumerate(chromosomes):
			for xi, chr1 in enumerate(chromosomes):

				l = chrom_dict[(chr1, chr2)]['scores']
				if len(l) == 0:
					continue
				ax = f.add_axes([0.96 * (rel_cum_chrom_sizes[xi] + 0.01), rel_cum_chrom_sizes[yi] - 0.02,
						0.96 * (rel_chrom_sizes[xi]), rel_chrom_sizes[yi] ])
				ax.spines['right'].set_visible(False)
				ax.spines['bottom'].set_visible(False)
				#ax.tick_params(fontsize='x-large')
				if xi > 0:
					ax.spines['left'].set_visible(False)
					ax.yaxis.set_visible(False)
				else:
					ax.yaxis.set_ticks_position('left')
					ax.set_ylabel('Chromosome %d (Mb)' % chr2, fontsize='x-large')
				if yi < 4:
					ax.spines['top'].set_visible(False)
					ax.xaxis.set_visible(False)
				else:
					ax.xaxis.set_ticks_position('top')
					ax.xaxis.set_label_position('top')
					ax.set_xlabel('Chromosome %d (Mb)' % chr1, fontsize='x-large')
					#ax.set_xlabel('Chromosome %d' % chr1)

				#l = -sp.log10(l)
				#l = l.tolist()
				l_zxy = zip(l, chrom_dict[(chr1, chr2)]['x_positions'],
					chrom_dict[(chr1, chr2)]['y_positions'])
				l_zxy.sort()
				l = map(list, zip(*l_zxy))
				zs = l[0]
				xs = map(lambda x: x / 1000000.0, l[1])
				ys = map(lambda x: x / 1000000.0, l[2])

				scatter_plot = ax.scatter(xs, ys, c=zs, alpha=alpha, linewidths=linewidths, vmin=vmin,
							vmax=vmax)
				ax.axis([-0.025 * chromosome_ends[chr1], 1.025 * chromosome_ends[chr1],
					- 0.025 * chromosome_ends[chr2], 1.025 * chromosome_ends[chr2]])

		cax = f.add_axes([0.965, 0.7, 0.01, 0.2])
		cb = pylab.colorbar(scatter_plot, cax=cax)
		cb.set_label(label, fontsize='xx-large')
		#cb.set_tick_params(fontsize='x-large')
		f.text(0.005, 0.47, 'Associated SNP position', size='xx-large', rotation='vertical')
		f.text(0.47, 0.988, 'Expressed gene position', size='xx-large')
		print 'Saving figure:', plot_file_name
		f.savefig(plot_file_name, format='png')
예제 #9
0
def run_gwas(file_prefix, phen_file, start_i, stop_i, temperature, mac_threshold=15, filter_threshold=0.02,
		call_method_id=79, data_format='diploid_int', debug_filter=1.0, near_const_filter=20):
	"""
	GWAS
	"""
	phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
	phed.filter_near_const_phens(near_const_filter)
	phed.convert_to_averages()
	num_traits = phed.num_traits()
	pids = phed.phen_ids[start_i :stop_i]
	sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=debug_filter)
	indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)  #All phenotypes are ordered the same way, so we pick the first one.
	phed.filter_ecotypes(indices_to_keep, pids=pids)
	print len(sd.accessions)
	K = sd.get_ibs_kinship_matrix()
	#K = dp.load_kinship(call_method_id=call_method_id, data_format=data_format, sd=sd, method='ibs')

	sd.filter_mac_snps(mac_threshold)
	snps = sd.getSnps()
	positions = sd.getPositions()
	chromosomes = sd.get_chr_list()
	r = sd.get_mafs()
	macs = r['mafs']
	mafs = r['marfs']

	print 'In total there are %d SNPs to be mapped.' % len(snps)
	gene_dict = dp.parse_tair_gff_file()#_load_genes_list_('rna_seq_031311_%sC' % temperature)
	for i, pid in enumerate(pids):
		if not pid in phed.phen_ids: continue
		gene_tair_id = phed.get_name(pid)
#		exons = []
#		for isoform in d:
#			for exon in isoform['exons']:
#				exons.append((d['chromosome'], exon['start_pos'], exon['end_pos']))


		d = gene_dict[gene_tair_id]
		gene_strand = d['strand']
		try:
			chrom = int(d['chromosome'])
		except Exception:
			raise
		gene = gwaResults.Gene(chromosome=int(d['chromosome']), startPos=d['start_pos'],
				endPos=d['end_pos'], name=gene_tair_id, description=None, dbRef=gene_tair_id,
				tairID=gene_tair_id)
		print i, pid, gene
		curr_file_prefix = '%s_mac%d_pid%d_%s' % (file_prefix, mac_threshold, pid, gene_tair_id)

		trans_type, shapiro_pval = phed.most_normal_transformation(pid)
		print 'Most normal transformation was: %s' % trans_type
		#trans_type = 'None'
		summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':shapiro_pval}
		#summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':0}


		print'Applying Kruskal-Wallis'
		phen_vals = phed.get_values(pid)
		res = util.kruskal_wallis(snps, phen_vals)
		pvals = res['ps'].tolist()
		kw_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes)
		print 'Summarizing KW'
		summary_dict['KW'] = kw_res.get_gene_analysis(gene)
		summary_dict['KW']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps'])
		summary_dict['KW']['pval_median'] = agr.calc_median(res['ps'])


		print 'Applying LM'
		res = lm.linear_model(snps, phen_vals)
		pvals = res['ps'].tolist()
		perc_var_expl = res['var_perc'].tolist()
		lm_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes,
				perc_var_expl=perc_var_expl)
		print 'Summarizing LM'
		summary_dict['LM'] = lm_res.get_gene_analysis(gene)
		summary_dict['LM']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps'])
		summary_dict['LM']['pval_median'] = agr.calc_median(res['ps'])


		print 'Applying EX Stepwise'
		snp_priors = sd.get_cand_genes_snp_priors([gene])
		ex_sw_res = lm.emmax_step_wise(phen_vals, K, macs=macs, mafs=mafs, positions=positions,
					chromosomes=chromosomes, snps=snps, num_steps=5, cand_gene_list=[gene],
					with_qq_plots=False, log_qq_max_val=6.0, save_pvals=True, snp_priors=snp_priors)
		print 'Summarizing the step-wise mixed model'
		pvals = ex_sw_res['first_emmax_res']['ps'].tolist()
		perc_var_expl = ex_sw_res['first_emmax_res']['var_perc'].tolist()
		ex_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes,
				perc_var_expl=perc_var_expl)
		summary_dict['EX'] = ex_res.get_gene_analysis(gene)
		summary_dict['pseudo_heritability'] = ex_sw_res['step_info_list'][0]['pseudo_heritability']
		summary_dict['EX']['kolmogorov_smirnov'] = agr.calc_ks_stats(ex_sw_res['first_emmax_res']['ps'])
		summary_dict['EX']['pval_median'] = agr.calc_median(ex_sw_res['first_emmax_res']['ps'])

		#Does the linear mixed model fit the data better?
		summary_dict['MM_LRT'] = lm.mm_lrt_test(phen_vals, K)

		#FINISH summarizing the stepwise!!!
		summarize_stepwise(summary_dict, gene, ex_sw_res['step_info_list'], ex_sw_res['opt_dict'])

		cvt_dict = {'radius':{}, 'tss_upstream':{}}
		print 'Comparing cis vs. trans kinship'
		#Check 1 mb, 200kb, 100kb, 50kb, 20kb, 10kb, 2kb, 0kb
		for radius in [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]:
			print radius
			r_start_pos = max(gene.startPos - radius, 0)
			r_end_pos = gene.endPos + radius
			d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)],
							kinship_method='ibs', global_kinship=K)
			reg_k = d['regional_k']
			glob_k = d['global_k']
			if reg_k != None:
				cvt_dict['radius'][radius] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K)
			else:
				cvt_dict['radius'][radius] = None
			print cvt_dict['radius'][radius]

		#Check TSS, 100kb, 50kb,25kb, 10kb,5kb,0kb, (all upstream)
		for dist in [200000, 100000, 50000, 25000, 10000, 5000, 1000]:
			print dist, gene_strand
			if gene_strand == '+':
				r_start_pos = max(gene.startPos - dist, 0)
				r_end_pos = gene.startPos
			else:
				r_start_pos = gene.endPos
				r_end_pos = gene.endPos + dist
			d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)],
							kinship_method='ibs', global_kinship=K)
			reg_k = d['regional_k']
			glob_k = d['global_k']
			if reg_k != None:
				cvt_dict['tss_upstream'][dist] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K)
			else:
				cvt_dict['tss_upstream'][dist] = None
			print cvt_dict['tss_upstream'][dist]

		summary_dict['CVT'] = cvt_dict

		#Write info to file..
		cPickle.dump(summary_dict, open(curr_file_prefix + '_info.pickled', 'w'), protocol=2)

		f_prefix = curr_file_prefix + '_hist'
		phed.plot_histogram(pid, title='Gene expressions for %s' % gene_tair_id,
				png_file=f_prefix + '.png', p_her=summary_dict['pseudo_heritability'],
				x_label='RNA seq expression levels (%s transformed)' % trans_type)
		#Plot GWAs...
		for res, method_name in [(kw_res, 'KW'), (lm_res, 'LM'), (ex_res, 'EX')]:
			res.filter_percentile(filter_threshold, reversed=True)
			res.write_to_file('%s_%s_.pvals' % (curr_file_prefix, method_name), only_pickled=True)
			if ex_res.min_score() < 10e-10:
				#print [cg.tairID for cg in cgs]
				f_prefix = '%s_%s_manhattan' % (curr_file_prefix, method_name)
				res.plot_manhattan(png_file=f_prefix + '.png', percentile=0, cand_genes=[gene],
						plot_bonferroni=True, neg_log_transform=True)
예제 #10
0
def plot_gw_r2_decay(file_prefix,
                     num_random_xs=200,
                     max_dist=1000000,
                     call_method_id=78,
                     mac_filter=15,
                     debug_filter=1):
    """
	Plots r2 decay on the genome-wide scale
	"""
    dtype = 'single'  #To increase matrix multiplication speed... using 32 bits.
    sd = dp.load_snps_call_method(call_method_id=call_method_id,
                                  debug_filter=debug_filter,
                                  min_mac=mac_filter)
    #sd.filter_mac_snps(mac_filter)
    h_inverse_matrix_file = env[
        'data_dir'] + 'snp_cov_mat_h_inv_cm%d.pickled' % (call_method_id)
    if not os.path.isfile(h_inverse_matrix_file):
        K = sd.get_snp_cov_matrix()
        H_sqrt = lm.cholesky(K)
        H_sqrt_inv = (H_sqrt).I
        with file(h_inverse_matrix_file, 'wb') as f:
            cPickle.dump(H_sqrt_inv, f, protocol=2)
    else:
        with file(h_inverse_matrix_file) as f:
            H_sqrt_inv = cPickle.load(f)

    cps_list = sd.getChrPosSNPList()
    x_cps = random.sample(cps_list, num_random_xs)
    y_cps = cps_list
    result_dict = {}
    n = len(sd.accessions)
    print 'Starting calculation'
    sys.stdout.flush()
    dists = []
    r2s = []
    t_r2s = []
    x_macs = []
    y_macs = []
    n_saved = 0
    s1 = time.time()
    for i, (x_c, x_p, x_snp) in enumerate(x_cps):
        print '%d: chromosome=%d, position=%d' % (i, x_c, x_p)
        #Normalize SNP..
        xs = sp.array(x_snp)
        x_mac = sum(xs)
        t_x_snp = sp.dot(((xs - sp.mean(xs)) / sp.std(xs)), H_sqrt_inv).T
        for (y_c, y_p, y_snp) in reversed(y_cps):
            if x_c != y_c:
                continue
            if abs(x_p - y_p) > max_dist:
                continue
            ys = sp.array(y_snp)
            x_macs.append(x_mac)
            y_macs.append(sum(ys))
            (r, pearson_pval) = st.pearsonr(xs, ys)
            r2 = r * r
            t_y_snp = sp.dot(((ys - sp.mean(ys)) / sp.std(ys)), H_sqrt_inv).T
            (t_r, t_pearson_pval) = st.pearsonr(
                t_x_snp, t_y_snp)  #Done twice, but this is fast..
            t_r, t_pearson_pval = float(t_r), float(t_pearson_pval)
            t_r2 = t_r * t_r
            dists.append(abs(x_p - y_p))
            r2s.append(r2)
            t_r2s.append(t_r2)
            n_saved += 1

    time_secs = time.time() - s1
    print 'It took %d minutes and %d seconds to finish.' % (time_secs / 60,
                                                            time_secs % 60)
    print '%d values were saved.' % n_saved
    sys.stdout.flush()

    #Now plotting and binning..
    for m_dist in [50000, 100000, 200000, 500000, 1000000]:
        kbs = m_dist / 1000
        bin_ids = sp.digitize(dists, sp.arange(0, m_dist, m_dist / 100)) - 1
        bin_dict = {}
        for bid in range(100):
            bin_dict[bid] = {'r2s': [], 't_r2s': []}
        filtered_r2s = []
        filtered_t_r2s = []
        filtered_dists = []
        for bid, r2, t_r2, dist in izip(bin_ids, r2s, t_r2s, dists):
            if dist > m_dist:
                continue
            bin_dict[bid]['r2s'].append(r2)
            filtered_r2s.append(r2)
            bin_dict[bid]['t_r2s'].append(t_r2)
            filtered_t_r2s.append(t_r2)
            filtered_dists.append(dist)

        pylab.figure()
        pylab.plot(filtered_dists,
                   filtered_r2s,
                   alpha=0.3,
                   color='k',
                   marker='.',
                   ls='None')
        pylab.xlabel('Distance (bases)')
        pylab.ylabel(r'$r^2$')
        pylab.savefig(file_prefix + '_%dkb_r2s.png' % (kbs))
        pylab.figure()
        pylab.plot(filtered_dists,
                   filtered_t_r2s,
                   alpha=0.3,
                   color='k',
                   marker='.',
                   ls='None')
        pylab.xlabel('Distance (bases)')
        pylab.ylabel(r'$r^2$')
        pylab.savefig(file_prefix + '_%dkb_t_r2s.png' % (kbs))

        r2_avgs = []
        t_r2_avgs = []
        xs = []
        l = sp.arange(0, m_dist, m_dist / 100) + (m_dist / 200)
        for bid in range(100):
            n = len(bin_dict[bid]['r2s'])
            if n > 0:
                r2_avgs.append(sp.sum(bin_dict[bid]['r2s']) / n)
                t_r2_avgs.append(sp.sum(bin_dict[bid]['t_r2s']) / n)
                xs.append(l[bid])

        pylab.figure()
        pylab.plot(xs,
                   r2_avgs,
                   alpha=0.7,
                   color='b',
                   lw=1.8,
                   label=r'standard $r^2$')
        pylab.plot(xs,
                   t_r2_avgs,
                   alpha=0.7,
                   color='m',
                   lw=1.8,
                   label=r'transformed $r^2$')
        pylab.legend(loc=1)
        pylab.xlabel('Distance (bases)')
        pylab.ylabel(r'$r^2$')
        pylab.savefig(file_prefix + '_%dkb_r2s_avgs.png' % (kbs))
예제 #11
0
    def perform_gwas(self, phen_name, dataset,transformation='raw', analysis_method='kw', call_method_id=75,
                     kinship_method='ibs', progress_file_writer=None):

        """
        Performs GWAS and updates the datastructure.
        """

        import bisect
        import gwa
        step_wise = False
        if analysis_method not in ['lm', 'emmax', 'kw']:
            raise Exception('analysis method %s not supported' % analysis_method)

        progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data')
        phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype
        phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}})
        phend.convert_to_averages()
        progress_file_writer.update_progress_bar(task_status='Loading genotype data')
        sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data
        progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data')
        sd.coordinate_w_phenotype_data(phend, 1)
        progress_file_writer.update_progress_bar(progress=0.1,task_status='Filtering monomorphic SNPs')
        sd.filter_monomorphic_snps()
        phen_vals = phend.get_values(1)
        snps = sd.getSnps()
        positions = sd.getPositions()
        chromosomes = []
        progress_file_writer.set_step(0.03)
        for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)):
            progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes)))
            chromosomes.extend([c] * len(s.snps))
        maf_dict = sd.get_mafs()
		

        kwargs = {}
        if analysis_method == 'emmax':
            progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix')
            k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions,
                                scaled=True, min_mac=5, sd=sd)
            progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing EMMAX')
            d = lm.emmax_step(phen_vals, sd, k, [], progress_file_writer=progress_file_writer)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
            res = d['res']
            stats_dict = d['stats']
        elif analysis_method == 'lm':
            progress_file_writer.update_progress_bar(progress=0.3, task_status='Performing LM')
            res = lm.linear_model(snps, phen_vals)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
        elif analysis_method == 'kw':
            progress_file_writer.update_progress_bar(progress=0.7, task_status='Performing KW')
            kw_res = util.kruskal_wallis(snps, phen_vals)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
            scores = map(lambda x:-math.log10(x), kw_res['ps'])
            self.add_results(phen_name, dataset,analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'],
                    maf_dict['mafs'], transformation=transformation, statistics=kw_res['ds'])
        else:
            raise Exception('analysis method %s not supported' % analysis_method)

        if analysis_method in ['lm', 'emmax']:
            if 'betas' in res:
                betas = map(list, zip(*res['betas']))
            else:
                betas = [None, None]
            scores = map(lambda x:-math.log10(x), res['ps'])
            stats_dict['step'] = 0
            cofactors = [stats_dict]
            self.add_results(phen_name, dataset, analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'],
                             maf_dict['mafs'], transformation=transformation,
                             genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1],
                              cofactors=cofactors)
        progress_file_writer.update_progress_bar(progress=1.0, task_status='Done')
        print 'Done!'
        return analysis_method
예제 #12
0
    def _run_otu_wperm(self,
                       file_prefix,
                       phenotype_file,
                       delimiter=',',
                       covariate_file=None,
                       phenotype_id=1,
                       call_method_id=1307,
                       maf_threshold=5,
                       number_of_permutations=10):
        ##
        #                phenotype_file = "/home/GMI/matt.horton/meta/metagenomics/gwas/leaf/16S/min800_cca/phenotypes/leaf.16S.800.2sampPerOTU.rare.cca.abd.2reps.n100.cca.txt"
        #                call_method_id = 1308
        #                maf_threshold = 5
        #                phenotype_id = 1
        #                delimiter = ','

        print "Opening snp and phenotype files."
        sys.stdout.flush()

        if '/' in phenotype_file:
            print "Opening phenotype-file: " + phenotype_file
            phenotype = pd.parse_phenotype_file(
                phenotype_file, delim=delimiter)  #load phenotype file
            results_directory = phenotype_file.partition(
                "phenotypes"
            )  # parse this off of the phenotypeFileName and sub the phenotypes dir for the results dir (which needs to be at the same level!!!)
            results_directory = results_directory[0] + 'results/'
            print "Outputing results to: " + results_directory
        else:
            phenotype = pd.parse_phenotype_file(
                env['phen_dir'] + phenotype_file,
                delim=delimiter)  #load phenotype file
            results_directory = env['results_dir']

        sd = dp.load_snps_call_method(call_method_id=call_method_id,
                                      data_format='binary')
        indices_to_keep = sd.coordinate_w_phenotype_data(
            phenotype, phenotype_id)  #truncate to the phenotype of interest.
        indices_to_keep = indices_to_keep.get('pd_indices_to_keep')

        # determine whether to use mac or maf (I might have to use the mac code after determining what the mac should be from the maf)
        if maf_threshold > 0:
            sd.filter_mac_snps(10)


#                        mac_threshold = int(math.ceil(len(sd.accessions) * (float(maf_threshold) / 100)))
#                        print "Applying maf threshold: " + str(maf_threshold) + "% to " + str(len(sd.accessions)) + " accessions (mac < " + str(mac_threshold) + ")"
#                        sd.filter_mac_snps(mac_threshold)

        phenotype_name = phenotype.get_name(phenotype_id)
        phenotype_values = phenotype.get_values(phenotype_id)
        Z = phenotype.get_incidence_matrix(phenotype_id)

        print "There are: " + str(sd.num_snps()) + " SNPs."
        print "in: " + str(len(sd.accessions)) + " accessions"
        print "and " + str(len(indices_to_keep)) + " observations."
        print "The average number of observations per genotype is " + str(
            float(len(indices_to_keep)) / float(len(sd.accessions)))
        sys.stdout.flush()

        K = sd.get_ibs_kinship_matrix()
        K = sp.matrix(K)
        Z = sp.matrix(Z)

        print "Examining phenotype: '" + phenotype_name + "' (phenotype_id: " + str(
            phenotype_id) + ")."
        print 'Applying Permutation tests.'

        snps = sd.get_snps()

        print "Running %d EMMAX-permutations (writes %d dots)" % (
            number_of_permutations, number_of_permutations)
        s1 = time.time()
        res_perm = self._emmax_permutations(snps,
                                            phenotype_values,
                                            number_of_permutations,
                                            K=K,
                                            Z=Z)
        p_f_list = zip(res_perm['min_ps'], res_perm['max_f_stats'])
        p_f_list.sort()
        print p_f_list[:10]
        threshold = p_f_list[len(p_f_list) / 20]
        res_perm['threshold_05'] = threshold
        print 'Threshold should be:', threshold
        secs = time.time() - s1
        if secs > 60:
            mins = int(secs) / 60
            secs = secs - mins * 60
            print 'Took %d mins and %f seconds.' % (mins, secs)
        else:
            print 'Took %f seconds.' % (secs)

        print "Permutation tests done for phenotype: " + phenotype_name

        results = {}
        results['perm_pval'] = res_perm['min_ps'].tolist()
        results['perm_fstat'] = res_perm['max_f_stats'].tolist()

        output_file = '%s/%s_perm.pvals_pid_%d_%s' % (
            results_directory, file_prefix, phenotype_id, phenotype_name)
        columns = ['perm_pval', 'perm_fstat']
        with open(output_file, "w") as f:
            f.write(','.join(columns) + "\n")
            for i in range(1, (number_of_permutations + 1)):
                l = [results[c][i - 1] for c in columns]
                l = map(str, l)
                f.write(",".join(l) + "\n")
        print "Permutation p-values written."
예제 #13
0
def _perform_gwas_(phen_id,phenData,analysis_method,transformation,genotype,kinship_type,kinshipFile=None,messenger=None,outputfile=None):
    additional_columns = {}
    messenger.update_status(progress=0.0, task_status='Loading genotype data')
    genotypeData = dataParsers.load_snps_call_method(genotype)
    #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype)
    K = None
    messenger.update_status(step=0.05, task_status='Preparing data')
    n_filtered_snps = _prepare_data_(genotypeData,phenData,phen_id)
    phen_vals = phenData.get_values(phen_id)
    if analysis_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm','amm']:
        #Load genotype file (in binary format)
        sys.stdout.write("Retrieving the Kinship matrix K.\n")
        sys.stdout.flush()
        if kinshipFile:   #Kinship file was supplied..
            messenger.update_status(progress=0.15, task_status='Loading supplied kinship file: %s' % kinshipFile)
            print 'Loading supplied kinship file: %s' % kinshipFile
            K = kinship.load_kinship_from_file(kinshipFile, genotypeData.accessions)
        else:
            messenger.update_status(progress=0.15, task_status='Loading kinship file')
            print 'Loading kinship file.'
            K = kinship.get_kinship(call_method_id=genotype,
                                            method=kinship_type, n_removed_snps=n_filtered_snps,
                                            remain_accessions=genotypeData.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

    snps = genotypeData.getSnps()
    positions = genotypeData.getPositions()
    chromosomes = []
    for i, (s, c) in enumerate(itertools.izip(genotypeData.snpsDataList, genotypeData.chromosomes)):
        chromosomes.extend([c] * len(s.snps))
        maf_dict = genotypeData.get_mafs()
    
    if analysis_method in ['kw']:
        messenger.update_status(progress=0.7, task_status='Performing KW')
        res = util.kruskal_wallis(snps, phen_vals)
        
    elif analysis_method in ['loc_glob_mm']:
        raise NotImplementedError
    elif analysis_method in ['emma']:
        res = lm.emma(snps, phen_vals, K)
    elif analysis_method in ['emmax','amm']:
        d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100)
        res = d['res']
        #additional_columns['stats'] = d['stats']
    elif analysis_method in ['lm']:
        d = lm.lin_reg_step(phen_vals, genotypeData, [])
        res = d['res']
        #additional_columns['stats'] = d['stats']
    else:
        raise Exception('analysis method %s not supported' % analysis_method)
    
    pvals = res['ps']
    
    #Calculate Benjamini-Hochberg threshold
    bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05)
    #Calculate Median p-value
    med_pval = agr.calc_median(res['ps'])
    #Calculate the Kolmogorov-Smirnov statistic
    ks_res = agr.calc_ks_stats(res['ps'])
    
    quantiles_dict = _calculate_qqplot_data_(pvals)
    scores = map(lambda x:-math.log10(x), pvals)
    
    if analysis_method in ['lm', 'emma', 'emmax','amm']:
        additional_columns['genotype_var_perc'] = res['var_perc']
        if 'betas' in res:
            betas = map(list, zip(*res['betas']))
            additional_columns['beta0'] = betas[0]
            if len(betas) > 1:
                additional_columns['beta1'] = betas[1]
    
    #calculate ld
    if outputfile is None:
         outputfile = "%s.hdf5" % phen_id
    messenger.update_status(progress=0.8, task_status='Processing and saving results')
    _save_hdf5_pval_file(outputfile, analysis_method, transformation,chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], 
                         quantiles_dict,ks_res,bh_thres_d['thes_pval'],med_pval,additional_columns)
예제 #14
0
    def perform_stepwise_gwas(self, phen_name, dataset, transformation, analysis_method, result_name, chromosome, position,
                              call_method_id=75, kinship_method='ibs',progress_file_writer=None):

        """
        Performs GWAS and updates the datastructure.
        """

        #if analysis_method not in ['emmax','lm']:
        #    raise Exception("Step-Wise GWAS only possible with emmax or LM")
        snp = ((int(chromosome), int(position)))
        result_group = self.h5file.getNode('/phenotypes/%s/%s/%s/%s' % (phen_name, dataset, transformation, analysis_method))
        result = result_group._f_getChild(result_name)
        cofactors = result._v_attrs.cofactors[:]
        co_var_snps = [(int(factors['chr']), int(factors['pos'])) for factors in cofactors if 'chr' in factors and 'pos' in factors]
        if snp in co_var_snps:
            raise Exception('The SNP %s,%s is already in the result' % chromosome, position)
        co_var_snps.append(snp)
        co_var_snps = set(co_var_snps)
        #for avail_result in result_group._f_iterNodes(classname='Table'):
        #   if set(avail_result._v_attrs.cofactors) == co_var_snps:
        #      raise Exception("There is already a result with the selected snps") 

        new_result_name = "SW_%s" % result_group._v_nchildren
        name = "%s_%s" % (analysis_method, new_result_name)

        import bisect
        import gwa
        if analysis_method not in ['lm', 'emmax', 'kw']:
            raise Exception('analysis method %s not supported' % analysis_method)
        if analysis_method == 'kw':
            analysis_method = 'emmax'
        progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data')
        phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype
        phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}})
        phend.convert_to_averages()
        progress_file_writer.update_progress_bar(task_status='Loading genotype data')
        sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data
        progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data')
        sd.coordinate_w_phenotype_data(phend, 1)
        sd.filter_monomorphic_snps()
        phen_vals = phend.get_values(1)

        snps = sd.getSnps()
        positions = sd.getPositions()
        chromosomes = []
        progress_file_writer.set_step(0.03)
        for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)):
            chromosomes.extend([c] * len(s.snps))
            progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes)))
        maf_dict = sd.get_mafs()


        kwargs = {}
        if analysis_method == 'emmax':
            progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix')
            k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions,
            scaled=True, min_mac=5, sd=sd)
            progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing Step-Wise EMMAX')
            d = lm.emmax_step(phen_vals, sd, k, co_var_snps,progress_file_writer=progress_file_writer)
            progress_file_writer.update_progress_bar(0.95, 'Processing and saving results')
            res = d['res']
            stats_dict = d['stats']
        elif analysis_method == 'lm':
            res = lm.linear_model(snps, phen_vals)
        else:
            raise Exception('analysis method %s not supported' % analysis_method)

        if analysis_method in ['lm', 'emmax']:
            if 'betas' in res:
                betas = map(list, zip(*res['betas']))
            else:
                betas = [None, None]
            scores = map(lambda x:-math.log10(x), res['ps'])

            stats_dict['chr'] = snp[0]
            stats_dict['pos'] = snp[1]
            stats_dict['step'] = len(cofactors)
            cofactors.append(stats_dict)

            self.add_results(phen_name,dataset, analysis_method, name, chromosomes, positions, scores, maf_dict['marfs'],
                    maf_dict['mafs'], transformation=transformation,
                    genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1],
                     cofactors=cofactors, result_name=new_result_name)
        print 'Done!'
        progress_file_writer.update_progress_bar(1.0, 'Done')
        return name
예제 #15
0
def calc_r2_levels(file_prefix,
                   x_start_i,
                   x_stop_i,
                   call_method_id=78,
                   data_format='diploid_int',
                   mac_filter=15,
                   save_threshold=0.2,
                   save_threshold2=0.3,
                   debug_filter=1,
                   local_window=50000):
    """
    Compute pairwise r2 between a slice of SNPs and the SNPs that follow
    them, and write the pairs exceeding a threshold to an HDF5 file.

    The SNPs are loaded with ``dp.load_snps_call_method``.  Each "x" SNP in
    ``cps_list[x_start_i:x_stop_i]`` is compared with every SNP whose
    (chromosome, position) is strictly greater.  The full list is walked in
    reverse and the walk stops at the first SNP that is not greater than
    the x SNP (this presumably relies on ``sd.getChrPosSNPList()`` being
    sorted by chromosome and position -- TODO confirm).

    For pairs that pass the threshold, a second correlation is computed on
    the SNPs after standardising them and multiplying by ``H_sqrt_inv``
    (the inverse of ``lm.cholesky`` applied to the SNP covariance matrix).
    That matrix is cached as a pickle under ``env['data_dir']``.

    Parameters:
        file_prefix: prefix of the output file; the file is named
            ``<file_prefix>_x_<x_start_i>_<x_stop_i>.hdf5``.
        x_start_i, x_stop_i: slice bounds selecting the x SNPs.
        call_method_id, data_format, debug_filter: passed on to
            ``dp.load_snps_call_method``.
        mac_filter: passed as ``min_mac`` when loading the SNPs.
        save_threshold: r2 must exceed this for pairs on different
            chromosomes or further apart than ``local_window``.
        save_threshold2: r2 must exceed this for pairs on the same
            chromosome at most ``local_window`` apart.
        local_window: position distance separating the two thresholds
            (defaults to the previously hard-coded 50000).

    Returns:
        None.  Results are written to the HDF5 file, one group ``x<i>`` per
        x SNP that had at least one saved pair.
    """
    sd = dp.load_snps_call_method(call_method_id=call_method_id,
                                  data_format=data_format,
                                  debug_filter=debug_filter,
                                  min_mac=mac_filter)
    h_inverse_matrix_file = env[
        'data_dir'] + 'snp_cov_mat_h_inv_cm%d.pickled' % (call_method_id)
    if not os.path.isfile(h_inverse_matrix_file):
        K = sd.get_snp_cov_matrix()
        H_sqrt = lm.cholesky(K)
        H_sqrt_inv = (H_sqrt).I
        with open(h_inverse_matrix_file, 'wb') as f:
            cPickle.dump(H_sqrt_inv, f, protocol=2)
    else:
        # Protocol 2 is a binary format, so the cache must be read in binary
        # mode.  NOTE: unpickling executes arbitrary code; this file is only
        # expected to be the cache written by the branch above.
        with open(h_inverse_matrix_file, 'rb') as f:
            H_sqrt_inv = cPickle.load(f)

    def _transform(values):
        # Standardise the SNP vector, then multiply by the cached inverse.
        return sp.dot(((values - sp.mean(values)) / sp.std(values)),
                      H_sqrt_inv).T

    cps_list = sd.getChrPosSNPList()
    x_cps = cps_list[x_start_i:x_stop_i]
    y_cps = cps_list
    print('Starting calculation')
    sys.stdout.flush()
    hdf5_file_name = file_prefix + '_x_' + str(x_start_i) + '_' + str(
        x_stop_i) + ".hdf5"
    h5_file = h5py.File(hdf5_file_name, 'w')
    try:
        for i, (x_c, x_p, x_snp) in enumerate(x_cps):
            print('%d: chromosome=%d, position=%d' % (i, x_c, x_p))
            xs = sp.array(x_snp)
            t_x_snp = _transform(xs)
            s1 = time.time()
            y_cs = []
            y_ps = []
            r2s = []
            t_r2s = []
            ps = []
            t_ps = []
            n_saved = 0
            for (y_c, y_p, y_snp) in reversed(y_cps):
                if not (x_c, x_p) < (y_c, y_p):
                    # Reached the x SNP itself (or an earlier one): stop.
                    break
                ys = sp.array(y_snp)
                (r, pearson_pval) = st.pearsonr(xs, ys)
                r2 = r * r
                # Nearby pairs on the same chromosome use save_threshold2,
                # all other pairs use save_threshold.
                is_local = x_c == y_c and y_p - x_p <= local_window
                threshold = save_threshold2 if is_local else save_threshold
                if r2 > threshold:
                    # The transformed correlation is only computed for the
                    # pairs that are actually saved.
                    t_y_snp = _transform(ys)
                    (t_r, t_pearson_pval) = st.pearsonr(t_x_snp, t_y_snp)
                    t_r, t_pearson_pval = float(t_r), float(t_pearson_pval)
                    y_cs.append(y_c)
                    y_ps.append(y_p)
                    r2s.append(r2)
                    t_r2s.append(t_r * t_r)
                    ps.append(pearson_pval)
                    t_ps.append(t_pearson_pval)
                    n_saved += 1
            if n_saved > 0:
                grp = h5_file.create_group('x%d' % i)
                grp.create_dataset("n_saved", data=n_saved)
                grp.create_dataset("x_c", data=x_c)
                grp.create_dataset("x_p", data=x_p)
                grp.create_dataset("x_snp", compression='gzip', data=x_snp)
                grp.create_dataset("y_cs", compression='gzip', data=y_cs)
                grp.create_dataset("y_ps", compression='gzip', data=y_ps)
                grp.create_dataset("r2s", compression='gzip', data=r2s)
                grp.create_dataset("t_r2s", compression='gzip', data=t_r2s)
                grp.create_dataset("ps", compression='gzip', data=ps)
                grp.create_dataset("t_ps", compression='gzip', data=t_ps)

            time_secs = time.time() - s1
            print('It took %d minutes and %d seconds to finish.' %
                  (time_secs / 60, time_secs % 60))
            print('%d values were saved.' % n_saved)
            sys.stdout.flush()
    finally:
        # Close the HDF5 file even when an iteration fails, so that the
        # groups written so far are flushed and the handle is released.
        h5_file.close()
예제 #16
0
def get_kinship(call_method_id=75,
                data_format='binary',
                method='ibs',
                n_removed_snps=None,
                remain_accessions=None,
                scaled=True,
                min_mac=5,
                sd=None,
                debug_filter=1,
                return_accessions=False):
    """
    Loads and processes the kinship matrix

    For ``method='ibd'`` the matrix is computed from ``sd``.  For
    ``method='ibs'`` it is read from a cached file under
    ``env.env['cm_dir']`` when that file exists; otherwise the SNPs are
    loaded for ``call_method_id``, the matrix is computed and the cache
    file is written.  If loading the SNPs fails and ``sd`` was supplied,
    the matrix is computed from ``sd`` instead (without writing a cache).

    Parameters:
        call_method_id: call method used to locate/generate the IBS cache.
        data_format: data format used in the cache file name and passed to
            the SNP loader.
        method: 'ibs' or 'ibd'.
        n_removed_snps, remain_accessions: when both are given, the IBS
            matrix is passed through ``update_k_monomorphic``.
        scaled: apply ``scale_k`` to the matrix before returning it.
        min_mac: used in the cache file name and passed to the SNP loader.
        sd: SNPs data object; required for 'ibd', optional fallback for
            'ibs'.
        debug_filter: passed to the SNP loader.
        return_accessions: when True, return ``(k, k_accessions)``.  Only
            honoured on the regular IBS path, i.e. not for 'ibd', not for
            the ``sd`` fallback and not after ``update_k_monomorphic``.

    Returns:
        The kinship matrix, or ``(k, k_accessions)`` as described above.
        ``None`` when ``method='ibs'`` and ``call_method_id`` is falsy.

    Raises:
        NotImplementedError: for an unknown ``method``, or for 'ibd'
            without ``sd``.
        Exception: whatever the SNP loader raised, when the IBS cache is
            missing, loading fails and no ``sd`` fallback was supplied.
    """
    if method == 'ibd':
        if sd is None:
            raise NotImplementedError(
                'Currently only IBS kinship matrices are supported')
        k = sd.get_ibd_kinship_matrix()
        if scaled:
            k = scale_k(k)
        return k

    if method != 'ibs':
        message = 'Method {} is not implemented'.format(method)
        print(message)
        raise NotImplementedError(message)

    if not call_method_id:
        # NOTE(review): nothing is computed without a call method id, even
        # when `sd` is supplied; kept as-is for backward compatibility.
        return None

    # Project modules are only needed on the IBS path, so import them here.
    import dataParsers as dp
    import env

    file_prefix = '%s%d/kinship_%s_%s' % (
        env.env['cm_dir'], call_method_id, method, data_format)
    kinship_file = file_prefix + '_mac%d.h5py' % min_mac
    if os.path.isfile(kinship_file):
        print('Found kinship file: {}'.format(kinship_file))
        d = load_kinship_from_file(kinship_file, scaled=False)
        k = d['k']
        k_accessions = d['accessions']
        n_snps = d['n_snps']
    else:
        print("Didn't find kinship file: {}, now generating one..".
              format(kinship_file))
        try:
            sd = dp.load_snps_call_method(
                call_method_id=call_method_id,
                data_format=data_format,
                min_mac=min_mac,
                debug_filter=debug_filter)
        except Exception:
            if sd is None:
                # No fallback data: surface the real loading error instead
                # of failing later on an unbound `k`.
                raise
            # Fall back to the caller-supplied SNP data.
            k = sd.get_ibs_kinship_matrix()
            if scaled:
                k = scale_k(k)
            return k

        k = sd.get_ibs_kinship_matrix()
        k_accessions = sd.accessions
        n_snps = sd.num_snps()
        save_kinship_to_file(kinship_file, k, sd.accessions, n_snps)

    # Identity checks: `!= None` is ambiguous for array-like arguments.
    adjust_for_removed = (n_removed_snps is not None
                          and remain_accessions is not None)
    if adjust_for_removed:
        k = update_k_monomorphic(n_removed_snps,
                                 k,
                                 k_accessions,
                                 n_snps,
                                 remain_accessions,
                                 kinship_type='ibs',
                                 dtype='single')
    if scaled:
        k = scale_k(k)
    if return_accessions and not adjust_for_removed:
        return k, k_accessions
    return k
예제 #17
0
def test_genotype_data():
    import dataParsers as dp
    sd = dp.load_snps_call_method(75)
    gd = genotype_data('/tmp/test.h5py')