示例#1
0
def load_a_thaliana_genotypes():
    """
    Parse the A. thaliana genotype file (Horton et al., 2012).

    The file 'at_data/all_chromosomes_binary.csv' is resolved relative to the
    current working directory and handed to dataParsers.parse_snp_data; the
    snps_data object produced by that call is returned unchanged.
    """
    from dataParsers import parse_snp_data
    genotype_file = 'at_data/all_chromosomes_binary.csv'
    return parse_snp_data(genotype_file)
示例#2
0
def load_a_thaliana_genotypes():
    """
    Parse the A. thaliana genotype file (Horton et al., 2012).

    The file 'at_data/all_chromosomes_binary.csv' is resolved relative to the
    current working directory and handed to dataParsers.parse_snp_data; the
    snps_data object produced by that call is returned unchanged.
    """
    from dataParsers import parse_snp_data
    genotype_file = 'at_data/all_chromosomes_binary.csv'
    return parse_snp_data(genotype_file)
示例#3
0
def _insert_markers_into_db_():
    """
    Insert markers missing from the 192-accession 250K data into the DB.

    Every (chromosome, position, snp) of the merged 2010/250K data set is
    looked up in the 192-accession 250K position list.  Markers that are not
    found there are looked up in stock_250k.snps and inserted if absent.
    Only chromosomes >= 5 are processed; lower chromosomes are skipped.

    Each successful insert is committed immediately.  A failing insert is
    reported and skipped (best effort).  The cursor and connection are always
    closed, also when an error propagates.
    """
    import bisect
    import dbutils

    sd_192 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_192_043009.csv')
    sd_t57 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_merged_2010_250K_w_FLC_seq.csv'
    )
    cpl_192 = sd_192.getChrPosList()
    cpsl_t57 = sd_t57.getChrPosSNPList()

    # Values are passed separately from the statement (DB-API parameter
    # binding) so that the driver does the quoting.
    # NOTE(review): assumes the driver behind dbutils.connect_to_papaya uses
    # the 'format' paramstyle (%s placeholders, as MySQLdb does) - confirm.
    select_sql = ("SELECT id FROM stock_250k.snps WHERE chromosome=%s AND position=%s AND "
                  "(allele1=%s OR allele1=%s);")
    insert_sql = ("INSERT INTO stock_250k.snps (name, chromosome, position, allele1, allele2) "
                  "VALUES (%s,%s,%s,%s,%s);")

    conn = dbutils.connect_to_papaya()
    try:
        cursor = conn.cursor()
        try:
            for c, p, snp in cpsl_t57:
                if c < 5:
                    continue
                # bisect returns the insertion point to the right of an equal
                # entry, so a match (if any) sits at index i - 1.
                i = bisect.bisect(cpl_192, (c, p))
                if i > 0 and cpl_192[i - 1] == (c, p):
                    continue

                alleles = list(set(snp))
                if len(alleles) < 2:
                    # A monomorphic marker has no second allele to store.
                    print("Skipping monomorphic marker at chromosome %s, position %s" % (c, p))
                    continue

                # Check if the SNP is in the DB.
                select_params = (c, p, alleles[0], alleles[1])
                print("%s %s" % (select_sql, select_params))
                cursor.execute(select_sql, select_params)
                row = cursor.fetchone()
                if row:
                    print(row)
                    continue

                # Insert SNP in DB.
                snp_name = "%s_%s_%s_%s" % (c, p, alleles[0], alleles[1])
                insert_params = (snp_name, c, p, alleles[0], alleles[1])
                print("%s %s" % (insert_sql, insert_params))
                try:
                    cursor.execute(insert_sql, insert_params)
                    print("Committing transaction (making changes permanent).")
                    conn.commit()
                except Exception as err:
                    print("insert failed (%s)... moving on" % (err,))
        finally:
            cursor.close()
    finally:
        conn.close()
示例#4
0
def generate_384_snps_illumina_file():
    """
    Write an Illumina-style allele table for the 250K (192 accessions) SNPs.

    For every bi-allelic SNP (no '-' allele, position inside the reference)
    on chromosomes 1-5 the Col reference sequence is used to build:
      * Allele_1: the 5 reference bases centred on the SNP,
      * Allele_2: the same 5 bases with the non-reference allele in the middle,
      * a +/-60 bp flanking sequence with the SNP written as '[a/b]'
        (collected and partly printed, but not written to the file).

    The table is written to /Users/bjarnivilhjalmsson/tmp/test.csv.

    Raises:
        Exception: if the reference base matches neither SNP allele.
    """
    #sd = dp.parse_snp_data("/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv")
    sd = dp.parse_snp_data(
        "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_192_043009.csv")

    locus_names = []
    locus_sequences = []
    allele_1s = []
    allele_2s = []
    chromosomes = []
    positions = []
    col_alleles = []
    for ci, chromosome in enumerate([1, 2, 3, 4, 5]):
        col = sequences.get_col_sequence(chromosome)
        seq_len = len(col.seq)
        snpsd = sd.snpsDataList[ci]
        for i, pos in enumerate(snpsd.positions):
            snp = list(set(snpsd.snps[i]))
            if len(snp) != 2 or pos >= seq_len or '-' in snp:
                continue

            # Positions are 1-based, the sequence is 0-based.
            col_allele = col.seq[pos - 1]
            if col_allele == str(snp[0]):
                other_allele = str(snp[1])
            elif col_allele == str(snp[1]):
                other_allele = str(snp[0])
            else:
                raise Exception(
                    "Reference allele %s at chromosome %s, position %s matches neither SNP allele %s"
                    % (col_allele, chromosome, pos, snp))

            # Clamp at 0: a negative slice start would count from the end of
            # the sequence and return the wrong bases for pos < 3.
            flank_start = max(0, pos - 3)
            allele_1s.append(col.seq[flank_start:pos + 2])
            allele_2s.append(col.seq[flank_start:pos - 1] + other_allele +
                             col.seq[pos:pos + 2])

            start_pos = max(0, pos - 61)
            end_pos = min(seq_len, pos + 60)
            snp_str = '[' + str(snp[0]) + '/' + str(snp[1]) + ']'
            local_seq = col.seq[start_pos:pos - 1] + snp_str + col.seq[pos:end_pos]
            locus_names.append('c' + str(chromosome) + '_p' + str(pos))
            positions.append(pos)
            locus_sequences.append(local_seq)
            chromosomes.append(chromosome)
            if not col_allele in snp:
                print("%s %s %s %s" % (col_allele, snp, chromosome, pos))
            col_alleles.append(col_allele)

    import csv
    # The context manager guarantees the rows are flushed and the handle closed.
    with open("/Users/bjarnivilhjalmsson/tmp/test.csv", 'w') as out_file:
        w = csv.writer(out_file)
        w.writerow([
            'Chromosome', 'Coordinate', 'Allele_1', 'Allele_2',
            'Genome_Build_Version', 'Sequence_Orientation', 'Plus_Minus'
        ])
        for (a1, a2, c, p) in zip(allele_1s, allele_2s, chromosomes, positions):
            # The header declares a Plus_Minus column; rows previously had
            # only six fields.  'Plus' matches the forward orientation used
            # by the earlier locus-sequence layout of this file.
            w.writerow([c, p, a1, a2, 'TAIR8', 'Forward', 'Plus'])

    print(locus_sequences[100:110])
    print(col_alleles[100:110])
示例#5
0
def _get_genotype_data_(p_dict):
    """
    Load the genotype data selected by the run parameters.

    If p_dict['data_file'] is set, that file is parsed (using
    p_dict['data_format'] and p_dict['debug_filter']).  Otherwise the data of
    call method p_dict['call_method_id'] is loaded with the same format and
    filter settings.  Returns whatever the dataParsers loader returns.
    """
    data_file = p_dict['data_file']
    if data_file:
        return dataParsers.parse_snp_data(data_file, format=p_dict['data_format'],
                                          filter=p_dict['debug_filter'])

    call_method_id = p_dict['call_method_id']
    data_format = p_dict['data_format']
    #data_format = data_format if not call_method_id in [78, 79] else 'diploid_int'
    return dataParsers.load_snps_call_method(call_method_id, data_format=data_format,
                                             debug_filter=p_dict['debug_filter'])
示例#6
0
def _insert_merged_data_in_db_():
    """
    Ad hoc, to fix a problem..

    Looks up, for every accession of the merged 2010/250K data set, the array
    id recorded for that accession in the t54 data set, stores the resulting
    list on the merged data set and writes it out as call_method_57.tsv.
    """
    sd_t54 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t54.csv',
        filter=0.001)
    # accession (ecotype id) -> array id; a later duplicate overrides an earlier one.
    array_id_of = dict(zip(sd_t54.accessions, sd_t54.array_ids))

    sd_t57 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_merged_2010_250K_w_FLC_seq.csv'
    )
    merged_array_ids = []
    for ecotype_id in sd_t57.accessions:
        merged_array_ids.append(array_id_of[ecotype_id])
    sd_t57.arrayIds = merged_array_ids

    sd_t57.write_to_file_yu_format(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/call_method_57.tsv')
示例#7
0
def write_simple_toomaijan_file(
        filename="/Users/bjarnivilhjalmsson/tmp/test.csv", window=25):
    """
    Write the flanking reference sequence of every 250K (t54) SNP to a CSV file.

    For every bi-allelic SNP (no '-' allele, position inside the reference)
    on chromosomes 1-5, the Col reference sequence `window` bases on either
    side of the SNP is written, with the SNP itself rendered as
    '[reference_allele/other_allele]'.

    Args:
        filename: path of the CSV file to write.
        window: number of flanking bases kept on each side of the SNP.

    Raises:
        Exception: if the reference base matches neither SNP allele.
    """
    sd = dp.parse_snp_data(
        "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t54.csv")

    locus_names = []
    locus_sequences = []
    allele_1s = []
    allele_2s = []
    chromosomes = []
    positions = []
    col_alleles = []
    for ci, chromosome in enumerate([1, 2, 3, 4, 5]):
        col = sequences.get_col_sequence(chromosome)
        seq_len = len(col.seq)
        snpsd = sd.snpsDataList[ci]
        for i, pos in enumerate(snpsd.positions):
            snp = list(set(snpsd.snps[i]))
            if len(snp) != 2 or pos >= seq_len or '-' in snp:
                continue

            # Positions are 1-based, the sequence is 0-based.
            col_allele = col.seq[pos - 1]
            if col_allele == str(snp[0]):
                other_allele = str(snp[1])
            elif col_allele == str(snp[1]):
                other_allele = str(snp[0])
            else:
                raise Exception(
                    "Reference allele %s at chromosome %s, position %s matches neither SNP allele %s"
                    % (col_allele, chromosome, pos, snp))

            # Clamp at 0: a negative slice start would count from the end of
            # the sequence and return the wrong bases for pos < 3.
            flank_start = max(0, pos - 3)
            allele_1s.append(col.seq[flank_start:pos + 2])
            allele_2s.append(col.seq[flank_start:pos - 1] + other_allele +
                             col.seq[pos:pos + 2])

            start_pos = max(0, pos - window - 1)
            end_pos = min(seq_len, pos + window)
            snp_str = '[' + col_allele + '/' + other_allele + ']'
            local_seq = col.seq[start_pos:pos - 1] + snp_str + col.seq[pos:end_pos]
            locus_names.append('c' + str(chromosome) + '_p' + str(pos))
            positions.append(pos)
            locus_sequences.append(local_seq)
            chromosomes.append(chromosome)
            if not col_allele in snp:
                print("%s %s %s %s" % (col_allele, snp, chromosome, pos))
            col_alleles.append(col_allele)

    import csv
    # The context manager guarantees the rows are flushed and the handle closed.
    with open(filename, 'w') as out_file:
        w = csv.writer(out_file)
        w.writerow([
            'Chromosome', 'Coordinate', 'Sequence', 'Genome_Build_Version',
            'Sequence_Orientation'
        ])
        for (ls, c, p) in zip(locus_sequences, chromosomes, positions):
            w.writerow([c, p, ls, 'TAIR8', 'Forward'])

    print(locus_sequences[100:110])
    print(col_alleles[100:110])
示例#8
0
def lotus_data_analysis(phenotype_id=1,
                        result_files_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_results',
                        manhattan_plot_file='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_manhattan.png',
                        qq_plot_file_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_qq'):
    """
    Lotus GWAS (data from Stig U Andersen)

    Parses the Lotus genotypes and phenotypes, Box-Cox transforms the
    phenotype of interest, restricts both data sets to their shared
    accessions and runs a local-vs-global mixed model scan (100 kb windows,
    50 kb steps, IBD kinship).

    Args:
        phenotype_id: id of the phenotype to analyse.
        result_files_prefix: currently unused (the scan writes to a fixed
            file prefix); kept for interface compatibility.
        manhattan_plot_file: currently unused; kept for interface compatibility.
        qq_plot_file_prefix: currently unused; kept for interface compatibility.

    Returns:
        The value returned by linear_models.local_vs_global_mm_scan.
    """
    import linear_models as lm
    # kinship and gwaResults are not referenced below; the imports are
    # retained from the original (used by the disabled EMMAX/MLMM steps).
    import kinship
    import gwaResults as gr
    import dataParsers as dp
    import phenotypeData as pd

    # Load genotypes
    print('Parsing genotypes')
    sd = dp.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Dropbox/Lotus_GWAS/20140603_NonRep.run2.vcf.matrix.ordered.csv')

    # Load phenotypes
    print('Parsing phenotypes')
    phend = pd.parse_phenotype_file(
        '/Users/bjarnivilhjalmsson/Dropbox/Lotus_GWAS/141007_FT_portal_upd.csv')

    # Transform the phenotype that is actually analysed.  This used to be
    # hard-coded to 1, which only matched the default phenotype_id.
    print('Box-cox')
    phend.box_cox_transform(phenotype_id)

    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and
    # phenotypes, leaving only accessions (individuals) which overlap between both,
    # and SNPs that are polymorphic in the resulting subset.
    print('Coordinating data')
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Perform mixed model GWAS
    print('Performing mixed model GWAS')
    lg_results = lm.local_vs_global_mm_scan(phend.get_values(phenotype_id), sd,
                                            file_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lotus_FT_loc_glob_0.1Mb',
                                            window_size=100000, jump_size=50000, kinship_method='ibd', global_k=None)

    print('Processing results')
    return lg_results
示例#9
0
def _impute_FLC_192_():
    """
    Merge the imputed FLC sequence SNPs into chromosome 5 of the 250K data.

    The 250K (192 accessions) data is restricted to the accessions of the FLC
    phenotype file and to SNPs passing the 0.05 MAF filter.  The binary SNPs
    of the imputed FLC sequence alignment are then compared with, and merged
    into, the fifth entry of snpsDataList.
    """
    phenotype_file = "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    phed = pd.readPhenotypeFile(phenotype_file)

    d250k_sd = dataParsers.parse_snp_data(
        env.home_dir + "Projects/Data/250k/250K_192_043009.csv")
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)

    flc_snps_file = data_dir + "/flc_seqs_aln_imputed_snps_012710.csv"
    seq_snpsd = dataParsers.parseCSVData(flc_snps_file)
    seq_snpsd.onlyBinarySnps()

    # Index 4 is the fifth chromosome in the 0-based snpsDataList.
    for merge_step in ('compareWith', 'merge_data'):
        getattr(d250k_sd.snpsDataList[4], merge_step)(seq_snpsd)
示例#10
0
def removing_imputed_snps():
    """
    Keep the candidate SNPs that are absent from the 250K (t52) data.

    Candidates come from remove_overlapping_snps().  A candidate is kept when
    its (chromosome, position) is not among the positions of the NA-filtered
    250K data and its quality score exceeds 0.8.

    Returns:
        (chr_pos_list, quality_scores) of the kept candidates, in input order.
    """
    from bisect import bisect
    candidate_chr_pos, candidate_scores = remove_overlapping_snps()
    sd = dp.parse_snp_data(
        "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t52.csv")
    sd.filter_na_snps()
    known_chr_pos = sd.getChrPosList()

    kept_chr_pos = []
    kept_scores = []
    for idx, (chrom, position) in enumerate(candidate_chr_pos):
        key = (chrom, position)
        # An equal entry, if present, sits just left of the insertion point.
        insert_at = bisect(known_chr_pos, key)
        if known_chr_pos[insert_at - 1] != key and candidate_scores[idx] > 0.8:
            kept_chr_pos.append(key)
            kept_scores.append(candidate_scores[idx])
    print("%s %s" % (len(kept_chr_pos), len(kept_scores)))
    return kept_chr_pos, kept_scores
示例#11
0
def remove_overlapping_snps():
    """
    Select Illumina SNPs that are in the Perlegen data and have no Perlegen
    neighbour within 60 bp.

    The Illumina candidates come from load_illumina_results().  For each of
    them the other Perlegen SNPs on the same chromosome closer than 61 bp are
    counted; a candidate is kept when it is itself a Perlegen SNP and that
    count is zero.

    Returns:
        (chr_pos_list, quality_scores) of the kept SNPs, sorted by
        (chromosome, position) and then quality score.
    """
    from bisect import bisect
    #loading perlegen data.
    import dataParsers as dp
    sd = dp.parse_snp_data(
        "/Users/bjarnivilhjalmsson/Projects/Data/perlegen/perlegen_011609.csv")
    perl_chr_pos = sd.getChrPosList()
    chr_pos_list, quality_scores = load_illumina_results()

    num_snps = len(chr_pos_list)
    # Report progress roughly every 10%; at least 1 so that fewer than ten
    # SNPs do not cause a modulo-by-zero.
    progress_step = max(1, num_snps // 10)

    good = []
    for i, (chrom, pos) in enumerate(chr_pos_list):
        # An equal entry, if present, sits just left of the insertion point j.
        j = bisect(perl_chr_pos, (chrom, pos))
        found = j > 0 and perl_chr_pos[j - 1] == (chrom, pos)
        # Skip the SNP's own entry when scanning downwards.
        lower_start = j - 2 if found else j - 1
        n_count = _count_nearby_snps_(perl_chr_pos, chrom, pos, lower_start, j)
        if found and n_count == 0:
            good.append(((chrom, pos), quality_scores[i]))
        if i % progress_step == 0:
            print('%d%% done.' % (((i + 1.0) / num_snps) * 100))

    good.sort()
    good_snp_chr_pos = [chr_pos for chr_pos, q_score in good]
    good_q_scores = [q_score for chr_pos, q_score in good]
    print(len(good_snp_chr_pos))
    return good_snp_chr_pos, good_q_scores


def _count_nearby_snps_(sorted_chr_pos, chrom, pos, lower_start, upper_start, max_dist=61):
    """
    Count entries of sorted_chr_pos on chrom that are closer than max_dist to
    pos, scanning downwards from lower_start and upwards from upper_start.
    Both scans stop at the ends of the list instead of running past them.
    """
    n_count = 0
    k = lower_start
    while k >= 0:
        n_chrom, n_pos = sorted_chr_pos[k]
        if n_chrom != chrom or pos - n_pos >= max_dist:
            break
        n_count += 1
        k -= 1

    k = upper_start
    while k < len(sorted_chr_pos):
        n_chrom, n_pos = sorted_chr_pos[k]
        if n_chrom != chrom or n_pos - pos >= max_dist:
            break
        n_count += 1
        k += 1
    return n_count
示例#12
0
def lotus_mixed_model_gwas(phenotype_id=4, phen_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv', 
                           gt_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv', 
                           pvalue_file='mm_results.pvals', manhattan_plot_file='mm_manhattan.png', qq_plot_file_prefix='mm_qq'):
    """
    Perform mixed model (EMMAX) GWAS for Lotus data

    Steps: parse genotypes and phenotypes, restrict both to their shared
    accessions, compute an IBS kinship matrix, run EMMAX, then write the
    p-values, a Manhattan plot and a QQ-plot to the given files.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp

    # Genotypes first, phenotypes second.
    genotypes = dp.parse_snp_data(gt_file)
    import phenotypeData as pd
    phenotypes = pd.parse_phenotype_file(phen_file, with_db_ids=False)

    # Keep only accessions (individuals) present in both data sets, and the
    # SNPs that are still polymorphic in that subset.
    genotypes.coordinate_w_phenotype_data(phenotypes, phenotype_id)

    # IBS kinship, then the EMMAX scan itself.
    kinship_matrix = kinship.calc_ibs_kinship(genotypes.get_snps())
    scan = lm.emmax(genotypes.get_snps(), phenotypes.get_values(phenotype_id),
                    kinship_matrix)

    # Wrap the p-values in a result object and emit all outputs.
    result = gr.Result(scores=scan['ps'], snps_data=genotypes)
    result.write_to_file(pvalue_file)
    manhattan_options = dict(png_file=manhattan_plot_file, percentile=90,
                             plot_bonferroni=True, neg_log_transform=True)
    result.plot_manhattan(**manhattan_options)
    result.plot_qq(qq_plot_file_prefix)
示例#13
0
def _generate_250K_2010_FLC_data_(impute=True):
    """
    Create a combined version of
    250K, overlapping with the FLC phenotypes.
    Then merge with 2010 data (including indels).
    Then merge with FLC sequences.
    Impute missing SNPs.
    write to file.

    Args:
        impute: when true (the default), missing SNPs on the fifth
            chromosome entry are imputed before the data is written.

    The result is written to /tmp/test.csv.
    """
    import phenotypeData as pd
    import env

    phed = pd.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    )

    d2010_file = env.home_dir + "Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_accessions(phed.accessions)
    d2010_sd.filter_na_snps()
    d2010_sd.filter_maf_snps(0.05)

    #d250k_file = env.home_dir+"Projects/Data/250k/250K_t54.csv"
    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)

    d250k_sd.merge_snps_data(d2010_sd)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()

    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    seq_file = env.home_dir + "Projects/FLC_analysis/flc_seqs_aln_merged_050410.fasta"
    ad = sequences.readFastaAlignment(seq_file,
                                      ref_seq_name=ref_seq_name,
                                      ref_start=ref_start,
                                      ref_chr=ref_chr,
                                      alignment_type="muscle",
                                      ref_direction=1)
    # A leftover pdb.set_trace() used to stop here and wait for a debugger
    # prompt; it has been removed so the function runs unattended.
    r = ad.get_snps(type=0)
    seq_snpsd1 = r['snpsd']
    seq_snpsd1.merge_data(r['indels'], error_threshold=0.0)

    print("Now merging data..")

    # Index 4 is the fifth chromosome entry of the 0-based snpsDataList.  It
    # is looked up afresh each time because the filters run in between.
    d250k_sd.snpsDataList[4].compareWith(seq_snpsd1)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd1, union_accessions=False)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()
    if impute:
        d250k_sd.snpsDataList[4].impute_data()
    d250k_sd.writeToFile("/tmp/test.csv")
    print("YEAH!")
示例#14
0
def load_phentoype_file_wilczek():
    """
    Convert the Wilczek phenotype CSV into phenotype files keyed by ecotype id.

    The input has one row per plant line: unique id, accession name, then one
    column per phenotype.  Accession names (first word, lower-cased) are
    mapped to ecotype ids; rows whose accession cannot be mapped, and the
    lines 'karl27' and 'karl05', are reported and left out.

    Outputs:
        * unique_id_to_ecotype_id.csv - id / accession / ecotype key, with a
          flag telling whether the ecotype is in the 250K (t54) data,
        * phen_wilzcek_050710.tsv and .csv - the phenotype values, with empty
          cells stored as 'NA'.
    """
    import env
    import dataParsers

    filename = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/PhenotypeDataWilczek.csv"
    # Read the file once; the rows are needed for both the key file and the
    # phenotype values.
    with open(filename, "r") as f:
        reader = csv.reader(f)
        phenotype_names = [name.replace(" ", "_") for name in next(reader)[2:]]
        rows = list(reader)
    print(phenotype_names)

    accession_names = [row[1].split()[0].lower() for row in rows]
    accession_ID = [row[0] for row in rows]
    print(accession_names)

    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)#+["n13","kno-10","kno-10","shahdara","nd-1"])
    acc_dict["cibc-5"] = 6908
    acc_dict["wa-1"] = 6978
    acc_dict["gu-0"] = 7149
    # Accession names are lower-cased above, so the key must be lower case as
    # well; the mixed-case 'Rubezhnoe-1' could never match.
    acc_dict["rubezhnoe-1"] = 7323
    print("%s %s" % (len(acc_dict), acc_dict))

    d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)

    # Ecotypes and phenotype rows are collected in the same pass with the same
    # filter, so that entry k of one always belongs to entry k of the other.
    # (They used to be built in two passes with different filters, which
    # shifted the rows whenever a 'karl27'/'karl05' accession was mappable.)
    ecotypes = []
    phenotypes = []	#[acc_id][phenotype_name]
    key_file = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/unique_id_to_ecotype_id.csv"
    with open(key_file, "w") as f:
        f.write("unique_id, accession_name, ecotype_id, in_250k_data\n")
        for acc, acc_id, row in zip(accession_names, accession_ID, rows):
            if acc not in acc_dict or acc_id == 'karl27' or acc_id == 'karl05':
                print("(%s, %s) is missing" % (acc, acc_id))
                continue
            ecotype = acc_dict[acc]
            ecotypes.append(ecotype)
            phenotypes.append(['NA' if pv == "" else float(pv) for pv in row[2:]])
            f.write("%s,%s,%s,%s\n" % (acc_id, acc, str(ecotype), str(str(ecotype) in d250k_sd.accessions)))

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.tsv", delimiter='\t')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.csv", delimiter=',')
示例#15
0
def analyzeSNPs():
    """
    Run Kruskal-Wallis scans around FLC on three marker sets and plot them.

    Marker sets (all restricted to chromosome 5, 3,140,000-3,220,000, except
    the sequence SNPs which come from the FLC alignment):
      * 250K array data (blue),
      * imputed FLC sequence SNPs (green),
      * imputed 2010 data (red).

    For every phenotype in the FLC phenotype file a KW scan is run on a deep
    copy of each marker set.  The three results are drawn into one region
    plot, and for each set a box plot of the best scoring marker is written.
    All PDFs go to /Users/bjarnivilhjalmsson/tmp/.
    """
    import KW, phenotype_parsers, phenotypeData
    # Emma, phenotype_parsers, analyzePhenotype and analyzeSNPResult are not
    # referenced below; the imports are retained from the original.
    import Emma
    import analyzePhenotype as ap
    import analyzeSNPResult as asr
    import copy
    import regionPlotter as rp

    result_id = "filtered_imputed"
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"

    seq_snpsd = dataParsers.parseCSVData(
        data_dir + "/flc_seqs_aln_imputed_snps_012510.csv")[0]
    seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')

    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_na_snps()
    d2010_sd.convert_2_binary()
    d2010_sd.filter_maf_snps(0.05)
    d2010_sd = d2010_sd.get_region_snpsd(5, 3140000, 3220000)
    d2010_sd.remove_redundant_snps(w_missing=True)

    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv"
    snpsd = dataParsers.parse_snp_data(d250k_file)
    snpsd.filter_accessions(seq_snpsd.accessions)
    snpsd.convert_2_binary()
    snpsd.filter_maf_snps(0.05)

    snpsd = snpsd.get_region_snpsd(5, 3140000, 3220000)
    snpsd.remove_redundant_snps()

    seq_snpsd.remove_accessions(snpsd.accessions)
    seq_snpsd.snpsFilterRare(0.05)
    seq_snpsd.onlyBinarySnps()
    # Pairs of (index in the sequence data, index in the 250K data) used to
    # put the sequence accessions in the 250K order.
    acc_map = [(i, snpsd.accessions.index(acc))
               for i, acc in enumerate(seq_snpsd.accessions)]
    seq_snpsd.orderAccessions(acc_map)
    seq_snpsd.remove_redundant_snps(w_missing=True)

    #NOW PERFORM GWAS AND PLOT RESULT!!!!

    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    )
    results_colors = ['blue', 'green', 'red']
    snpsds = [snpsd, seq_snpsd, d2010_sd]
    phenotypeIndices = phend.phenIds

    reg_plotter = None
    for i in phenotypeIndices:
        results = []
        filtered_sds = []
        for marker_sd in snpsds:
            # Work on a copy so that each phenotype starts from the same data.
            new_sd = copy.deepcopy(marker_sd)
            res, f_sd = KW.run_kw(new_sd, phend, i, 5)
            filtered_sds.append(f_sd)
            res.negLogTransform()
            print("Got %s %s p-values from KW." % (len(res.scores),
                                                   len(res.positions)))
            results.append(res)

        reg_plotter = rp.RegionPlotter()
        reg_plotter.plot_small_result(
            results,
            results_colors=results_colors,
            pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_" +
            result_id + "_gwas_" + str(phend.getPhenotypeName(i)) + ".pdf")

        for j, (r, f_sd) in enumerate(zip(results, filtered_sds)):
            if len(r.scores) != len(f_sd.snps):
                # The values used to be passed after a comma instead of being
                # applied with %, so the raw format string was printed.
                print("Lengths not equal? %d, %d" % (len(r.scores),
                                                     len(f_sd.snps)))
            # Box plot for the marker with the highest -log score.
            r_i = r.scores.index(max(r.scores))
            phend.plot_marker_box_plot(
                i,
                f_sd,
                r_i,
                pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_kw_" +
                str(phend.getPhenotypeName(i)) + "_" + results_colors[j] +
                ".pdf",
                marker_score=r.scores[r_i])
示例#16
0
def plot_local_tree():
    """
    Plot trees (via analyzeHaplotype.plot_tree) for a set of regions around
    FLC on chromosome 5, for several data sets in turn:

      1. SNPs called from the FLC sequence alignment,
      2. the 250K data (whole data set and the 3,140,000-3,220,000 region),
      3. the 250K data after restricting accessions with accs_to_keep,
      4. after merging in the 2010 and Perlegen data.

    All trees are written as PDF files under /Users/bjarnivilhjalmsson/tmp/.
    The data objects are modified in place by the filter/merge calls, so the
    order of the statements below matters.
    """
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    accs_to_keep = _read_tree_accession_file_()
    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    # ref_end is assigned but not referenced anywhere in this function.
    ref_end = 3183000
    ref_chr = 5
    intron_start = 3175600
    intron_stop = 3179100
    #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_edited_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)
    #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)
    ad = sequences.readFastaAlignment(data_dir +
                                      "flc_seqs_aln_merged_011810.fasta",
                                      ref_seq_name=ref_seq_name,
                                      ref_start=ref_start,
                                      ref_chr=ref_chr,
                                      alignment_type="muscle",
                                      ref_direction=1)
    #ref_seq_name = "ref_2_Col-0"
    #ref_start = 3170001
    #ref_end = 3184000
    #ref_chr = 5
    #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)

    #r = ad_2010.get_snps(type=1,min_called_fraction=0.1)
    #seq_sd = r['snpsd'] #Raw SNPSs data
    #seq_sd.id = "Sequences"
    #seq_sd.remove_accessions(accs_to_keep,True)
    #seq_sd.filterMonoMorphicSnps()
    #print seq_sd.snps
    #snpsd = seq_sd.getSnpsData(missingVal='NA')
    # SNPs from the alignment, restricted with accs_to_keep, monomorphic SNPs dropped.
    r = ad.get_snps(type=1)
    seq_snpsd = r['snpsd']
    seq_snpsd.remove_accessions(accs_to_keep, True)
    seq_snpsd.filterMonoMorphicSnps()
    print len(seq_snpsd.snps)
    #i_snpsd = r['indels']

    #TREE and HAPLOTYPES
    import analyzeHaplotype as ah
    # (start, stop) regions for the sequence-based trees; the last one is the
    # intron_start/intron_stop interval defined above.
    start_stop_list = [(3170500, 3183000), (3172000, 3181000),
                       (3172000, 3175000), (3175000, 3178000),
                       (3178000, 3181000), (3176000, 3181000),
                       (intron_start, intron_stop)]
    for start, stop in start_stop_list:
        snpsd = seq_snpsd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/aln_tree_" + str(
            start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)

    #250K
    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t43_192.csv"
    d250k = dataParsers.parse_snp_data(d250k_file)
    # Tree on the full 250K data, built from d250k.getSnps(0.05).
    temp_d250k = snpsdata.RawSnpsData(snps=d250k.getSnps(0.05),
                                      accessions=d250k.accessions)
    tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_full_data_tree.pdf"
    ah.plot_tree(temp_d250k, tree_file, verbose=True)

    d250k_sd = d250k.get_region_snpsd(5, 3140000, 3220000)
    #d250k_sd = dataParsers.parse_snp_data_region(d250k_file,ref_chr,3140000,3220000,id="250K_data")
    # From here on start_stop_list holds the wider 250K region list; it is
    # reused by the next three loops.
    start_stop_list = [(3140000, 3220000), (3150000, 3210000),
                       (3170501, 3183000), (3172000, 3181000),
                       (3172000, 3175000), (3175000, 3178000),
                       (3178000, 3181000), (3176000, 3181000),
                       (intron_start, intron_stop)]
    for start, stop in start_stop_list:
        snpsd = d250k_sd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_tree_" + str(
            start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)
    seq_snpsd.mergeDataUnion(d250k_sd, unionType=1, verbose=True)

    #2010
    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_073009.csv"
    d2010_sd = dataParsers.parse_snp_data_region(d2010_file,
                                                 ref_chr,
                                                 3140000,
                                                 3220000,
                                                 id="2010_data")
    d2010_sd.filterMissingSnps(50)
    d2010_sd._convert_to_tg_ecotypes_()
    d2010_sd.mergeDataUnion(d250k_sd, unionType=1, verbose=True)

    # 250K region trees again, after restricting accessions with accs_to_keep.
    d250k_sd.remove_accessions(accs_to_keep, True)
    for start, stop in start_stop_list:
        snpsd = d250k_sd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_filtered_tree_" + str(
            start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)
    # The 250K file is parsed a second time to build the filtered full-data tree.
    d250k = dataParsers.parse_snp_data(d250k_file)
    d250k.filter_accessions(accs_to_keep, True)
    d250k.filter_monomorphic_snps()
    snps = d250k.getSnps(0.05)
    temp_d250k = snpsdata.RawSnpsData(snps=snps, accessions=d250k.accessions)
    tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_full_data_filtered_tree.pdf"
    ah.plot_tree(temp_d250k, tree_file, verbose=False)

    #Perlegen
    perlegen_file = "/Users/bjarnivilhjalmsson/Projects/Data/perlegen/perlegen_073009.csv"
    perlegen_sd = dataParsers.parse_snp_data_region(perlegen_file,
                                                    ref_chr,
                                                    3140000,
                                                    3220000,
                                                    id="perlegen_data")
    perlegen_sd._convert_to_tg_ecotypes_()
    perlegen_sd.filterMissingSnps(10)
    d2010_sd.mergeDataUnion(perlegen_sd, priority=2, unionType=1, verbose=True)
    seq_snpsd.mergeDataUnion(perlegen_sd,
                             priority=2,
                             unionType=1,
                             verbose=True)

    #250K, 2010, Perlegen TREE
    d2010_sd.filter_accessions_by_NAs(0.9)
    d2010_sd.filterMissingSnps(180)
    d2010_sd.filterMonoMorphicSnps()
    # NOTE(review): the regions below are taken from d250k_sd, although it is
    # d2010_sd (merged with 250K and Perlegen) that was just filtered - this
    # looks like it was meant to plot d2010_sd; confirm before changing.
    for start, stop in start_stop_list:
        snpsd = d250k_sd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_2010_perlegen_tree_" + str(
            start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)

    #250K, 2010, Sequences, Perlegen TREE
    seq_snpsd.filterMonoMorphicSnps()
    seq_snpsd.filter_accessions_by_NAs(0.9)
    seq_snpsd.filterMissingSnps(180)
    # Back to the narrower region list used for the sequence-based trees.
    start_stop_list = [(3170500, 3183000), (3172000, 3181000),
                       (3172000, 3175000), (3175000, 3178000),
                       (3178000, 3181000), (3176000, 3181000),
                       (intron_start, intron_stop)]
    for start, stop in start_stop_list:
        snpsd = seq_snpsd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/Seq_250k_2010_perlegen_tree_" + str(
            start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)
def map_phenotype(p_i, phed, snps_data_file, mapping_method, trans_method, p_dict):
	phenotype_name = phed.getPhenotypeName(p_i)
	phen_is_binary = phed.isBinary(p_i)
	file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.getPhenotypeName(p_i),
				mapping_method, trans_method, p_dict['remove_outliers'])
	result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)

	res = None
	sd = dataParsers.parse_snp_data(snps_data_file , format=p_dict['data_format'], filter=p_dict['debug_filter'])
	num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
	if p_dict['remove_outliers']:
		assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

	phen_vals = phed.getPhenVals(p_i)
	snps = sd.getSnps()
	if mapping_method in ['emmax']:
		#Load genotype file (in binary format)
		sys.stdout.write("Retrieving the Kinship matrix K.\n")
		sys.stdout.flush()
		k_file = env['data_dir'] + "kinship_matrix_cm" + str(p_dict['call_method_id']) + ".pickled"
		kinship_file = p_dict['kinship_file']
		if not kinship_file and os.path.isfile(k_file): #Check if corresponding call_method_file is available
			kinship_file = k_file
		if kinship_file:   #Kinship file was somehow supplied..
			print 'Loading supplied kinship'
			k = lm.load_kinship_from_file(kinship_file, sd.accessions)
		else:
			print "No kinship file was found.  Generating kinship file:", k_file
			sd = dataParsers.parse_snp_data(snps_data_file , format=p_dict['data_format'])
			snps = sd.getSnps()
			k_accessions = sd.accessions[:]
			if p_dict['debug_filter']:
				import random
				snps = random.sample(snps, int(p_dict['debug_filter'] * len(snps)))
			k = lm.calc_kinship(snps)
			f = open(k_file, 'w')
			cPickle.dump([k, sd.accessions], f)
			f.close()
			num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
			k = lm.filter_k_for_accessions(k, k_accessions, sd.accessions)
		sys.stdout.flush()
		sys.stdout.write("Done!\n")

	if p_dict['remove_outliers']:
		assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."


	#Check whether result already exists.
	if p_dict['use_existing_results']:
		print "\nChecking for existing results."
		result_file = file_prefix + ".pvals"
		if os.path.isfile(result_file):
			res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
			pvals = True
		else:
			result_file = file_prefix + ".scores"
			if os.path.isfile(result_file):
				res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
				pvals = False
		if res:
			print "Found existing results.. (%s)" % (result_file)
		sys.stdout.flush()


	if not res: #If results weren't found in a file... then do GWA.

		sys.stdout.write("Finished loading and handling data!\n")

		print "FIRST STEP: Applying %s to data. " % (mapping_method)
		sys.stdout.flush()
		kwargs = {}
		additional_columns = []
		if mapping_method in ['emmax']:
			res = lm.emmax(snps, phen_vals, k)
		elif mapping_method in ['lm']:
			res = lm.linear_model(snps, phen_vals)
		else:
			print "Mapping method", mapping_method, 'was not found.'
			sys.exit(2)

		if mapping_method in ['lm', 'emmax']:
			kwargs['genotype_var_perc'] = res['var_perc']
			betas = map(list, zip(*res['betas']))
			kwargs['beta0'] = betas[0]
			kwargs['beta1'] = betas[1]
			additional_columns.append('genotype_var_perc')
			additional_columns.append('beta0')
			additional_columns.append('beta1')
			pvals = res['ps']
			sys.stdout.write("Done!\n")
			sys.stdout.flush()



		kwargs['correlations'] = calc_correlations(snps, phen_vals)
		additional_columns.append('correlations')

		res = gwaResults.Result(scores=pvals, snps_data=sd, name=result_name, **kwargs)

		if mapping_method in ["emmax", 'lm']:
		 	result_file = file_prefix + ".pvals"
		else:
		 	result_file = file_prefix + ".scores"
		res.write_to_file(result_file, additional_columns)

		print "Generating a GW plot."
		sys.stdout.flush()
		png_file = file_prefix + "_gwa_plot.png"
		#png_file_max30 = file_prefix+"_gwa_plot_max30.png"
		if mapping_method in ['lm', "emmax"]:
			res.neg_log_trans()
			if mapping_method in ["kw", "ft"]:# or p_dict['data_format'] != 'binary':
				#res.plot_manhattan(png_file=png_file_max30,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", 
				#	       plot_bonferroni=True,max_score=30)
				res.plot_manhattan(png_file=png_file, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$",
					       plot_bonferroni=True)
			else:
				if res.filter_attr("mafs", p_dict['mac_threshold']) > 0:
					#res.plot_manhattan(png_file=png_file_max30,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", 
					#	       plot_bonferroni=True,max_score=30)				
					res.plot_manhattan(png_file=png_file, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$",
						       plot_bonferroni=True)
		else:
			pass

		print "plotting histogram"
		hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method, p_dict['remove_outliers'])
		hist_png_file = hist_file_prefix + "_hist.png"
		phed.plot_histogram(p_i, pngFile=hist_png_file)
	else:
		res.neg_log_trans()
		assert res.filter_attr("mafs", p_dict['mac_threshold']), 'All SNPs have MAC smaller than threshold'


	print "SECOND STEP:"
	res.filter_top_snps(p_dict['second_step_number'])
	snps = res.snps
	positions = res.positions
	chromosomes = res.chromosomes
	#Checking res_file exists
	file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.getPhenotypeName(p_i),
				mapping_method, trans_method, p_dict['remove_outliers'], p_dict['second_step_number'])
	res_file = file_prefix + '_res.cpickled'
	if p_dict['use_existing_results'] and os.path.isfile(res_file):
			print 'Found existing results for the second step... loading.'
			f = open(res_file, 'rb')
			second_res = cPickle.load(f)
			f.close()
	else:
		if mapping_method == 'lm':
			second_res = lm.linear_model_two_snps(snps, phen_vals)
		if mapping_method == 'emmax':
			second_res = lm.emmax_two_snps(snps, phen_vals, k)

		#Pickling results..
		print 'Saving results as pickled file:', res_file
		f = open(res_file, 'wb')
		cPickle.dump(second_res, f, protocol=2)
		f.close()



	#Plotting second step plots:
	score_array = -sp.log10(second_res['ps'])
	p3_score_array = -sp.log10(second_res['p3_ps'])
	p4_score_array = -sp.log10(second_res['p4_ps'])
	import plotResults as pr
	pr.plot_snp_pair_result(chromosomes, positions, score_array, file_prefix + '_scatter')
	pr.plot_snp_pair_result(chromosomes, positions, p3_score_array, file_prefix + '_p3_scatter')
	pr.plot_snp_pair_result(chromosomes, positions, p4_score_array, file_prefix + '_p4_scatter')



	if p_dict['region_plots']:
		import regionPlotter as rp
		regions_results = res.get_top_region_results(p_dict['region_plots'])
		plotter = rp.RegionPlotter()
		print "Starting region plots..."
		for reg_res in regions_results:
			chromosome = reg_res.chromosomes[0]
			caption = phenotype_name + "_c" + str(chromosome) + "_" + mapping_method
			png_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
				+ "_e" + str(reg_res.positions[-1]) + ".png"
			tair_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
				+ "_e" + str(reg_res.positions[-1]) + "_tair_info.txt"
			plotter.plot_small_result([reg_res], png_file=png_file, highlight_gene_ids=tair_ids,
						  caption=caption, tair_file=tair_file)

			#Plot Box-plot
			png_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
				+ "_e" + str(reg_res.positions[-1]) + "_box_plot.png"
			(marker, score, chromosome, pos) = reg_res.get_max_snp()
			marker_accessions = sd.accessions
			phed.plot_marker_box_plot(p_i, marker=marker, marker_accessions=marker_accessions, \
						png_file=png_file, title="c" + str(chromosome) + "_p" + str(pos), \
						marker_score=score, marker_missing_val=sd.missing_val)