Example #1
def load_phentoype_file_nc_resistance():
	import phenotypeData as pd
	filename = "/Users/bjarnivilhjalmsson/Summary_results_330Arabidopsis_accessions.csv"
	f = open(filename, "r")
	line = map(str.strip, f.next().split(','))  # header row
	phenotype_names = line[-2:]  # the last two columns hold the phenotype values
	print phenotype_names
	phenotypes = []
	accession_names = []
	full_accession_names = []
	for l in f:
		line = map(str.strip, l.split(','))
		accession_names.append(line[0].lower())
		full_accession_names.append(line[2].lower())
	f.close()
	print accession_names
	acc_dict = pd.get_accession_to_ecotype_id_dict(accession_names)#+["n13","kno-10","kno-10","shahdara","nd-1"])
#	acc_dict["cibc-5"] = 6908
#	acc_dict["wa-1"] = 6978
#	acc_dict["gu-0"] = 7149
#	acc_dict['Rubezhnoe-1'] = 7323
	print len(acc_dict), acc_dict
	import env
	d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv.binary"
	import dataParsers
	d250k_sd = dataParsers.parse_binary_snp_data(d250k_file)
	ecotypes = []
	key_file = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/unique_id_to_ecotype_id.csv"
	f = open(key_file, "w")
	f.write("unique_id, accession_name, ecotype_id, in_250k_data\n")
	for acc, acc_id in zip(accession_names, full_accession_names):
		if not acc in acc_dict or acc_id == 'karl27' or acc_id == 'karl05':
			print "(%s, %s) is missing" % (acc, acc_id)
		else:
			ecotype = acc_dict[acc]
			ecotypes.append(ecotype)
			f.write("%s,%s,%s,%s\n" % (acc_id, acc, str(ecotype), str(str(ecotype) in d250k_sd.accessions)))

	#phenotype_names = reader.next()[2:]
	phenotypes = []	#[acc_id][phenotype_name]
	f = open(filename, "r")

	for l in f:
		line = map(str.strip, l.split(','))
		if line[0].lower() in acc_dict:
			phen_vals = []
			for pv in line[-2:]:
				if pv == "NA":
					pv = 'NA'
				else:
					pv = float(pv)
				phen_vals.append(pv)
			phenotypes.append(phen_vals)
		else:
			print "Missing:", line[0]

	phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
	phed.insert_into_DB(growth_condition='Field', biology_category_id='2')
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/nc14_resistance_091610.tsv", delimiter='\t')
Example #2
def load_FLC_phenotypes():
    """
    Load the new 01/12/10 phenotype file.
    """
    import csv
    import phenotypeData as pd
    filename = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/phenotype_data_011710.csv"
    print "Loading phenotype/accession file:", filename
    f = open(filename, "r")
    reader = csv.reader(f)
    phenotype_names = reader.next()[3:]
    #print phenotype_names
    accession_names = []
    accession_ID = []
    flc_id_to_ecotype_map = {}
    phenotypes = [[] for i in range(len(phenotype_names))]  # [phenotype_name][acc_id]
    for row in reader:
        accession_names.append(row[2].lower())
        accession_ID.append(row[0])
        for i, phen_val in enumerate(row[3:]):
            try:
                p_val = float(phen_val)
            except Exception:
                p_val = "NA"
            #print p_val
            phenotypes[i].append(p_val)
    f.close()
    #print accession_names
    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)
    #acc_dict["cibc-5"] = 6908
    #acc_dict["pla-0"] = 8357
    #print acc_dict
    #accession_names.sort()
    new_phenotypes = [[] for i in range(len(phenotype_names))]
    ecotypes = []
    for acc in acc_dict:
        acc_i = accession_names.index(acc)
        ecotype = acc_dict[acc]
        flc_id_to_ecotype_map[accession_ID[acc_i]] = ecotype
        ecotypes.append(ecotype)
        for i in range(len(phenotype_names)):
            new_phenotypes[i].append(phenotypes[i][acc_i])
    #print new_phenotypes
    #print len(ecotypes)
    #return {"phenotypes":new_phenotypes,"phenotype_names":phenotype_names, "ecotypes":ecotypes}
    phenotypes = map(list, zip(*new_phenotypes))
    phend = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phend.writeToFile("/tmp/FLC_phenotypes_011710.tsv", delimiter="\t")
    flc_id_to_ecotype_map['Col'] = 6909
    flc_id_to_ecotype_map['Lov1'] = 6043
    flc_id_to_ecotype_map['Ull-2-5'] = 6974
    flc_id_to_ecotype_map['Var-2-6-Part'] = 7517
    print flc_id_to_ecotype_map
    return phend, flc_id_to_ecotype_map
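
A minimal usage sketch (an assumption, not part of the original module): the loader takes no arguments, reads its hard-coded path, and returns the PhenotypeData object together with the FLC-id-to-ecotype-id map.

# Hypothetical call; the input path is hard-coded inside load_FLC_phenotypes().
phed, flc_id_to_ecotype_map = load_FLC_phenotypes()
print len(flc_id_to_ecotype_map)  # number of FLC ids mapped to ecotype ids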
Example #3
def load_phentoype_file_nc_resistance_2():
    import phenotypeData as pd
    filename = "/Users/bjarnivilhjalmsson/Projects/Albugo_laibachii_nc14.csv"
    f = open(filename, "r")
    line = map(str.strip, f.next().split(','))  # header row
    phenotype_names = line[-1:]  # the last column holds the phenotype
    print phenotype_names
    phenotypes = []
    accession_names = []
    for l in f:
        line = map(str.strip, l.split(','))
        accession_names.append(line[0].lower())
    f.close()
    print accession_names
    acc_dict = pd.get_accession_to_ecotype_id_dict(
        accession_names)  #+["n13","kno-10","kno-10","shahdara","nd-1"])
    #	acc_dict["cibc-5"] = 6908
    #	acc_dict["wa-1"] = 6978
    #	acc_dict["gu-0"] = 7149
    #	acc_dict['Rubezhnoe-1'] = 7323
    print len(acc_dict), acc_dict
    import env
    d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv.binary"
    import dataParsers
    d250k_sd = dataParsers.parse_binary_snp_data(d250k_file)
    ecotypes = []
    for acc in accession_names:
        if not acc in acc_dict:
            print "%s is missing" % (acc)
        else:
            ecotype = acc_dict[acc]
            ecotypes.append(ecotype)

    phenotypes = []  #[acc_id][phenotype_name]
    f = open(filename, "r")

    for l in f:
        line = map(str.strip, l.split(','))
        if line[0].lower() in acc_dict:
            phen_vals = []
            for pv in line[-1:]:
                if pv == "NA":
                    pv = 'NA'
                else:
                    pv = float(pv)
                phen_vals.append(pv)
            phenotypes.append(phen_vals)
        else:
            print "Missing:", line[0]

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.insert_into_DB(growth_condition='Field', biology_category_id='2')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/nc14_resistance_96accessions_092810.tsv", \
      delimiter='\t')
Example #4
def load_phentoype_file(filename):
    """
    Load an FLC-type phenotype data file.
    """
    import csv
    import phenotypeData as pd
    print "Loading phenotype file:", filename
    f = open(filename, "r")
    reader = csv.reader(f)
    phenotype_names = reader.next()[3:]
    #print phenotype_names
    accession_names = []
    accession_ID = []
    phenotypes = [[] for i in range(len(phenotype_names))]  # [phenotype_name][acc_id]
    for row in reader:
        accession_names.append(row[2].lower())
        accession_ID.append(row[0])
        for i, phen_val in enumerate(row[3:]):
            try:
                p_val = float(phen_val)
            except Exception:
                p_val = "NA"
            #print p_val
            phenotypes[i].append(p_val)
    f.close()
    #print accession_names
    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)
    acc_dict["cibc-5"] = 6908
    acc_dict["pla-0"] = 8357
    #print acc_dict
    #accession_names.sort()
    new_phenotypes = [[] for i in range(len(phenotype_names))]
    ecotypes = []
    for acc in acc_dict:
        acc_i = accession_names.index(acc)
        ecotypes.append(acc_dict[acc])
        for i in range(len(phenotype_names)):
            new_phenotypes[i].append(phenotypes[i][acc_i])
    #print new_phenotypes
    #print len(ecotypes)
    #return {"phenotypes":new_phenotypes,"phenotype_names":phenotype_names, "ecotypes":ecotypes}
    phenotypes = map(list, zip(*new_phenotypes))
    phend = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phend.writeToFile("/tmp/FLC_phenotypes_102809.tsv", delimiter="\t")
    return phend
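
A minimal usage sketch (hypothetical; the path below is the FLC file used elsewhere in this module): the function expects a CSV whose first column is an accession ID, third column an accession name, and columns four onward the phenotype values.

# Hypothetical call; load_phentoype_file() also writes a TSV copy to /tmp/FLC_phenotypes_102809.tsv.
phed = load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/phenotype_data_011710.csv")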
Example #5
def _testMargarita_(rho = 1, theta = 1, numIndivid = 10, numDatasets = 100, nseg_sites = 100):
	import os, tempfile
	(fId, tempDataFile) = tempfile.mkstemp()
	os.close(fId)

	#"""
	# Generate data using ms.
	msCommand = "~/Projects/programs/ms/ms %s %s -s %s -t %s -r %s %s > %s" % \
		(numIndivid, numDatasets, nseg_sites, theta, rho, nseg_sites, tempDataFile)
	print msCommand
	os.system(msCommand)
		

	import dataParsers

	# Parse datasets
	print "parsing ms dataset"
	snpsds = dataParsers.parseMSFile(tempDataFile)
	#"""

	"""
	snpsds = _generateBinomData_()
	"""
	print len(snpsds)
	accessions = range(0, numIndivid)
	for snpsd in snpsds:
		snpsd.accessions = accessions
	print snpsds[0].accessions

	

	import phenotypeData
	#FIXME phenotypeData
	phenValues = []
	for i in range(0, numIndivid / 2):
		phenValues.append([0])
	for i in range(0, numIndivid - numIndivid / 2):
		phenValues.append([1])
		
	print phenValues
	phed = phenotypeData.PhenotypeData(accessions, ["test_phenotype"], phenValues)
	

	(fId, tempMargFile) = tempfile.mkstemp()
	os.close(fId)
	(fId, tempOutFile) = tempfile.mkstemp()
	os.close(fId)

	marg = Margarita(tempMargFile, tempOutFile, 30, nseg_sites, 20000, 20)

	minPvalList = []
	for snpsd in snpsds:	# for all ms datasets 
		#minPval = 1.0
		#while minPval == 1.0: 
		(a, b, permPvals) = marg.gwa(snpsd, phed, phenotype = 0, binary = True)
		pvals = []
		for pval in permPvals:
			pvals.append(float(pval))
		print pvals
		minPval = min(pvals)
		minPvalList.append(minPval)
	print minPvalList
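
A hedged invocation sketch (parameter values are illustrative only): the test requires Hudson's ms simulator at the hard-coded ~/Projects/programs/ms/ms path and the Margarita wrapper defined in this module.

# Hypothetical parameter values; the defaults are rho=1, theta=1, numIndivid=10, numDatasets=100, nseg_sites=100.
_testMargarita_(rho=5, theta=2, numIndivid=20, numDatasets=50, nseg_sites=100)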
Example #6
def load_phentoype_file_Pecinka():
	import csv
	import phenotypeData as pd
	accession_file = "/Users/bjarnivilhjalmsson/Projects/Ales_Pecinka/NatVar-AP-2010-Feb.csv"
	f = open(accession_file, "r")
	reader = csv.reader(f)
	accession_names = []
	accession_ID = []
	for row in reader:
		accession_names.append(row[1].split()[0].lower())
		accession_ID.append("CS" + row[0][1:])
	f.close()
	print accession_names
	acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names + ["n13", "kno-10", "kno-10", "shahdara", "nd-1"])
	acc_dict["cibc-5"] = 6908
	acc_dict["wa-1"] = 6978
	acc_dict["gu-0"] = 6922
	acc_dict["cs22491"] = acc_dict["n13"]
	acc_dict["knox-10"] = acc_dict["kno-10"]
	acc_dict["knox-18"] = acc_dict["kno-10"]
	acc_dict["shakdara"] = acc_dict["shahdara"]
	acc_dict["wd-1"] = acc_dict["nd-1"]
	print acc_dict


	filename = "/Users/bjarnivilhjalmsson/Projects/Ales_Pecinka/NatVar-AP-2010-Feb_phen.csv"
	#phenotype_names = reader.next()[2:]
	phenotype_names = ["Absolute_root_growth", "Absolute_root_growth_sd", "Percentage_of_root_elongation", "Percentage_of_bent roots",
			   "Percentage_of_dead_plants", "Percentage_of_unaffected_plants", "Percentage_of_average_survival"]
	phenotype_indices = [1, 2, 5, 8, 11, 14, 17]
	phenotype_ecotypes = [0, 0, 4, 7, 10, 13, 16]
	print phenotype_names
	ecotype_ids = [[] for i in range(len(phenotype_names))]
	phenotypes = [[] for i in range(len(phenotype_names))]	#[phenotype_name][acc_id]
	f = open(filename, "r")
	reader = csv.reader(f)
	new_ecotype_ids = set()
	for row in reader:
		print row
		for i, (pi, ei) in enumerate(zip(phenotype_indices, phenotype_ecotypes)):
			if row[ei] != "":
				acc_name = (row[ei].split()[0]).lower()
				if acc_name in acc_dict:
					eid = acc_dict[(row[ei].split()[0]).lower()]
					new_ecotype_ids.add(eid)
					pv = float(row[pi])
					ecotype_ids[i].append(eid)
					phenotypes[i].append(pv)
				else:
					print "Wrong accession name?", acc_name


	new_phenotypes = []
	new_ecotype_ids = list(new_ecotype_ids)
	for i, phen_vals in enumerate(phenotypes):
		new_phen_vals = []
		for ei in new_ecotype_ids:
			if ei in ecotype_ids[i]:
				j = ecotype_ids[i].index(ei)
				new_phen_vals.append(phen_vals[j])
			else:
				new_phen_vals.append('NA')
		new_phenotypes.append(new_phen_vals)
	phenotypes = map(list, zip(*new_phenotypes))
	ecotypes = map(str, new_ecotype_ids)
	phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Ales_Pecinka/phen_pecinka_170310.tsv", delimiter='\t')
Example #7
def load_phentoype_file_bergelsson():
	import env
	import csv
	import phenotypeData as pd
	filename = "/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/bergelsson_rosette_glucs.csv"
	f = open(filename, "r")
	reader = csv.reader(f)
	phenotype_names = reader.next()[2:]
	for i in range(len(phenotype_names)):
		phenotype_names[i] = phenotype_names[i].replace(" ", "_")
		phenotype_names[i] = 'jb_' + phenotype_names[i]
	print phenotype_names
	accession_names = []
	accession_ID = []
	for row in reader:
		accession_names.append(row[0].split()[0].lower())
		accession_ID.append(row[1])
	f.close()
	print accession_names
	#acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)#+["n13","kno-10","kno-10","shahdara","nd-1"])
	e_info_dict = pd._getEcotypeIdInfoDict_()
	ei_2_tgei = pd._getEcotype2TgEcotypeDict_()
	#print len(acc_dict),acc_dict
	ecotypes = []
	uncertain_list = []
	for acc, acc_id in zip(accession_names, accession_ID):
		#if not acc in acc_dict:
		if not int(acc_id) in ei_2_tgei:
			print "(%s, %s) is missing in dictionary" % (acc, acc_id)
			a_id = int(acc_id)
			if a_id in e_info_dict:
				e_info = e_info_dict[a_id]
				print "Guessing that it's:", e_info
			else:
				print "No good guess for it.  Look it up!!\n"
			#acc_dict[acc] = acc_id
			ecotypes.append(acc_id)
		else:
			#ecotype = acc_dict[acc]
			ecotype = ei_2_tgei[int(acc_id)]
			ecotypes.append(ecotype)
	phenotype_indices = range(2, len(phenotype_names) + 2)
	phenotypes = []	#[acc_id][phenotype_name]
	f = open(filename, "r")
	reader = csv.reader(f)
	reader.next()

	print len(set(accession_ID)), len(set(ecotypes))

	for row in reader:
		#print row
		#if row[0].split()[0].lower() in acc_dict:
		phen_vals = []
		for pv in row[2:]:
			if pv == "" or pv == 'NA':
				pv = 'NA'
			else:
				pv = float(pv)
			phen_vals.append(pv)
		if len(phen_vals) != len(phenotype_names):
			import pdb
			pdb.set_trace()
		phenotypes.append(phen_vals)
		#else:
		#	print "Missing:", row[0]


	phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.tsv", delimiter='\t')
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.csv", delimiter=',')
Example #8
def load_phentoype_file_wilczek():
	import csv
	import phenotypeData as pd
	filename = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/PhenotypeDataWilczek.csv"
	f = open(filename, "r")
	reader = csv.reader(f)
	phenotype_names = reader.next()[2:]
	for i in range(len(phenotype_names)):
		phenotype_names[i] = phenotype_names[i].replace(" ", "_")
	print phenotype_names
	accession_names = []
	accession_ID = []
	for row in reader:
		accession_names.append(row[1].split()[0].lower())
		accession_ID.append(row[0])
	f.close()
	print accession_names
	acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)#+["n13","kno-10","kno-10","shahdara","nd-1"])
	acc_dict["cibc-5"] = 6908
	acc_dict["wa-1"] = 6978
	acc_dict["gu-0"] = 7149
	acc_dict['Rubezhnoe-1'] = 7323
	print len(acc_dict), acc_dict
	import env
	d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv"
	import dataParsers
	d250k_sd = dataParsers.parse_snp_data(d250k_file)
	ecotypes = []
	key_file = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/unique_id_to_ecotype_id.csv"
	f = open(key_file, "w")
	f.write("unique_id, accession_name, ecotype_id, in_250k_data\n")
	for acc, acc_id in zip(accession_names, accession_ID):
		if not acc in acc_dict or acc_id == 'karl27' or acc_id == 'karl05':
			print "(%s, %s) is missing" % (acc, acc_id)
		else:
			ecotype = acc_dict[acc]
			ecotypes.append(ecotype)
			f.write("%s,%s,%s,%s\n" % (acc_id, acc, str(ecotype), str(str(ecotype) in d250k_sd.accessions)))
	f.close()

	#phenotype_names = reader.next()[2:]
	phenotype_indices = range(2, len(phenotype_names) + 2)
	phenotypes = []	#[acc_id][phenotype_name]
	f = open(filename, "r")
	reader = csv.reader(f)
	reader.next()

	for row in reader:
		#print row
		if row[1].split()[0].lower() in acc_dict:
			phen_vals = []
			for pv in row[2:]:
				if pv == "":
					pv = 'NA'
				else:
					pv = float(pv)
				phen_vals.append(pv)
			phenotypes.append(phen_vals)
		else:
			print "Missing:", row[1]

	phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.tsv", delimiter='\t')
	phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.csv", delimiter=',')