def load_phentoype_file_nc_resistance():
    """
    Load the NC14 resistance phenotypes from the 330 accession summary file.

    The last two columns of the comma separated file are used as phenotypes,
    column 0 holds the accession name and column 2 the unique accession id.
    Accession names are mapped to ecotype ids with
    pd.get_accession_to_ecotype_id_dict.  For every accession that is kept, a
    line is written to the key file, including whether the ecotype id is
    found in the accessions of the 250k SNP data.  The phenotypes are finally
    inserted into the DB and written to a tab delimited file.

    Each data row is handled exactly once, so the ecotype list and the
    phenotype rows handed to pd.PhenotypeData are always of equal length and
    in the same order, and the header row is never treated as a data row.
    Accessions without an ecotype id (and the ids 'karl27' and 'karl05') are
    reported and left out of both lists.
    """
    import env
    import dataParsers

    filename = "/Users/bjarnivilhjalmsson/Summary_results_330Arabidopsis_accessions.csv"
    key_file = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/unique_id_to_ecotype_id.csv"

    # Read everything up front, so that the file is closed right away.
    with open(filename, "r") as f:
        header = [field.strip() for field in next(f).split(',')]
        rows = [[field.strip() for field in l.split(',')] for l in f]

    phenotype_names = header[-2:]
    print(phenotype_names)
    accession_names = [row[0].lower() for row in rows]
    full_accession_names = [row[2].lower() for row in rows]
    print(accession_names)

    acc_dict = pd.get_accession_to_ecotype_id_dict(accession_names)
    print("%s %s" % (len(acc_dict), acc_dict))

    d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv.binary"
    d250k_sd = dataParsers.parse_binary_snp_data(d250k_file)

    ecotypes = []
    phenotypes = []  # [acc_id][phenotype_name]
    with open(key_file, "w") as key_f:
        key_f.write("unique_id, accession_name, ecotype_id, in_250k_data\n")
        for row, acc, acc_id in zip(rows, accession_names, full_accession_names):
            if acc not in acc_dict or acc_id in ('karl27', 'karl05'):
                print("(%s, %s) is missing" % (acc, acc_id))
                continue
            ecotype = acc_dict[acc]
            ecotypes.append(ecotype)
            # "NA" marks a missing value and is passed on unchanged.
            phenotypes.append([pv if pv == "NA" else float(pv) for pv in row[-2:]])
            in_250k_data = str(ecotype) in d250k_sd.accessions
            key_f.write("%s,%s,%s,%s\n" % (acc_id, acc, str(ecotype), str(in_250k_data)))

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.insert_into_DB(growth_condition='Field', biology_category_id='2')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/nc14_resistance_091610.tsv", delimiter='\t')
def load_FLC_phenotypes():
    """
    Load the new 01/12/10 phenotype file.

    Column 0 of the csv file holds the FLC accession id, column 2 the
    accession name and the remaining columns (3 and up) the phenotype values.
    Values which can not be parsed as floats are stored as "NA".  The
    phenotypes of all accessions for which
    pd._getAccessionToEcotypeIdDict_ returns an ecotype id are written to
    /tmp/FLC_phenotypes_011710.tsv.

    Returns a tuple (phend, flc_id_to_ecotype_map), where phend is the
    pd.PhenotypeData object and flc_id_to_ecotype_map maps the FLC accession
    ids to ecotype ids (including four hard coded entries).
    """
    import csv
    import phenotypeData as pd

    filename = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/phenotype_data_011710.csv"
    print("Loading phenotype/accession file: %s" % (filename,))

    accession_names = []
    accession_ID = []
    with open(filename, "r") as f:
        reader = csv.reader(f)
        phenotype_names = next(reader)[3:]
        phenotypes = [[] for _ in phenotype_names]  # [phenotype_name][acc_id]
        for row in reader:
            accession_names.append(row[2].lower())
            accession_ID.append(row[0])
            for i, phen_val in enumerate(row[3:]):
                try:
                    p_val = float(phen_val)
                except ValueError:
                    # Not a number (e.g. an empty field), i.e. a missing value.
                    p_val = "NA"
                phenotypes[i].append(p_val)

    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)

    # Position of the first row of each accession name (the row that
    # accession_names.index() would find), built once for constant time lookups.
    first_index = {}
    for row_i, name in enumerate(accession_names):
        first_index.setdefault(name, row_i)

    flc_id_to_ecotype_map = {}
    new_phenotypes = [[] for _ in phenotype_names]
    ecotypes = []
    for acc in acc_dict:
        if acc not in first_index:
            print("Accession %s is not in the phenotype file, skipping it." % (acc,))
            continue
        acc_i = first_index[acc]
        ecotype = acc_dict[acc]
        flc_id_to_ecotype_map[accession_ID[acc_i]] = ecotype
        ecotypes.append(ecotype)
        for values, new_values in zip(phenotypes, new_phenotypes):
            new_values.append(values[acc_i])

    # Transpose from [phenotype_name][acc_id] to [acc_id][phenotype_name].
    phenotypes = [list(phen_vals) for phen_vals in zip(*new_phenotypes)]
    phend = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phend.writeToFile("/tmp/FLC_phenotypes_011710.tsv", delimiter="\t")

    # Hard coded FLC id to ecotype id entries.
    flc_id_to_ecotype_map['Col'] = 6909
    flc_id_to_ecotype_map['Lov1'] = 6043
    flc_id_to_ecotype_map['Ull-2-5'] = 6974
    flc_id_to_ecotype_map['Var-2-6-Part'] = 7517
    print(flc_id_to_ecotype_map)
    return phend, flc_id_to_ecotype_map
def load_phentoype_file_nc_resistance_2():
    """
    Load the Albugo laibachii NC14 phenotype from the 96 accession file.

    The last column of the comma separated file is used as the phenotype and
    column 0 holds the accession name.  Accession names are mapped to ecotype
    ids with pd.get_accession_to_ecotype_id_dict; accessions without an
    ecotype id are reported and left out.  The phenotypes are inserted into
    the DB and written to a tab delimited file.

    Each data row is handled exactly once, so the ecotype list and the
    phenotype rows handed to pd.PhenotypeData are always of equal length and
    in the same order, and the header row is never treated as a data row.
    """
    filename = "/Users/bjarnivilhjalmsson/Projects/Albugo_laibachii_nc14.csv"

    # Read everything up front, so that the file is closed right away.
    with open(filename, "r") as f:
        header = [field.strip() for field in next(f).split(',')]
        rows = [[field.strip() for field in l.split(',')] for l in f]

    phenotype_names = header[-1:]
    print(phenotype_names)
    accession_names = [row[0].lower() for row in rows]
    print(accession_names)

    acc_dict = pd.get_accession_to_ecotype_id_dict(accession_names)
    print("%s %s" % (len(acc_dict), acc_dict))

    ecotypes = []
    phenotypes = []  # [acc_id][phenotype_name]
    for row, acc in zip(rows, accession_names):
        if acc not in acc_dict:
            print("%s is missing" % (acc,))
            continue
        ecotypes.append(acc_dict[acc])
        # "NA" marks a missing value and is passed on unchanged.
        phenotypes.append([pv if pv == "NA" else float(pv) for pv in row[-1:]])

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.insert_into_DB(growth_condition='Field', biology_category_id='2')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/nc14_resistance_96accessions_092810.tsv",
                     delimiter='\t')
def load_phentoype_file(filename):
    """
    Load a FLC type phenotype data file.

    Column 2 of the csv file holds the accession name and the remaining
    columns (3 and up) the phenotype values.  Values which can not be parsed
    as floats are stored as "NA".  Accession names are mapped to ecotype ids
    with pd._getAccessionToEcotypeIdDict_, with hard coded ids for "cibc-5"
    and "pla-0".  Accessions with an ecotype id which do not occur in the
    given file (e.g. the hard coded ones) are reported and skipped.

    The phenotypes are written to /tmp/FLC_phenotypes_102809.tsv and the
    pd.PhenotypeData object is returned.
    """
    print("Loading phenotype file: %s" % (filename,))

    accession_names = []
    with open(filename, "r") as f:
        reader = csv.reader(f)
        phenotype_names = next(reader)[3:]
        phenotypes = [[] for _ in phenotype_names]  # [phenotype_name][acc_id]
        for row in reader:
            accession_names.append(row[2].lower())
            for i, phen_val in enumerate(row[3:]):
                try:
                    p_val = float(phen_val)
                except ValueError:
                    # Not a number (e.g. an empty field), i.e. a missing value.
                    p_val = "NA"
                phenotypes[i].append(p_val)

    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)
    acc_dict["cibc-5"] = 6908
    acc_dict["pla-0"] = 8357

    # Position of the first row of each accession name (the row that
    # accession_names.index() would find), built once for constant time lookups.
    first_index = {}
    for row_i, name in enumerate(accession_names):
        first_index.setdefault(name, row_i)

    new_phenotypes = [[] for _ in phenotype_names]
    ecotypes = []
    for acc in acc_dict:
        if acc not in first_index:
            # Without a row in the file there are no phenotype values to use.
            print("Accession %s is not in the phenotype file, skipping it." % (acc,))
            continue
        acc_i = first_index[acc]
        ecotypes.append(acc_dict[acc])
        for values, new_values in zip(phenotypes, new_phenotypes):
            new_values.append(values[acc_i])

    # Transpose from [phenotype_name][acc_id] to [acc_id][phenotype_name].
    phenotypes = [list(phen_vals) for phen_vals in zip(*new_phenotypes)]
    phend = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phend.writeToFile("/tmp/FLC_phenotypes_102809.tsv", delimiter="\t")
    return phend
def _testMargarita_(rho = 1, theta = 1, numIndivid = 10, numDatasets = 100, nseg_sites = 100):
    """
    Run Margarita on datasets simulated with ms and print the smallest p-value of each.

    The ms program is called through the shell to generate numDatasets
    datasets of numIndivid individuals (with the given rho, theta and
    nseg_sites).  The first numIndivid/2 (rounded down) individuals get the
    phenotype value 0 and the rest the value 1.  For every dataset
    Margarita.gwa is run, the resulting p-values are printed, and at the end
    the list of minimum p-values (one per dataset) is printed.

    All temporary files created here are removed again before returning, also
    if an error occurs.  Nothing is returned.
    """
    import os, tempfile
    import dataParsers
    import phenotypeData #FIXME phenotypeData

    temp_files = []
    try:
        # mkstemp leaves it to the caller to delete the files, hence the
        # bookkeeping and the clean-up in the finally clause below.
        for _ in range(3):
            (fId, temp_path) = tempfile.mkstemp()
            os.close(fId)
            temp_files.append(temp_path)
        (tempDataFile, tempMargFile, tempOutFile) = temp_files

        # Generate data using ms.
        msCommand = "~/Projects/programs/ms/ms %s %s -s %s -t %s -r %s %s > %s" % (
            numIndivid, numDatasets, nseg_sites, theta, rho, nseg_sites, tempDataFile)
        print(msCommand)
        os.system(msCommand)

        # Parse datasets
        print("parsing ms dataset")
        snpsds = dataParsers.parseMSFile(tempDataFile)
        # Alternative data source: snpsds = _generateBinomData_()
        print(len(snpsds))

        accessions = list(range(0, numIndivid))
        for snpsd in snpsds:
            snpsd.accessions = accessions
        print(snpsds[0].accessions)

        # Floor division, so that odd numbers of individuals work as well.
        num_zeros = numIndivid // 2
        phenValues = [[0] for _ in range(num_zeros)]
        phenValues += [[1] for _ in range(numIndivid - num_zeros)]
        print(phenValues)
        phed = phenotypeData.PhenotypeData(accessions, ["test_phenotype"], phenValues)

        marg = Margarita(tempMargFile, tempOutFile, 30, nseg_sites, 20000, 20)
        minPvalList = []
        for snpsd in snpsds:  # for all ms datasets
            (a, b, permPvals) = marg.gwa(snpsd, phed, phenotype = 0, binary = True)
            pvals = [float(pval) for pval in permPvals]
            print(pvals)
            minPvalList.append(min(pvals))
        print(minPvalList)
    finally:
        for temp_path in temp_files:
            # The file may already be gone, in which case there is nothing to do.
            if os.path.exists(temp_path):
                os.remove(temp_path)
def load_phentoype_file_Pecinka():
    """
    Load the Pecinka natural variation phenotypes and write them to a .tsv file.

    The accession names (first word of column 1 of the accession file, in
    lower case) are mapped to ecotype ids with
    pd._getAccessionToEcotypeIdDict_, together with a number of hard coded
    ids and name aliases.

    In the phenotype file every phenotype has its own value column
    (phenotype_indices) and its own accession name column
    (phenotype_ecotypes).  The values are collected per phenotype and then
    arranged in one row per ecotype id, with 'NA' for the phenotypes which
    have no value for that ecotype.  If an ecotype has several values for a
    phenotype, the first one is used.
    """
    accession_file = "/Users/bjarnivilhjalmsson/Projects/Ales_Pecinka/NatVar-AP-2010-Feb.csv"
    with open(accession_file, "r") as f:
        accession_names = [row[1].split()[0].lower() for row in csv.reader(f)]
    print(accession_names)

    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names + ["n13", "kno-10", "kno-10", "shahdara", "nd-1"])
    acc_dict["cibc-5"] = 6908
    acc_dict["wa-1"] = 6978
    # NOTE(review): load_phentoype_file_wilczek uses 7149 for "gu-0" - confirm which id is right.
    acc_dict["gu-0"] = 6922
    # Names used in the phenotype file which differ from the looked up names.
    acc_dict["cs22491"] = acc_dict["n13"]
    acc_dict["knox-10"] = acc_dict["kno-10"]
    # NOTE(review): "knox-18" gets the id of "kno-10" - confirm that this is intended.
    acc_dict["knox-18"] = acc_dict["kno-10"]
    acc_dict["shakdara"] = acc_dict["shahdara"]
    acc_dict["wd-1"] = acc_dict["nd-1"]
    print(acc_dict)

    filename = "/Users/bjarnivilhjalmsson/Projects/Ales_Pecinka/NatVar-AP-2010-Feb_phen.csv"
    phenotype_names = ["Absolute_root_growth", "Absolute_root_growth_sd", "Percentage_of_root_elongation",
                       "Percentage_of_bent roots", "Percentage_of_dead_plants",
                       "Percentage_of_unaffected_plants", "Percentage_of_average_survival"]
    phenotype_indices = [1, 2, 5, 8, 11, 14, 17]    # Columns holding the phenotype values
    phenotype_ecotypes = [0, 0, 4, 7, 10, 13, 16]   # Columns holding the matching accession names
    print(phenotype_names)

    ecotype_ids = [[] for _ in phenotype_names]
    phenotypes = [[] for _ in phenotype_names]  # [phenotype_name][acc_id]
    new_ecotype_ids = set()
    with open(filename, "r") as f:
        for row in csv.reader(f):
            print(row)
            for i, (pi, ei) in enumerate(zip(phenotype_indices, phenotype_ecotypes)):
                if row[ei] == "":
                    continue
                acc_name = row[ei].split()[0].lower()
                if acc_name not in acc_dict:
                    print("Wrong accession name? %s" % (acc_name,))
                    continue
                eid = acc_dict[acc_name]
                new_ecotype_ids.add(eid)
                pv = float(row[pi])
                ecotype_ids[i].append(eid)
                phenotypes[i].append(pv)

    new_ecotype_ids = list(new_ecotype_ids)
    new_phenotypes = []
    for eids, phen_vals in zip(ecotype_ids, phenotypes):
        # First position of every ecotype id for this phenotype.
        first_pos = {}
        for j, eid in enumerate(eids):
            first_pos.setdefault(eid, j)
        new_phenotypes.append([phen_vals[first_pos[eid]] if eid in first_pos else 'NA'
                               for eid in new_ecotype_ids])

    # Transpose from [phenotype_name][acc_id] to [acc_id][phenotype_name].
    phenotypes = [list(phen_vals) for phen_vals in zip(*new_phenotypes)]
    ecotypes = [str(eid) for eid in new_ecotype_ids]
    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Ales_Pecinka/phen_pecinka_170310.tsv", delimiter='\t')
def load_phentoype_file_bergelsson():
    """
    Load the Bergelsson rosette glucosinolate phenotypes and write them to file.

    Column 0 of the csv file holds the accession name, column 1 the accession
    id and the remaining columns the phenotype values.  The phenotype names
    are taken from the header, with spaces replaced by underscores and the
    prefix 'jb_' added.  Empty and 'NA' values are stored as 'NA'.

    The accession ids are translated with pd._getEcotype2TgEcotypeDict_.  Ids
    missing from that dictionary are reported and used as they are (as
    strings).

    The phenotypes are written to both a tab delimited and a comma delimited
    file.

    Raises ValueError if a row does not have one value per phenotype name.
    """
    filename = "/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/bergelsson_rosette_glucs.csv"

    # Read everything up front, so that the file is closed right away.
    with open(filename, "r") as f:
        reader = csv.reader(f)
        phenotype_names = ['jb_' + name.replace(" ", "_") for name in next(reader)[2:]]
        rows = list(reader)
    print(phenotype_names)

    accession_names = [row[0].split()[0].lower() for row in rows]
    accession_ID = [row[1] for row in rows]
    print(accession_names)

    e_info_dict = pd._getEcotypeIdInfoDict_()
    ei_2_tgei = pd._getEcotype2TgEcotypeDict_()

    ecotypes = []
    for acc, acc_id in zip(accession_names, accession_ID):
        a_id = int(acc_id)
        if a_id in ei_2_tgei:
            ecotypes.append(ei_2_tgei[a_id])
            continue
        print("(%s, %s) is missing in dictionary" % (acc, acc_id))
        if a_id in e_info_dict:
            print("Guessing that it's: %s" % (e_info_dict[a_id],))
        else:
            print("No good guess for it. Look it up!!\n")
        ecotypes.append(acc_id)
    print("%s %s" % (len(set(accession_ID)), len(set(ecotypes))))

    phenotypes = []  # [acc_id][phenotype_name]
    for row in rows:
        phen_vals = ['NA' if pv == "" or pv == 'NA' else float(pv) for pv in row[2:]]
        if len(phen_vals) != len(phenotype_names):
            raise ValueError("The row of accession '%s' has %d phenotype values, but %d were expected."
                             % (row[0], len(phen_vals), len(phenotype_names)))
        phenotypes.append(phen_vals)

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.tsv", delimiter='\t')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.csv", delimiter=',')
def load_phentoype_file_wilczek():
    """
    Load the Wilczek phenotypes and write them to file.

    Column 0 of the csv file holds the unique id, column 1 the accession name
    (its first word, in lower case, is used) and the remaining columns the
    phenotype values.  The phenotype names are taken from the header, with
    spaces replaced by underscores.  Empty values are stored as 'NA'.

    Accession names are mapped to ecotype ids with
    pd._getAccessionToEcotypeIdDict_, together with a few hard coded ids.
    The keys of the hard coded ids are in lower case, like the accession
    names they are compared with.  For every accession that is kept, a line
    is written to the key file, including whether the ecotype id is found in
    the accessions of the 250k SNP data.

    Each data row is handled exactly once, so the ecotype list and the
    phenotype rows handed to pd.PhenotypeData are always of equal length and
    in the same order.  Accessions without an ecotype id (and the unique ids
    'karl27' and 'karl05') are reported and left out of both lists.

    The phenotypes are written to both a tab delimited and a comma delimited
    file.
    """
    import env
    import dataParsers

    filename = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/PhenotypeDataWilczek.csv"
    key_file = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/unique_id_to_ecotype_id.csv"

    # Read everything up front, so that the file is closed right away.
    with open(filename, "r") as f:
        reader = csv.reader(f)
        phenotype_names = [name.replace(" ", "_") for name in next(reader)[2:]]
        rows = list(reader)
    print(phenotype_names)

    accession_names = [row[1].split()[0].lower() for row in rows]
    accession_ID = [row[0] for row in rows]
    print(accession_names)

    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)
    acc_dict["cibc-5"] = 6908
    acc_dict["wa-1"] = 6978
    # NOTE(review): load_phentoype_file_Pecinka uses 6922 for "gu-0" - confirm which id is right.
    acc_dict["gu-0"] = 7149
    acc_dict['rubezhnoe-1'] = 7323
    print("%s %s" % (len(acc_dict), acc_dict))

    d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)

    ecotypes = []
    phenotypes = []  # [acc_id][phenotype_name]
    with open(key_file, "w") as key_f:
        key_f.write("unique_id, accession_name, ecotype_id, in_250k_data\n")
        for row, acc, acc_id in zip(rows, accession_names, accession_ID):
            if acc not in acc_dict or acc_id in ('karl27', 'karl05'):
                print("(%s, %s) is missing" % (acc, acc_id))
                continue
            ecotype = acc_dict[acc]
            ecotypes.append(ecotype)
            phenotypes.append(['NA' if pv == "" else float(pv) for pv in row[2:]])
            in_250k_data = str(ecotype) in d250k_sd.accessions
            key_f.write("%s,%s,%s,%s\n" % (acc_id, acc, str(ecotype), str(in_250k_data)))

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.tsv", delimiter='\t')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.csv", delimiter=',')