def getPhenotypes(host="papaya.usc.edu", user=None, passwd=None, onlyBinary=False, onlyQuantitative=False, onlyCategorical=False, onlyReplicates=False, includeSD=False, rawPhenotypes=False, onlyPublishable=False):
    """Load the ecotype/accession dictionary and connect to the "at" database.

    The visible body reports the ``onlyPublishable`` flag, asks
    ``dataParsers`` for the ecotype-to-accession dictionary (default value
    '100') and then opens a MySQL connection to database "at" on ``host``.
    If the connection fails with ``MySQLdb.Error`` the error code and message
    are printed and the process exits with status 1.

    NOTE(review): the remaining filter flags (onlyBinary, onlyQuantitative,
    onlyCategorical, onlyReplicates, includeSD, rawPhenotypes) are not used
    by the code shown here, and nothing is returned -- the function looks
    truncated; confirm against the full source.
    """
    print("%s %s" % ("onlyPublishable:", onlyPublishable))

    import dataParsers
    ecotype_to_accession = dataParsers.getEcotypeToAccessionDictionary(
        host, user=user, passwd=passwd, defaultValue='100')

    import MySQLdb
    print("Connecting to db, host=" + host)
    try:
        connection = MySQLdb.connect(host=host, user=user, passwd=passwd, db="at")
    except MySQLdb.Error as db_error:
        error_code = db_error.args[0]
        error_text = db_error.args[1]
        print("Error %d: %s" % (error_code, error_text))
        sys.exit(1)
def _run_():
    """Command-line entry point: fetch SNP data from the database and write it to a file.

    Options are read from ``sys.argv`` with ``getopt``:

        -z, --hostname    database host (default 'papaya.usc.edu')
        -u, --user        database user
        -p, --passwd      database password
        -o                output file name (required)
        -v, --version     data version handed to the loader (default "3")
        -d, --delim       field delimiter for the output (default ",")
        -m, --missingval  marker written for missing values (default "NA")
        -a, --accname     build an accession decoder and pass it to the writer
        --only96          handed to the loader as ``only96accessions``
        -b, --debug / -r, --report   accepted, currently without effect
        -h, --help        print the module docstring

    The process exits with status 2 when it is started without arguments,
    when option parsing fails, or when no output file was given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["hostname=", "user=", "passwd=", "version=", "delim=",
                         "missingval=", "accname", "debug", "report", "help", "only96"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:v:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing failures are handled here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    version = "3"
    delim = ","
    missingVal = "NA"
    useAccessionName = False
    only96 = False
    help_shown = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-v", "--version"):
            version = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True
        elif opt == "--only96":
            # Exact comparison: `opt in ("--only96")` would be a substring test.
            only96 = True
        # -b/--debug and -r/--report are accepted but nothing reads them.

    if not output_fname:
        # When help was requested the docstring has already been printed.
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    snpsds = dataParsers.get2010DataFromDb(host=hostname, chromosomes=[1, 2, 3, 4, 5],
                                           dataVersion=version, only96accessions=only96,
                                           user=user, passwd=passwd)

    accDecoder = None
    if useAccessionName:
        # Use the same host and credentials as the SNP query above instead of
        # credentials embedded in the source.
        tmpDecoder = dataParsers.getEcotypeToAccessionDictionary(hostname, user=user, passwd=passwd)
        # Second field of each entry is used as the decoded label
        # (presumably the accession name -- confirm against dataParsers).
        accDecoder = dict((acc, tmpDecoder[acc][1]) for acc in tmpDecoder)

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim, missingVal=missingVal,
                                     accDecoder=accDecoder)
def _run_():
    """Command-line entry point: filter accessions out of a SNP data file.

    Usage: ``<script> [options] -o OUTPUT INPUT``. Options are read from
    ``sys.argv`` with ``getopt``:

        -o                   output file name (required)
        -d, --delim          field delimiter (default ",")
        -m, --missingval     marker for missing values (default "NA")
        -a, --withArrayId    0: no array ids, 1: input file only,
                             2: input and comparison file, 3: comparison file only
                             (default 1)
        --comparisonFile     second data file used by --maxError,
                             --removeIdentical and --onlyCommon
        --maxError           drop accessions whose error rate against the
                             comparison file exceeds this value
        --maxMissing         drop accessions whose missing rate exceeds this value
        --removeIdentical    for accessions occurring more than once keep only
                             the copy with the lowest error rate
        --onlyCommon         drop accessions absent from the comparison file
        --first96            keep only accessions whose accession-dictionary
                             number is within 0..97
        --removeEcotypeId    comma separated ecotype ids to drop
        --removeArrayId      array id to drop
        --heterozygous2NA    forwarded to ``compareWith``
        -b, --debug / --report   accepted, currently without effect
        -h, --help           print the module docstring

    The process exits with status 2 when started without arguments, when
    option parsing fails, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
                         "removeArrayId=", "first96", "removeIdentical", "onlyCommon",
                         "delim=", "missingval=", "withArrayId=", "debug", "report", "help",
                         "heterozygous2NA"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    help_shown = False
    withArrayIds = 1
    first96 = False
    heterozygous2NA = False

    # Options are matched exactly. The former `opt in ("--maxError")` was a
    # substring test on a plain string, which sent `-m VALUE` to --maxError.
    for opt, arg in opts:
        if opt == "-o":
            output_fname = arg
        elif opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt == "--removeEcotypeId":
            removeEcotypes = [int(ecotype_id) for ecotype_id in arg.split(",")]
        elif opt == "--removeArrayId":
            removeArray = int(arg)
        elif opt == "--removeIdentical":
            removeIdentical = True
        elif opt == "--onlyCommon":
            onlyCommon = True
        elif opt == "--first96":
            first96 = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted for compatibility; nothing reads these flags
        else:
            if not help_shown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        # Previously args[0] was read unconditionally and raised IndexError.
        if not help_shown:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # waid1: the input file carries array ids; waid2: the comparison file does.
    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim,
                                      missingVal=missingVal, withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): database credentials are embedded in the source here;
        # they should come from configuration or the command line.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accession_number = int(d[ecotype][0])
            # NOTE(review): the accepted range is 0..97 inclusive -- confirm
            # that this is the intended bound for the "first 96" set.
            if accession_number > 97 or accession_number < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append((arrayID, d[ecotype][1], d[ecotype][0], ecotype))
        first96Names.sort()
        print("First 96 accessions, len: %d :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)

    # Retrieve comparison list of accessions (error rates per accession).
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i], withArrayIds=withArrayIds,
                                             verbose=False, heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # Combine the per-dataset results into one weighted error rate per
        # accession. Fields used from each result: [2] accession ids,
        # [3] error rates, [5] array ids, [6] weights (presumably the number
        # of compared SNPs -- confirm against compareWith).
        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])
        for i in range(len(accErrorRate)):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        last = res[-1]
        accErrAndID = []
        if waid1:
            for i in range(len(last[2])):
                accErrAndID.append((accErrorRate[i], last[2][i], last[5][i]))
        else:
            for i in range(len(last[2])):
                accErrAndID.append((accErrorRate[i], last[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    # Figure out which accessions are too erroneous.
    if maxError < 1.0 and comparisonFile:
        # The tuples carry an array id only when the input file has array
        # ids, so the same flag must decide how they are unpacked.
        if waid1:
            if arraysToRemove is None:
                arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and waid1:
        print("Locating identical accessions")
        # Ascending error rate: the first copy met is the best one and is kept.
        accErrAndID.sort()
        if arraysToRemove is None:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        if arraysToRemove is None:
            arraysToRemove = []
        shared_accessions = set(snpsds2[0].accessions)
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in shared_accessions:
                accessionsToRemove.append(acc)
                if waid1:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        accession_count = len(snpsds[0].accessions)
        missingCounts = [0] * accession_count
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(accession_count):
                missingCounts[i] += mc[i]

        missingRates = []
        if waid1:
            # Keep array ids collected by the earlier filters instead of
            # starting over with an empty list.
            if arraysToRemove is None:
                arraysToRemove = []
            for i in range(accession_count):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(accession_count):
                missingRates.append((missingCounts[i] / float(numSnps), snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
    print("Removing %d accessions." % len(accessionsToRemove))

    if removeArray is not None:
        # `is not None` so that array id 0 can be removed as well.
        if arraysToRemove is None:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %d  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%d accessions out of %d were removed."
          % (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim, missingVal=missingVal,
                                     withArrayIds=waid1)
def _run_():
    """Command-line entry point: filter accessions out of a SNP data file.

    Usage: ``<script> [options] -o OUTPUT INPUT``. Options are read from
    ``sys.argv`` with ``getopt``:

        -o                   output file name (required)
        -d, --delim          field delimiter (default ",")
        -m, --missingval     marker for missing values (default "NA")
        -a, --withArrayId    0: no array ids, 1: input file only,
                             2: input and comparison file, 3: comparison file only
                             (default 1)
        --comparisonFile     second data file used by --maxError,
                             --removeIdentical and --onlyCommon
        --maxError           drop accessions whose error rate against the
                             comparison file exceeds this value
        --maxMissing         drop accessions whose missing rate exceeds this value
        --removeIdentical    for accessions occurring more than once keep only
                             the copy with the lowest error rate
        --onlyCommon         drop accessions absent from the comparison file
        --first96            keep only accessions whose accession-dictionary
                             number is within 0..97
        --first192           keep only ecotypes returned by
                             ``phenotypeData._getFirst192Ecotypes_()``
                             (ignored when --first96 is given)
        --removeLer          drop the first group returned by
                             ``analyzeHaplotype.getLerAndColAccessions``
        --removeCol          drop the second group returned by the same call
        --removeEcotypeId    comma separated ecotype ids to drop
        --removeArrayId      array id to drop
        --heterozygous2NA    forwarded to ``compareWith``
        -b, --debug / --report   accepted, currently without effect
        -h, --help           print the module docstring

    The process exits with status 2 when started without arguments, when
    option parsing fails, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
        "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=",
        "missingval=", "withArrayId=", "debug", "report", "help",
        "heterozygous2NA", "first192", "removeLer", "removeCol"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    help_shown = False
    withArrayIds = 1
    first96 = False
    first192 = False
    heterozygous2NA = False
    removeLer = False
    removeCol = False

    # Options are matched exactly. The former `opt in ("--maxError")` was a
    # substring test on a plain string, which sent `-m VALUE` to --maxError.
    for opt, arg in opts:
        if opt == "-o":
            output_fname = arg
        elif opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt == "--removeEcotypeId":
            removeEcotypes = [int(ecotype_id) for ecotype_id in arg.split(",")]
        elif opt == "--removeArrayId":
            removeArray = int(arg)
        elif opt == "--removeIdentical":
            removeIdentical = True
        elif opt == "--onlyCommon":
            onlyCommon = True
        elif opt == "--first96":
            first96 = True
        elif opt == "--first192":
            first192 = True
        elif opt == "--removeLer":
            removeLer = True
        elif opt == "--removeCol":
            removeCol = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted for compatibility; nothing reads these flags
        else:
            if not help_shown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        # Previously args[0] was read unconditionally and raised IndexError.
        if not help_shown:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # waid1: the input file carries array ids; waid2: the comparison file does.
    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim,
                                      missingVal=missingVal, withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): database credentials are embedded in the source here;
        # they should come from configuration or the command line.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accession_number = int(d[ecotype][0])
            # NOTE(review): the accepted range is 0..97 inclusive -- confirm
            # that this is the intended bound for the "first 96" set.
            if accession_number > 97 or accession_number < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append((arrayID, d[ecotype][1], d[ecotype][0], ecotype))
        first96Names.sort()
        print("First 96 accessions, len: %d :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)
    elif first192:
        import phenotypeData
        # A real list (not a lazy map) because it is printed and measured.
        ecotypes_192 = [str(ecotype) for ecotype in phenotypeData._getFirst192Ecotypes_()]
        print("%s %s" % (ecotypes_192, snpsds[0].accessions))
        wanted_ecotypes = set(ecotypes_192)
        for acc in snpsds[0].accessions:
            if acc not in wanted_ecotypes:
                accessionsToRemove.append(acc)
        print('found %d "192" ecotypes... removing %d ecotypes.'
              % (len(ecotypes_192), len(accessionsToRemove)))

    if removeLer or removeCol:
        import analyzeHaplotype as ah
        # One call serves both flags: index 0 is used for --removeLer and
        # index 1 for --removeCol.
        ler_and_col = ah.getLerAndColAccessions(snpsds)
        if removeLer:
            accessionsToRemove += ler_and_col[0]
        if removeCol:
            accessionsToRemove += ler_and_col[1]

    # Retrieve comparison list of accessions (error rates per accession).
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i], withArrayIds=withArrayIds,
                                             verbose=False, heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # Combine the per-dataset results into one weighted error rate per
        # accession. Fields used from each result: [2] accession ids,
        # [3] error rates, [5] array ids, [6] weights (presumably the number
        # of compared SNPs -- confirm against compareWith).
        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])
        for i in range(len(accErrorRate)):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        last = res[-1]
        accErrAndID = []
        if waid1:
            for i in range(len(last[2])):
                accErrAndID.append((accErrorRate[i], last[2][i], last[5][i]))
        else:
            for i in range(len(last[2])):
                accErrAndID.append((accErrorRate[i], last[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    # Figure out which accessions are too erroneous.
    if maxError < 1.0 and comparisonFile:
        # The tuples carry an array id only when the input file has array
        # ids, so the same flag must decide how they are unpacked.
        if waid1:
            if arraysToRemove is None:
                arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and waid1:
        print("Locating identical accessions")
        # Ascending error rate: the first copy met is the best one and is kept.
        accErrAndID.sort()
        if arraysToRemove is None:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        if arraysToRemove is None:
            arraysToRemove = []
        shared_accessions = set(snpsds2[0].accessions)
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in shared_accessions:
                accessionsToRemove.append(acc)
                if waid1:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        accession_count = len(snpsds[0].accessions)
        missingCounts = [0] * accession_count
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(accession_count):
                missingCounts[i] += mc[i]

        missingRates = []
        if waid1:
            # Keep array ids collected by the earlier filters instead of
            # starting over with an empty list.
            if arraysToRemove is None:
                arraysToRemove = []
            for i in range(accession_count):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(accession_count):
                missingRates.append((missingCounts[i] / float(numSnps), snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
    print("Removing %d accessions." % len(accessionsToRemove))

    if removeArray is not None:
        # `is not None` so that array id 0 can be removed as well.
        if arraysToRemove is None:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %d  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%d accessions out of %d were removed."
          % (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim, missingVal=missingVal,
                                     withArrayIds=waid1)