def run(self):
    """
    2008-5-12
    Read the matrix in self.inputFname with read_data(), optionally
    translate its SNP column ids through the database, and hand the
    result to snpsdata.writeRawSnpsDatasToFile() (comma-delimited) to
    produce self.outputFname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # database connection and etc
    db = self.db_250k
    db.session.begin()

    input_delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.inputFname, delimiter=input_delimiter, matrix_data_type=int)

    if self.snp_id_type == 1:
        # 2011-2-27 translate the db_id into chr_pos because the new
        # StrainXSNP dataset uses db_id to identify SNPs.
        # but if col-id is already chr_pos, it's fine.
        translated_header = header[:2]
        kept_columns = []
        # The first two header entries are not SNP ids, so matrix column k
        # corresponds to header entry k + 2.
        for column_index, snp_id in enumerate(header[2:]):
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id, )
            if chr_pos is None:
                continue
            kept_columns.append(column_index)
            translated_header.append(chr_pos)
        # drop the columns whose id could not be translated
        data_matrix = numpy.array(data_matrix)[:, kept_columns]
        header = translated_header

    # category_list is forwarded only when self.array_id_2nd_column is set
    snp_data_kwargs = dict(header=header,
                           strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix)
    if self.array_id_2nd_column:
        snp_data_kwargs['category_list'] = category_list
    snpData = SNPData(**snp_data_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(
        snpData, need_transposeSNPData=1, report=self.report)
    chromosome_ls = []
    for rawSnpsData in rawSnpsData_ls:
        chromosome_ls.append(rawSnpsData.chromosome)
    snpsdata.writeRawSnpsDatasToFile(
        self.outputFname,
        rawSnpsData_ls,
        chromosomes=chromosome_ls,
        deliminator=',',
        withArrayIds=self.array_id_2nd_column)
예제 #2
0
	def run(self):
		"""
		2008-5-12
			Read the matrix in self.input_fname with read_data() and pass it,
			converted through SNPData2RawSnpsData_ls(), to
			snpsdata.writeRawSnpsDatasToFile() (comma-delimited) to produce
			self.output_fname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		input_delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=input_delimiter)
		
		# category_list is forwarded only when self.array_id_2nd_column is set
		snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
		if self.array_id_2nd_column:
			snp_data_kwargs['category_list'] = category_list
		snpData = SNPData(**snp_data_kwargs)
		
		rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
		chromosome_ls = []
		for rawSnpsData in rawSnpsData_ls:
			chromosome_ls.append(rawSnpsData.chromosome)
		snpsdata.writeRawSnpsDatasToFile(self.output_fname, rawSnpsData_ls, chromosomes=chromosome_ls,
			deliminator=',', withArrayIds=self.array_id_2nd_column)
예제 #3
0
def _run_():
    """
    Command-line entry point: fetch the 2010 SNP data from the database and
    write it to the output file named with -o.

    Options are read from sys.argv via getopt: -z/--hostname, -u/--user,
    -p/--passwd, -o, -v/--version, -d/--delim, -m/--missingval,
    -a/--accname, -b/--debug, -r/--report, -h/--help and --only96.

    The module docstring is printed as usage text and the process exits
    with status 2 when no arguments are given, when option parsing fails,
    or when no output file is given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["hostname=", "user=", "passwd=", "version=", "delim=", "missingval=",
                         "accname", "debug", "report", "help", "only96"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:v:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    version = "3"
    delim = ","
    missingVal = "NA"
    useAccessionName = False
    help_requested = False
    only96 = False

    # -b/--debug and -r/--report are accepted by getopt but have no effect here.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-v", "--version"):
            version = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True
        elif opt == "--only96":
            only96 = True

    if not output_fname:
        # The usage text was already shown if -h was given.
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    snpsds = dataParsers.get2010DataFromDb(host=hostname, chromosomes=[1, 2, 3, 4, 5], dataVersion=version,
                                           only96accessions=only96, user=user, passwd=passwd)

    accDecoder = None
    if useAccessionName:
        # NOTE(review): credentials are hard-coded here instead of using -u/-p;
        # they should be moved out of the source.
        tmpDecoder = dataParsers.getEcotypeToAccessionDictionary(user="******", passwd="bamboo123")
        # Keep only element 1 of each entry as the decoded name.
        accDecoder = dict((acc, tmpDecoder[acc][1]) for acc in tmpDecoder)
    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5], deliminator=delim,
                                     missingVal=missingVal, accDecoder=accDecoder)
예제 #4
0
def _run_():
    """
    Command-line entry point: fetch the 2010 SNP data from the database and
    write it to the output file named with -o.

    Options are read from sys.argv via getopt: -z/--hostname, -u/--user,
    -p/--passwd, -o, -v/--version, -d/--delim, -m/--missingval,
    -a/--accname, -b/--debug, -r/--report, -h/--help and --only96.

    The module docstring is printed as usage text and the process exits
    with status 2 when no arguments are given, when option parsing fails,
    or when no output file is given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["hostname=", "user=", "passwd=", "version=", "delim=", "missingval=",
                         "accname", "debug", "report", "help", "only96"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:v:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    # Defaults, overridden by the command-line options below.
    settings = {
        "hostname": 'papaya.usc.edu',
        "user": None,
        "passwd": None,
        "version": "3",
        "delim": ",",
        "missingVal": "NA",
    }
    # Options that simply store their argument, keyed by every accepted spelling.
    value_options = {
        "-z": "hostname", "--hostname": "hostname",
        "-u": "user", "--user": "user",
        "-p": "passwd", "--passwd": "passwd",
        "-v": "version", "--version": "version",
        "-d": "delim", "--delim": "delim",
        "-m": "missingVal", "--missingval": "missingVal",
    }
    output_fname = None
    useAccessionName = False
    help_requested = False
    only96 = False

    # -b/--debug and -r/--report are accepted by getopt but have no effect here.
    for opt, arg in opts:
        if opt in value_options:
            settings[value_options[opt]] = arg
        elif opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True
        elif opt == "--only96":
            only96 = True

    if not output_fname:
        # The usage text was already shown if -h was given.
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    snpsds = dataParsers.get2010DataFromDb(host=settings["hostname"], chromosomes=[1, 2, 3, 4, 5],
                                           dataVersion=settings["version"], only96accessions=only96,
                                           user=settings["user"], passwd=settings["passwd"])

    accDecoder = None
    if useAccessionName:
        # NOTE(review): credentials are hard-coded here instead of using -u/-p;
        # they should be moved out of the source.
        tmpDecoder = dataParsers.getEcotypeToAccessionDictionary(user="******", passwd="bamboo123")
        # Keep only element 1 of each entry as the decoded name.
        accDecoder = dict((acc, tmpDecoder[acc][1]) for acc in tmpDecoder)
    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=settings["delim"], missingVal=settings["missingVal"],
                                     accDecoder=accDecoder)
예제 #5
0
def _run_():
    """
    Command-line entry point: fetch the 250K SNP data from the database and
    write it to the output file named with -o.

    Options are read from sys.argv via getopt: -z/--hostname, -u/--user,
    -p/--passwd, -o, -t/--method, -d/--delim, -m/--missingval,
    -a/--withArrayId, --callProbFile, --newBatch and -h/--help.

    -a/--withArrayId takes a value; "", "0", "false" and "no" (any case)
    turn it off, anything else turns it on.

    The module docstring is printed as usage text and the process exits
    with status 2 when no arguments are given, when option parsing fails,
    or when no output file is given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["newBatch", "hostname=", "user=", "passwd=", "method=", "delim=", "missingval=",
                         "withArrayId=", "callProbFile=", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:t:d:m:a:h", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    method = 1
    delim = ","
    missingVal = "NA"
    help_requested = False
    withArrayId = False
    callProbFile = None
    newBatch = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-t", "--method"):
            method = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            # bool(arg) would be True for any non-empty string, including "0".
            withArrayId = arg.strip().lower() not in ("", "0", "false", "no")
        elif opt == "--callProbFile":
            callProbFile = arg
        elif opt == "--newBatch":
            newBatch = True

    if not output_fname:
        # The usage text was already shown if -h was given.
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    import snpsdata
    fetch_kwargs = dict(host=hostname, chromosomes=[1, 2, 3, 4, 5], methodId=method, user=user,
                        passwd=passwd, withArrayIds=withArrayId, newBatch=newBatch)
    write_kwargs = dict(chromosomes=[1, 2, 3, 4, 5], deliminator=delim, missingVal=missingVal,
                        withArrayIds=withArrayId)
    if callProbFile:
        # Call probabilities are fetched and written only when a file for them was requested.
        fetch_kwargs["callProb"] = True
        write_kwargs["callProbFile"] = callProbFile
    snpsds = dataParsers.get250KDataFromDb(**fetch_kwargs)
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, **write_kwargs)
예제 #6
0
def _run_():
    """
    Command-line entry point: fetch the Perlegen SNP data from the database
    (dataParsers.getPerlgenDataFromDb) and write it to the -o output file.

    Options are read from sys.argv via getopt: -z/--hostname, -u/--user,
    -p/--passwd, -o, -d/--delim, -m/--missingval, -a/--accname,
    -b/--debug, -r/--report and -h/--help.

    The module docstring is printed as usage text and the process exits
    with status 2 when no arguments are given, when option parsing fails,
    or when no output file is given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["hostname=", "user=", "passwd=", "delim=", "missingval=",
                         "accname", "debug", "report", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    delim = ", "
    missingVal = "NA"
    useAccessionName = False
    help_requested = False

    # -b/--debug and -r/--report are accepted by getopt but have no effect here.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True

    if not output_fname:
        # The usage text was already shown if -h was given.
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    snpsds = dataParsers.getPerlgenDataFromDb(host=hostname, chromosomes=[1, 2, 3, 4, 5], user=user, passwd=passwd)

    # With -a the ids are decoded through dataParsers.ecotypeId2Name on output.
    accDecoder = dataParsers.ecotypeId2Name if useAccessionName else None
    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5], deliminator=delim,
                                     missingVal=missingVal, accDecoder=accDecoder)
예제 #7
0
def _run_():
    """
    Command-line entry point: merge two SNP CSV files chromosome by
    chromosome and write the result to the -o output file.

    Exactly two positional arguments (the two input files) are required.
    Options are read from sys.argv via getopt: -o, -p/--priority,
    -u/--union, -i/--intersection, -d/--delim, -m/--missingval,
    -a/--withArrayId, -b/--debug, -r/--report and -h/--help.

    --withArrayId 1 means only the first file is parsed with array ids,
    2 means both files are. With neither --union nor --intersection a
    plain mergeData() is done; a union or intersection value of 1-3
    selects mergeDataUnion() / mergeDataIntersection() with that type.

    Raises:
        Exception: if the number of positional arguments is not 2.

    The process exits with status 2 when no arguments are given, when
    option parsing fails, when no output file is given, or when the
    union/intersection combination is invalid.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["priority=", "delim=", "missingval=", "union=", "intersection=",
                         "debug", "report", "help", "withArrayId="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:p:d:m:u:i:a:brh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    if len(args) != 2:
        raise Exception("Number of arguments isn't correct.")
    inputFile1, inputFile2 = args
    priority = 1
    union = 0
    intersection = 0
    output_fname = None
    delim = ","
    missingVal = "NA"
    withArrayIds = 0
    # Must be initialised: it is read below even when -h was not given.
    help_requested = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt in ("-p", "--priority"):
            priority = int(arg)
        elif opt in ("-u", "--union"):
            union = int(arg)
        elif opt in ("-i", "--intersection"):
            intersection = int(arg)
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted for compatibility; not used
        else:
            if not help_requested:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    import dataParsers
    (snpsds1, chromosomes1) = dataParsers.parseCSVData(inputFile1, format=1, deliminator=delim, missingVal=missingVal,
                                                       withArrayIds=waid1, returnChromosomes=True)
    (snpsds2, chromosomes2) = dataParsers.parseCSVData(inputFile2, format=1, deliminator=delim, missingVal=missingVal,
                                                       withArrayIds=waid2, returnChromosomes=True)

    if len(snpsds1) != len(snpsds2):
        print("Warning: Unequal number of chromosomes.")

    import snpsdata
    if union == 0 and intersection == 0:
        # Plain merge: pair up data sets that carry the same chromosome label.
        for i, chr1 in enumerate(chromosomes1):
            for j, chr2 in enumerate(chromosomes2):
                if chr1 == chr2:
                    snpsds1[i].mergeData(snpsds2[j], priority=priority)
        snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds1, chromosomes=chromosomes1, deliminator=delim,
                                         missingVal=missingVal, withArrayIds=waid1)
    elif 0 < union < 4 and intersection == 0:
        for i, chr1 in enumerate(chromosomes1):
            for j, chr2 in enumerate(chromosomes2):
                if chr1 == chr2:
                    snpsds1[i].mergeDataUnion(snpsds2[j], priority=priority, unionType=union)
        if union == 2:
            chromosomes = chromosomes1
        else:
            # union types 1 and 3 report every chromosome found in either file
            chromosomes = sorted(set(chromosomes1).union(chromosomes2))
        snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds1, chromosomes=chromosomes, deliminator=delim,
                                         missingVal=missingVal)
    elif 0 < intersection < 4 and union == 0:
        # NOTE(review): data sets are paired by position here, not by chromosome
        # label as in the branches above; confirm that both files list the
        # chromosomes in the same order.
        for i in range(len(snpsds1)):
            snpsds1[i].mergeDataIntersection(snpsds2[i], priority=priority, intersectionType=intersection)
        if intersection == 2:
            chromosomes = chromosomes1
        else:
            # intersection types 1 and 3 report only chromosomes found in both files
            chromosomes = sorted(set(chromosomes1).intersection(chromosomes2))
        snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds1, chromosomes=chromosomes, deliminator=delim,
                                         missingVal=missingVal)
    else:
        if not help_requested:
            print("The union or intersection options used are wrong!!\n")
            print(__doc__)
            sys.exit(2)
예제 #8
0
def _run_():
    """
    Command-line entry point: fetch the Perlegen SNP data from the database
    (dataParsers.getPerlgenDataFromDb) and write it to the -o output file.

    Options are read from sys.argv via getopt: -z/--hostname, -u/--user,
    -p/--passwd, -o, -d/--delim, -m/--missingval, -a/--accname,
    -b/--debug, -r/--report and -h/--help.

    The module docstring is printed as usage text and the process exits
    with status 2 when no arguments are given, when option parsing fails,
    or when no output file is given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "hostname=", "user=", "passwd=", "delim=", "missingval=", "accname",
        "debug", "report", "help"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:d:m:abrh",
                                   long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    # Defaults, overridden by the command-line options below.
    settings = {
        "hostname": 'papaya.usc.edu',
        "user": None,
        "passwd": None,
        "delim": ", ",
        "missingVal": "NA",
    }
    # Options that simply store their argument, keyed by every accepted spelling.
    value_options = {
        "-z": "hostname", "--hostname": "hostname",
        "-u": "user", "--user": "user",
        "-p": "passwd", "--passwd": "passwd",
        "-d": "delim", "--delim": "delim",
        "-m": "missingVal", "--missingval": "missingVal",
    }
    output_fname = None
    useAccessionName = False
    help_requested = False

    # -b/--debug and -r/--report are accepted by getopt but have no effect here.
    for opt, arg in opts:
        if opt in value_options:
            settings[value_options[opt]] = arg
        elif opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True

    if not output_fname:
        # The usage text was already shown if -h was given.
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    snpsds = dataParsers.getPerlgenDataFromDb(host=settings["hostname"],
                                              chromosomes=[1, 2, 3, 4, 5],
                                              user=settings["user"],
                                              passwd=settings["passwd"])

    # With -a the ids are decoded through dataParsers.ecotypeId2Name on output.
    accDecoder = None
    if useAccessionName:
        accDecoder = dataParsers.ecotypeId2Name
    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=settings["delim"],
                                     missingVal=settings["missingVal"],
                                     accDecoder=accDecoder)
예제 #9
0
def _convertLowProbCallsToNA(inputFile, callProbFile, output_fname, delim, minCallProb, withArrayIds):
    """
    Copy inputFile to output_fname, replacing every field whose matching
    value in callProbFile is not greater than minCallProb with 'NA'.

    Both files are read in lockstep one line at a time to avoid memory
    problems. One header line is copied verbatim (two when withArrayIds
    is 2) and the same number of lines is skipped in callProbFile.
    """
    print("Converting base calls with call prob. lower than %s to NAs" % minCallProb)
    header_lines = 2 if withArrayIds == 2 else 1
    totalCount = 0
    convertedCount = 0
    line_no = 0
    # Nested with-blocks guarantee all three files are closed even on error.
    with open(inputFile, "r") as gInFile:
        with open(callProbFile, "r") as pInFile:
            with open(output_fname, "w") as outFile:
                for _ in range(header_lines):
                    outFile.write(gInFile.readline())
                    pInFile.readline()
                while True:
                    line_no += 1
                    gline = gInFile.readline()
                    pline = pInFile.readline()
                    if not (gline and pline):
                        # Either file ran out: report where and what was left over.
                        print("%s %s %s" % (line_no, gline, pline))
                        break
                    snp = gline.strip().split(delim)
                    probs = [float(prob) for prob in pline.strip().split(delim)]
                    totalCount += len(snp)
                    newSNP = []
                    for (nt, prob) in zip(snp, probs):
                        if prob > minCallProb:
                            newSNP.append(nt)
                        else:
                            # NOTE(review): the marker is fixed to 'NA' and does not
                            # follow the --missingval option; confirm this is intended.
                            newSNP.append('NA')
                            # Count only the calls actually turned into NA.
                            convertedCount += 1
                    outFile.write(delim.join(newSNP) + "\n")
                    if line_no % 10000 == 0:
                        print(line_no)
    print(line_no)
    # Guard against input without any data lines.
    if totalCount:
        print("Fraction converted = %s" % (convertedCount / float(totalCount)))


def _run_():
    """
    Command-line entry point: filter the SNPs of a CSV file and write the
    result to the -o output file.

    One positional argument (the input file) is required. Options are read
    from sys.argv via getopt: -o, -d/--delim, -m/--missingval,
    -a/--withArrayId, --comparisonFile, --maxError, --maxMissing,
    --minCallProb, --minMAF, --callProbFile, --monomorphic, --onlyBinary,
    --output01Format, --filterRegion (chromosome,start,end), -b/--debug,
    -r/--report and -h/--help.

    When both --callProbFile and --minCallProb are given, the input file is
    streamed line by line and low-probability calls are replaced by 'NA';
    the SNP-level filters are not applied in that mode. Otherwise the file
    is parsed with dataParsers.parseCSVData, the requested filters are
    applied in turn and the result is written with
    snpsdata.writeRawSnpsDatasToFile.

    The process exits with status 2 when no arguments are given, when
    option parsing fails, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "monomorphic", "onlyBinary", "delim=",
                         "missingval=", "withArrayId=", "callProbFile=", "minMAF=", "minCallProb=", "debug",
                         "report", "help", "output01Format", "filterRegion="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    monomorphic = False
    help_requested = False
    withArrayIds = 0
    minCallProb = None
    minMAF = None
    callProbFile = None
    onlyBinary = False
    output01Format = False
    filterRegion = False
    startPos = None
    endPos = None
    chromosome = None
    chromosomes = [1, 2, 3, 4, 5]

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--minCallProb":
            minCallProb = float(arg)
        elif opt == "--minMAF":
            minMAF = float(arg)
        elif opt == "--callProbFile":
            callProbFile = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted for compatibility; not used
        elif opt == "--monomorphic":
            monomorphic = True
        elif opt == "--onlyBinary":
            onlyBinary = True
        elif opt == "--output01Format":
            output01Format = True
        elif opt == "--filterRegion":
            filterRegion = True
            # expected form: chromosome,start,end
            region = [int(field) for field in arg.split(",")]
            chromosome = region[0]
            startPos = region[1]
            endPos = region[2]
        else:
            if not help_requested:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        # Without this check args[0] would fail with an IndexError.
        if not help_requested:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    if callProbFile and minCallProb:
        # Converting lousy calls to NAs. The data is never parsed into SNP
        # objects in this mode, so the SNP-level filters cannot be applied.
        snp_filters_requested = (monomorphic or onlyBinary or output01Format or 0.0 <= maxMissing < 1.0
                                 or (comparisonFile and maxError < 1.0) or minMAF or filterRegion)
        if snp_filters_requested:
            sys.stderr.write("Warning: SNP filters are ignored when --callProbFile and --minCallProb are used.\n")
        _convertLowProbCallsToNA(inputFile, callProbFile, output_fname, delim, minCallProb, withArrayIds)
        return

    import dataParsers
    import snpsdata
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal,
                                      withArrayIds=waid1)

    # Filtering monomorphic
    if monomorphic:
        print("Filtering monomorphic SNPs")
        for snpsd in snpsds:
            print("Removed %s Snps" % snpsd.filterMonoMorphicSnps())

    if onlyBinary or output01Format:
        print("Filtering non-binary SNPs")
        for snpsd in snpsds:
            print("Removed %s Snps" % snpsd.onlyBinarySnps())

    # Filtering missing values
    if 0.0 <= maxMissing < 1.0:
        print("Filtering SNPs with missing values")
        numAccessions = len(snpsds[0].accessions)
        for snpsd in snpsds:
            print("Removed %s Snps" % snpsd.filterMissingSnps(int(maxMissing * numAccessions)))

    # Filtering bad SNPs
    if comparisonFile and maxError < 1.0:
        print("Filtering erroneous SNPs, with maxError= %s" % maxError)
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal,
                                           withArrayIds=waid2)
        # Data sets are paired by position in the two parsed lists.
        for i in range(len(snpsds)):
            snpsds[i].filterBadSnps(snpsds2[i], maxError)

    if minMAF:
        print("Removing SNPs withe MAF < %s" % minMAF)
        for snpsd in snpsds:
            print("Removed %s Snps" % snpsd.filterMinMAF(minMAF))

    # Output specific region..
    if filterRegion:
        chromosomes = [chromosome]
        # assumes the parsed list is ordered by chromosome number starting at 1
        snpsd = snpsds[chromosome - 1]
        snpsd.filterRegion(startPos, endPos)
        snpsds = [snpsd]

    if output01Format:
        snpsds = [snpsd.getSnpsData(missingVal=missingVal) for snpsd in snpsds]
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=chromosomes, deliminator=delim,
                                     missingVal=missingVal, withArrayIds=waid1)
예제 #10
0
def _run_():
    """
    Command-line entry point: remove accessions (and, where available,
    their arrays) from a SNP CSV file and write the result to the -o file.

    One positional argument (the input file) is required. Options are read
    from sys.argv via getopt: -o, -d/--delim, -m/--missingval,
    -a/--withArrayId, --comparisonFile, --maxError, --maxMissing,
    --removeEcotypeId (comma-separated ints), --removeArrayId, --first96,
    --removeIdentical, --onlyCommon, --heterozygous2NA, -b/--debug,
    -r/--report and -h/--help.

    --withArrayId selects which files are parsed with array ids:
    1 = input file only (default), 2 = input and comparison file,
    3 = comparison file only, anything else = neither.

    The accessions selected by all requested criteria are accumulated and
    removed from every data set in one pass before writing.

    The process exits with status 2 when no arguments are given, when
    option parsing fails, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=", "removeArrayId=",
                         "first96", "removeIdentical", "onlyCommon", "delim=", "missingval=", "withArrayId=",
                         "debug", "report", "help", "heterozygous2NA"]
    try:
        # "r" is part of the short options so that the -r branch below is reachable.
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else propagates.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    help_requested = False
    withArrayIds = 1
    first96 = False
    heterozygous2NA = False

    for opt, arg in opts:
        if opt == "-o":
            output_fname = arg
        elif opt in ("-h", "--help"):
            help_requested = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt == "--removeEcotypeId":
            removeEcotypes = [int(ecotype_id) for ecotype_id in arg.split(",")]
        elif opt == "--removeArrayId":
            removeArray = int(arg)
        elif opt == "--removeIdentical":
            removeIdentical = True
        elif opt == "--onlyCommon":
            onlyCommon = True
        elif opt == "--first96":
            first96 = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted for compatibility; not used
        else:
            if not help_requested:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        # Without this check args[0] would fail with an IndexError.
        if not help_requested:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if not help_requested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # waid1: the input file carries array ids; waid2: the comparison file does.
    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal,
                                      withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): credentials are hard-coded here; they should be moved
        # out of the source.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        # NOTE(review): this branch reads snpsds[0].arrayIds unconditionally, so
        # it presumably requires the input file to carry array ids - confirm.
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accession_number = int(d[ecotype][0])
            # NOTE(review): the upper bound is 97, not 96 - confirm the numbering.
            if accession_number > 97 or accession_number < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append((arrayID, d[ecotype][1], d[ecotype][0], ecotype))

        first96Names.sort()
        print("First 96 accessions, len: %s :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)

    # Retrieve comparison list of accessions.  (Error rates for accessions)
    accErrAndID = []
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal,
                                           withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i], withArrayIds=withArrayIds, verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # Combine the per-data-set error rates (r[3]) into one rate per
        # accession, weighted by the per-accession counts (r[6]).
        numCompared = len(res[0][2])
        totalAccessionCounts = [0] * numCompared
        accErrorRate = [0] * numCompared
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])
        for j in range(len(accErrorRate)):
            accErrorRate[j] = accErrorRate[j] / float(totalAccessionCounts[j])

        # Accession ids (r[2]) and array ids (r[5]) are taken from the last result.
        lastResult = res[-1]
        if waid1:
            for j in range(len(lastResult[2])):
                accErrAndID.append((accErrorRate[j], lastResult[2][j], lastResult[5][j]))
        else:
            for j in range(len(lastResult[2])):
                accErrAndID.append((accErrorRate[j], lastResult[2][j]))
        accErrAndID.sort()
        accErrAndID.reverse()

    # Figure out which accessions are too erroneous.
    # The tuples in accErrAndID carry an array id only when waid1 is true, so
    # the same condition must be used wherever they are unpacked.
    if maxError < 1.0 and comparisonFile:
        if waid1:
            arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and waid1:
        print("Locating identical accessions")
        accErrAndID.sort()
        if not arraysToRemove:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                # Sorted ascending by error: keep the first (lowest-error)
                # array of a duplicated accession and remove the others.
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal,
                                           withArrayIds=waid2)
        if not arraysToRemove:
            arraysToRemove = []
        # Build the lookup once instead of scanning the list per accession.
        comparisonAccessions = set(snpsds2[0].accessions)
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in comparisonAccessions:
                accessionsToRemove.append(acc)
                if waid1:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        accessions = snpsds[0].accessions
        missingCounts = [0] * len(accessions)
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(len(accessions)):
                missingCounts[i] += mc[i]

        missingRates = []
        if waid1:
            # Keep arrays already selected by the criteria above instead of
            # starting over with an empty list.
            if not arraysToRemove:
                arraysToRemove = []
            for i in range(len(accessions)):
                missingRates.append((missingCounts[i] / float(numSnps), accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(len(accessions)):
                missingRates.append((missingCounts[i] / float(numSnps), accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
        print("Removing %s accessions." % len(accessionsToRemove))
    if removeArray:
        if not arraysToRemove:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %s  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%s accessions out of %s were removed." % (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5], deliminator=delim,
                                     missingVal=missingVal, withArrayIds=waid1)
예제 #11
0
def _run_():
    """Command-line entry point: drop unwanted accessions from a SNP CSV file.

    Reads options from ``sys.argv``, loads the input file with
    ``dataParsers.parseCSVData``, builds a list of accessions (and, when the
    input carries array ids, arrays) to remove, removes them from every loaded
    data set and writes the result with ``snpsdata.writeRawSnpsDatasToFile``.

    Options handled (see the option loop below):
        -o FILE                 output file (required)
        -d/--delim, -m/--missingval
        -a/--withArrayId N      1 or 2: the input file has array ids;
                                2 or 3: the comparison file has array ids
        --comparisonFile FILE, --maxError X, --removeIdentical, --onlyCommon,
        --heterozygous2NA, --maxMissing X, --removeEcotypeId a,b,c,
        --removeArrayId N, --first96, --first192, --removeLer, --removeCol
        -b/--debug, -r/--report (accepted, currently unused here)
        -h/--help

    Exits with status 2 on a usage error (no arguments, bad option, missing
    input or output file).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
        "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=",
        "missingval=", "withArrayId=", "debug", "report", "help",
        "heterozygous2NA", "first192", "removeLer", "removeCol"
    ]
    try:
        # "r" is part of the short options because the loop below handles
        # "-r"; without it getopt rejected the flag.
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bhr",
                                   long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    show_help = 0
    withArrayIds = 1
    first96 = False
    first192 = False
    heterozygous2NA = False
    removeLer = False
    removeCol = False

    # Options are compared with "==" / real tuples.  The previous
    # `opt in ("--maxError")` form was a substring test on a string, so the
    # short option "-m" matched "--maxError" and was parsed as a float.
    for opt, arg in opts:
        if opt == "-o":
            output_fname = arg
        elif opt in ("-h", "--help"):
            show_help = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt == "--removeEcotypeId":
            removeEcotypes = [int(ecotypeId) for ecotypeId in arg.split(",")]
        elif opt == "--removeArrayId":
            removeArray = int(arg)
        elif opt == "--removeIdentical":
            removeIdentical = True
        elif opt == "--onlyCommon":
            onlyCommon = True
        elif opt == "--first96":
            first96 = True
        elif opt == "--first192":
            first192 = True
        elif opt == "--removeLer":
            removeLer = True
        elif opt == "--removeCol":
            removeCol = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            # Accepted for command-line compatibility; not used below.
            pass
        else:
            if show_help == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        # Previously an unguarded args[0] raised IndexError here.
        if show_help == 0:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if show_help == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # waid1: the input file carries array ids; waid2: the comparison file does.
    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): database credentials are hard-coded in these calls;
        # consider moving them to configuration.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',
                                                        user="******",
                                                        passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',
                                                       user="******",
                                                       passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            # presumably d[ecotype] is (accession number, name) -- TODO confirm
            accessionNumber = int(d[ecotype][0])
            if accessionNumber > 97 or accessionNumber < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append(
                    (arrayID, d[ecotype][1], d[ecotype][0], ecotype))

        first96Names.sort()
        print("First 96 accessions, len: %d :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)
    elif first192:
        import phenotypeData
        ecotypes_192 = [
            str(ecotype) for ecotype in phenotypeData._getFirst192Ecotypes_()
        ]
        print("%s %s" % (ecotypes_192, snpsds[0].accessions))
        keep = set(ecotypes_192)
        for acc in snpsds[0].accessions:
            if acc not in keep:
                accessionsToRemove.append(acc)
        print('found %d "192" ecotypes... removing %d ecotypes.' %
              (len(ecotypes_192), len(accessionsToRemove)))

    if removeLer:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[0]
    if removeCol:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[1]

    # Retrieve comparison list of accessions.  (Error rates for accessions)
    accErrAndID = []
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i],
                                             withArrayIds=withArrayIds,
                                             verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # presumably r[2] = accessions, r[3] = per-accession error rates,
        # r[5] = array ids, r[6] = per-accession comparison counts
        # -- TODO confirm against compareWith.
        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                # Weight each data set's rate by its comparison count.
                accErrorRate[j] += r[3][j] * float(r[6][j])

        for i in range(len(accErrorRate)):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        # Identifiers are read from the last comparison result (explicitly,
        # instead of relying on the loop variable surviving the loop).
        lastResult = res[-1]
        if waid1:
            for i in range(len(lastResult[2])):
                accErrAndID.append(
                    (accErrorRate[i], lastResult[2][i], lastResult[5][i]))
        else:
            for i in range(len(lastResult[2])):
                accErrAndID.append((accErrorRate[i], lastResult[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    # Figure out which accessions are too erroneous.
    if maxError < 1.0 and comparisonFile:
        # Branch on waid1 (0 < withArrayIds < 3), the same condition used to
        # build accErrAndID; testing plain `withArrayIds` tried to unpack
        # 2-tuples into three names when withArrayIds was 3.
        if waid1:
            if arraysToRemove is None:
                arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and waid1:
        print("Locating identical accessions")
        accErrAndID.sort()
        if not arraysToRemove:
            arraysToRemove = []
        occurrences = {}
        for accession in snpsds[0].accessions:
            occurrences[accession] = occurrences.get(accession, 0) + 1
        # Walk from lowest to highest error rate: the first array seen for a
        # duplicated accession is kept, every later one is removed.
        seen = set()
        for (error, ecotype, array) in accErrAndID:
            if occurrences.get(ecotype, 0) > 1:
                if ecotype in seen:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
                else:
                    seen.add(ecotype)

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        if not arraysToRemove:
            arraysToRemove = []
        sharedAccessions = set(snpsds2[0].accessions)
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in sharedAccessions:
                accessionsToRemove.append(acc)
                if waid1:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        numAccessionsLoaded = len(snpsds[0].accessions)
        missingCounts = [0] * numAccessionsLoaded
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(numAccessionsLoaded):
                missingCounts[i] += mc[i]

        missingRates = []
        if waid1:
            # Keep array ids already collected by the earlier filters; this
            # step used to reset the list, dropping them while the matching
            # accessions stayed in accessionsToRemove.
            if arraysToRemove is None:
                arraysToRemove = []
            for i in range(numAccessionsLoaded):
                missingRates.append(
                    (missingCounts[i] / float(numSnps),
                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(numAccessionsLoaded):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
        print("Removing %d accessions." % len(accessionsToRemove))
    if removeArray:
        if not arraysToRemove:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %d  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%d accessions out of %d were removed." %
          (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim,
                                     missingVal=missingVal,
                                     withArrayIds=waid1)
예제 #12
0
def _run_():
    """Command-line entry point: dump 250K SNP data from the database to a file.

    Parses ``sys.argv``, fetches the data with
    ``dataParsers.get250KDataFromDb`` and writes it with
    ``snpsdata.writeRawSnpsDatasToFile``.

    Options handled (see the option loop below):
        -o FILE                  output file (required)
        -z/--hostname, -u/--user, -p/--passwd
                                 database connection; defaults come from
                                 ``env.env`` when not given
        -t/--method N            method id passed as ``methodId`` (default 1)
        -d/--delim, -m/--missingval
        -a/--withArrayId VALUE   "", "0", "false" or "no" disable array ids,
                                 anything else enables them
        --callProbFile FILE      also request call probabilities and pass the
                                 file name to the writer
        --newBatch
        -h/--help

    Exits with status 2 on a usage error (no arguments, bad option, missing
    output file).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    # "user=" and "passwd=" are separate entries; the previous literal had
    # been mangled into a single invalid expression.
    long_options_list = [
        "newBatch", "hostname=", "user=", "passwd=", "method=", "delim=",
        "missingval=", "withArrayId=", "callProbFile=", "help"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:t:d:m:a:h",
                                   long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    # Connection settings stay None until we know they were not supplied on
    # the command line; the env defaults are filled in after validation.
    hostname = None
    user = None
    passwd = None
    output_fname = None
    method = 1
    delim = ","
    missingVal = "NA"
    show_help = 0
    withArrayId = False
    callProbFile = None
    newBatch = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            show_help = 1
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-t", "--method"):
            method = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            # bool(arg) was True for every non-empty string, so "-a 0" could
            # not switch array ids off.
            withArrayId = arg.strip().lower() not in ("", "0", "false", "no")
        elif opt == "--callProbFile":
            callProbFile = arg
        elif opt == "--newBatch":
            newBatch = True

    if not output_fname:
        if show_help == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    if hostname is None:
        hostname = env.env['default_lookup_db']  #'papaya.usc.edu'
    if user is None:
        user = env.env['db_user']
    if passwd is None:
        # NOTE(review): the password default reads the 'db_user' key, same as
        # the user name -- looks like a copy/paste slip; confirm the intended
        # key before changing it.
        passwd = env.env['db_user']

    import dataParsers
    import snpsdata

    fetch_kwargs = dict(host=hostname,
                        chromosomes=[1, 2, 3, 4, 5],
                        methodId=method,
                        user=user,
                        passwd=passwd,
                        withArrayIds=withArrayId,
                        newBatch=newBatch)
    write_kwargs = dict(chromosomes=[1, 2, 3, 4, 5],
                        deliminator=delim,
                        missingVal=missingVal,
                        withArrayIds=withArrayId)
    if callProbFile:
        # Call probabilities are only requested when an output file for them
        # was given.
        fetch_kwargs['callProb'] = True
        write_kwargs['callProbFile'] = callProbFile

    snpsds = dataParsers.get250KDataFromDb(**fetch_kwargs)
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, **write_kwargs)