예제 #1
0
def plotOverlayingVectors(x, vectorList, main="", xlab="", ylab="",type=None, pch='20', xname="x", ynames=None):
    """
    Return an R script that draws all vectors of vectorList on top of each other.

    Every vector is assigned to an R variable (named after the matching
    entry of ynames, or "y" for all of them when ynames is empty/None) and
    drawn against x with its own plot() call.  From the second vector on,
    "par(new=T);" is emitted first.  All plot() calls share one xlim/ylim,
    taken from the extremes of x and of all vectors, and vector number i
    (counting from 0) is drawn with col=i+2.  The type and pch arguments
    are only written when they are truthy.
    """
    if not ynames:
        ynames = ["y"]*len(vectorList)

    # Common y-range over all vectors, then the x-range.
    highest = max([max(series) for series in vectorList])
    lowest = min([min(series) for series in vectorList])
    right = max(x)
    left = min(x)

    pieces = [xname+" <- c("+",".join(util.valListToStrList(x))+");\n"]

    # Fragments of the plot() call that are identical for every vector.
    labels = ',main="'+main+'",xlab="'+xlab+'",ylab="'+ylab+'"'
    limits = ', xlim=c('+str(left)+','+str(right)+'), ylim=c('+str(lowest)+','+str(highest)+')'

    for index, series in enumerate(vectorList):
        values = util.valListToStrList(series)
        name = ynames[index]
        pieces.append(name+" <- c("+",".join(values)+");\n")
        if index > 0:
            pieces.append("par(new=T);\n")
        call = 'plot('+xname+','+name+labels+limits+', col='+str(index+2)
        if type:
            call += ', type="'+type+'"'
        if pch:
            call += ', pch='+pch
        pieces.append(call+')\n')

    return "".join(pieces)
예제 #2
0
def plotVectors(x, vectorList, main="", xlab="", ylab="",type=None, xname="x", ynames=None):
    """
    Return an R script that draws each vector of vectorList in its own panel.

    The script starts with par(mfrow=c(N,1)) where N is the number of
    vectors, assigns x to the R variable xname, and then, for every vector,
    assigns it to an R variable (the matching entry of ynames, or "y" for
    all of them when ynames is empty/None) and emits a plot() call with
    pch=20.  The type argument is only written when it is truthy.
    """
    if not ynames:
        ynames = ["y"]*len(vectorList)

    xValues = util.valListToStrList(x)
    pieces = [
        "par(mfrow=c("+str(len(vectorList))+",1));\n",
        xname+" <- c("+",".join(xValues)+");\n",
    ]

    for index, series in enumerate(vectorList):
        values = util.valListToStrList(series)
        name = ynames[index]
        pieces.append(name+" <- c("+",".join(values)+");\n")
        call = 'plot('+xname+','+name+',pch=20, main="'+main+'",xlab="'+xlab+'",ylab="'+ylab+'"'
        if type:
            call += ', type="'+type+'"'
        pieces.append(call+')\n')

    return "".join(pieces)
예제 #3
0
def _run_():
    """
    Command line entry point of the random forest analysis.

    Options and positional arguments (SNPs data file, phenotype data file,
    phenotype index) are read from sys.argv.  The output prefix is given
    with -o or --impFile.

    With --parallel a PBS job script is written and submitted with qsub for
    the given phenotype, or for every phenotype id of the phenotype file
    when --parallelAll is given as well (the phenotype index may then be
    omitted); the function returns afterwards.

    Otherwise the analysis runs locally:

      1. accessions without a phenotype value are removed and the phenotype
         data is ordered like the genotype data,
      2. monomorphic SNPs are filtered out, and with --minMAF also SNPs
         below that minor allele frequency,
      3. the SNPs are written chunkSize at a time to a temporary file and an
         R script produced by _generateRScript_ is run on every chunk,
      4. with --secondRound the round2Size highest scoring rows of the
         importance file are run through R once more.

    Usage errors end the program through sys.exit(2).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=",
        "withArrayId=", "logTransform", "phenotypeFileType=", "help",
        "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=",
        "secondRound", "minMAF=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    phenotypeFileType = 1  # parsed below, but not used by this function
    impFile = None
    delim = ","
    missingVal = "NA"
    helpShown = 0
    withArrayIds = 1
    parallel = None
    logTransform = False
    parallelAll = False
    chunkSize = 250000
    round2Size = 5000
    nTrees = 15000
    nodeSize = None
    mem = "8g"
    skipSecondRound = True
    minMAF = 0.0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o", "--impFile"):
            # "--impFile" is the long name declared in long_options_list.
            impFile = arg
        elif opt == "--phenotypeFileType":
            phenotypeFileType = int(arg)
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt == "--logTransform":
            logTransform = True
        elif opt == "--secondRound":
            skipSecondRound = False
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt == "--chunkSize":
            chunkSize = int(arg)
        elif opt == "--round2Size":
            round2Size = int(arg)
        elif opt == "--nTrees":
            nTrees = int(arg)
        elif opt == "--nodeSize":
            nodeSize = int(arg)
        elif opt == "--mem":
            mem = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt == "--minMAF":
            # "-m" belongs to --missingval, so only the long form selects this.
            minMAF = float(arg)
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # The phenotype index may only be left out when every phenotype is
    # submitted to the cluster.
    if parallel and parallelAll:
        requiredArgs = 2
    else:
        requiredArgs = 3
    if len(args) < requiredArgs:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    # A local run derives all its output file names from impFile; fail here
    # rather than after the data has been loaded and filtered.
    if not parallel and impFile is None:
        if helpShown == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    def runParallel(phed, phenotypeIndex):
        # Writes the PBS job script for one phenotype and submits it.
        # NOTE(review): -d, -m and --minMAF are not forwarded to the
        # submitted job, which therefore runs with their defaults -- confirm
        # that this is intended.
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir + "RF_" + parallel + "_" + phenName
        outFileName = impFileName

        # Cluster specific parameters
        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=120:00:00\n"
        shstr += "#PBS -l mem=" + mem + "\n"
        shstr += "\n"
        shstr += "#PBS -q cmb\n"
        shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"

        shstr += ("(python " + programDir + "RandomForest.py -o " + impFileName
                  + " --chunkSize " + str(chunkSize)
                  + " --nTrees " + str(nTrees)
                  + " --mem " + str(mem)
                  + " --round2Size " + str(round2Size))
        if nodeSize:
            shstr += " --nodeSize " + str(nodeSize) + " "
        if logTransform:
            shstr += " --logTransform "
        if not skipSecondRound:
            shstr += " --secondRound "
        shstr += " -a " + str(withArrayIds) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        try:
            f.write(shstr)
        finally:
            f.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  # Running on the cluster..
        # The phenotype file is read once and shared by all submitted jobs.
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                runParallel(phed, phenotypeIndex)
        else:
            runParallel(phed, int(args[2]))
        return

    phenotypeIndex = int(args[2])

    print("chunkSize: %s" % (chunkSize,))
    print("nTrees: %s" % (nTrees,))
    print("nodeSize: %s" % (nodeSize,))
    print("mem: %s" % (mem,))
    print("logTransform: %s" % (logTransform,))
    print("round2Size: %s" % (round2Size,))
    print("skipSecondRound: %s" % (skipSecondRound,))

    # Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

    # Loading phenotype data
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)

    # Pair every genotyped accession with the first phenotype row of the
    # same accession whose value is not 'NA'.
    sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i, acc1 in enumerate(snpsds[0].accessions):
        for j, acc2 in enumerate(phed.accessions):
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    # Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%s accessions removed, leaving %s accessions in all." % (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

    print("Filtering phenotype data.")
    phed.removeAccessions(phenAccIndicesToKeep)  # Removing accessions that don't have genotypes or phenotype values

    # Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    # Log-transforming
    if logTransform:
        print("Log transforming phenotype")
        phed.logTransform(phenotype)

    # Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

    # Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    # Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print("")
    snpsds = newSnpsds

    # Writing files
    # NOTE(review): the two temporary files are never deleted -- confirm
    # whether they are meant to be kept for rerunning the R scripts by hand.
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)

    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()

    # Concatenate the SNPs of all data sets; data set number i is labelled
    # as chromosome i+1.
    chromasomes = []
    positions = []
    snps = []
    for i, snpsd in enumerate(snpsds):
        positions += snpsd.positions
        snps += snpsd.snps
        chromasomes += [i + 1] * len(snpsd.positions)

    # Is the phenotype binary?
    binary = phed.isBinary(phenotypeIndex)
    import util
    impFile = impFile + ".imp"
    rDataFile = impFile + ".rData"
    rFile = impFile + ".r"
    outRfile = rFile + ".out"
    errRfile = rFile + ".err"
    topImpFile = impFile + "_top" + str(chunkSize) + ".imp"
    topRDataFile = impFile + "_top.rData"
    for staleFile in (impFile, topImpFile):
        try:
            os.remove(staleFile)  # Removing file if it already exists.
        except OSError:
            print("Couldn't remove %s" % (staleFile,))

    for startIndex in range(0, len(positions), chunkSize):
        endIndex = min(startIndex + chunkSize, len(positions))

        # Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        try:
            for i in range(startIndex, endIndex):
                snp = util.valListToStrList(snps[i])
                tmpFile.write(str(chromasomes[i]) + "," + str(positions[i]) + "," + ",".join(snp) + "\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()
        print("Running model nr %s :" % (startIndex // chunkSize,))
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print("Random forest output saved in %s" % (impFile,))

    if not skipSecondRound:
        # Run on the top 'round2Size' number of hits.
        # Loading the R output file.
        impF = open(impFile, "r")
        try:
            lines = impF.readlines()
        finally:
            impF.close()

        # The first line is skipped (presumably a header -- TODO confirm).
        # NOTE(review): line i is paired with snps[i]; if line 0 is a header
        # this looks off by one (line 1 would describe snps[0]) -- verify
        # against the file layout written by the R script.
        impList = []
        for i in range(1, len(lines)):
            line = lines[i].strip()
            if not line:
                continue
            fields = line.split(",")
            impList.append((float(fields[2]), fields[0], fields[1], snps[i]))
        impList.sort(reverse=True)

        # Writing genotype data to file; there may be fewer than round2Size rows.
        tmpFile = open(genotypeTempFile, "w")
        try:
            for i in range(0, min(round2Size, len(impList))):
                snp = util.valListToStrList(impList[i][3])
                tmpFile.write(str(impList[i][1]) + "," + str(impList[i][2]) + "," + ",".join(snp) + "\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()
        print("Running randomForest on the top importance scores:")
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
예제 #4
0
def _run_():
    """
    Command line entry point: write an R script that plots SNP NA rates.

    Options and the input file (first positional argument) are read from
    sys.argv.  The input file is parsed with dataParsers.parseCSVData, the
    values returned by getSnpsNArates() of every parsed data set are
    collected, and an R script that stores them in snpsNARates and draws a
    histogram of them is written to the file given with -o.

    --comparisonFile, -b/--debug and -r/--report are accepted but are not
    used by this function.  Usage errors end the program through
    sys.exit(2).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["delim=", "missingval=", "withArrayId=", "comparisonFile=", "debug", "report", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ", "
    missingVal = "NA"
    comparisonFile = None
    debug = None
    report = None
    helpShown = 0
    withArrayIds = 0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "-o":
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug"):
            debug = 1
        elif opt in ("-r", "--report"):
            report = 1
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if helpShown == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # The input file is only looked up after the options have been handled,
    # so that e.g. "-h" alone does not fail on a missing positional argument.
    if not args:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    # Array ids are read for both -a 1 and -a 2.
    waid1 = withArrayIds in (1, 2)

    import dataParsers
    import snpsdata
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)

    # Calculating NA rates..
    print("Calculating NA rates")
    snpsNARates = []
    for snpsd in snpsds:
        snpsNARates += snpsd.getSnpsNArates()

    import util
    rstr = "snpsNARates <- c(" + ",".join(util.valListToStrList(snpsNARates)) + ")\n"
    rstr += 'hist(snpsNARates, xlab="NA rates", ylab="SNP frequency", breaks=60)'

    f = open(output_fname, "w")
    try:
        f.write(rstr)
    finally:
        f.close()
예제 #5
0
def _run_():
    """
    Command line entry point of the random forest analysis.

    Options and positional arguments (SNPs data file, phenotype data file,
    phenotype index) are read from sys.argv.  The output prefix is given
    with -o or --impFile.

    With --parallel a PBS job script is written and submitted with qsub for
    the given phenotype, or for every phenotype id of the phenotype file
    when --parallelAll is given as well (the phenotype index may then be
    omitted); the function returns afterwards.

    Otherwise the analysis runs locally:

      1. accessions without a phenotype value are removed and the phenotype
         data is ordered like the genotype data,
      2. monomorphic SNPs are filtered out, and with --minMAF also SNPs
         below that minor allele frequency,
      3. the SNPs are written chunkSize at a time to a temporary file and an
         R script produced by _generateRScript_ is run on every chunk,
      4. with --secondRound the round2Size highest scoring rows of the
         importance file are run through R once more.

    Usage errors end the program through sys.exit(2).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=",
        "withArrayId=", "logTransform", "phenotypeFileType=", "help",
        "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=",
        "secondRound", "minMAF=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h",
                                   long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    phenotypeFileType = 1  # parsed below, but not used by this function
    impFile = None
    delim = ","
    missingVal = "NA"
    helpShown = 0
    withArrayIds = 1
    parallel = None
    logTransform = False
    parallelAll = False
    chunkSize = 250000
    round2Size = 5000
    nTrees = 15000
    nodeSize = None
    mem = "8g"
    skipSecondRound = True
    minMAF = 0.0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o", "--impFile"):
            # "--impFile" is the long name declared in long_options_list.
            impFile = arg
        elif opt == "--phenotypeFileType":
            phenotypeFileType = int(arg)
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt == "--logTransform":
            logTransform = True
        elif opt == "--secondRound":
            skipSecondRound = False
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt == "--chunkSize":
            chunkSize = int(arg)
        elif opt == "--round2Size":
            round2Size = int(arg)
        elif opt == "--nTrees":
            nTrees = int(arg)
        elif opt == "--nodeSize":
            nodeSize = int(arg)
        elif opt == "--mem":
            mem = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt == "--minMAF":
            # "-m" belongs to --missingval, so only the long form selects this.
            minMAF = float(arg)
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # The phenotype index may only be left out when every phenotype is
    # submitted to the cluster.
    if parallel and parallelAll:
        requiredArgs = 2
    else:
        requiredArgs = 3
    if len(args) < requiredArgs:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    # A local run derives all its output file names from impFile; fail here
    # rather than after the data has been loaded and filtered.
    if not parallel and impFile is None:
        if helpShown == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    def runParallel(phed, phenotypeIndex):
        # Writes the PBS job script for one phenotype and submits it.
        # NOTE(review): -d, -m and --minMAF are not forwarded to the
        # submitted job, which therefore runs with their defaults -- confirm
        # that this is intended.
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir + "RF_" + parallel + "_" + phenName
        outFileName = impFileName

        # Cluster specific parameters
        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=50:00:00\n"
        shstr += "#PBS -l mem=" + mem + "\n"
        shstr += "\n"
        shstr += "#PBS -q cmb\n"
        shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"

        shstr += ("(python " + programDir + "RandomForest.py -o " + impFileName
                  + " --chunkSize " + str(chunkSize)
                  + " --nTrees " + str(nTrees)
                  + " --mem " + str(mem)
                  + " --round2Size " + str(round2Size))
        if nodeSize:
            shstr += " --nodeSize " + str(nodeSize) + " "
        if logTransform:
            shstr += " --logTransform "
        if not skipSecondRound:
            shstr += " --secondRound "
        shstr += " -a " + str(withArrayIds) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        try:
            f.write(shstr)
        finally:
            f.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  # Running on the cluster..
        # The phenotype file is read once and shared by all submitted jobs.
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                               delimiter='\t')
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                runParallel(phed, phenotypeIndex)
        else:
            runParallel(phed, int(args[2]))
        return

    phenotypeIndex = int(args[2])

    print("chunkSize: %s" % (chunkSize,))
    print("nTrees: %s" % (nTrees,))
    print("nodeSize: %s" % (nodeSize,))
    print("mem: %s" % (mem,))
    print("logTransform: %s" % (logTransform,))
    print("round2Size: %s" % (round2Size,))
    print("skipSecondRound: %s" % (skipSecondRound,))

    # Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=withArrayIds)

    # Loading phenotype data
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)

    # Pair every genotyped accession with the first phenotype row of the
    # same accession whose value is not 'NA'.
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " +
        phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i, acc1 in enumerate(snpsds[0].accessions):
        for j, acc2 in enumerate(phed.accessions):
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    # Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%s accessions removed, leaving %s accessions in all." %
          (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

    print("Filtering phenotype data.")
    # Removing accessions that don't have genotypes or phenotype values
    phed.removeAccessions(phenAccIndicesToKeep)

    # Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    # Log-transforming
    if logTransform:
        print("Log transforming phenotype")
        phed.logTransform(phenotype)

    # Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

    # Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    # Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print("")
    snpsds = newSnpsds

    # Writing files
    # NOTE(review): the two temporary files are never deleted -- confirm
    # whether they are meant to be kept for rerunning the R scripts by hand.
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)

    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()

    # Concatenate the SNPs of all data sets; data set number i is labelled
    # as chromosome i+1.
    chromasomes = []
    positions = []
    snps = []
    for i, snpsd in enumerate(snpsds):
        positions += snpsd.positions
        snps += snpsd.snps
        chromasomes += [i + 1] * len(snpsd.positions)

    # Is the phenotype binary?
    binary = phed.isBinary(phenotypeIndex)
    import util
    impFile = impFile + ".imp"
    rDataFile = impFile + ".rData"
    rFile = impFile + ".r"
    outRfile = rFile + ".out"
    errRfile = rFile + ".err"
    topImpFile = impFile + "_top" + str(chunkSize) + ".imp"
    topRDataFile = impFile + "_top.rData"
    for staleFile in (impFile, topImpFile):
        try:
            os.remove(staleFile)  # Removing file if it already exists.
        except OSError:
            print("Couldn't remove %s" % (staleFile,))

    for startIndex in range(0, len(positions), chunkSize):
        endIndex = min(startIndex + chunkSize, len(positions))

        # Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        try:
            for i in range(startIndex, endIndex):
                snp = util.valListToStrList(snps[i])
                tmpFile.write(str(chromasomes[i]) + "," + str(positions[i]) +
                              "," + ",".join(snp) + "\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 impFile,
                                 rDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()
        print("Running model nr %s :" % (startIndex // chunkSize,))
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print("Random forest output saved in %s" % (impFile,))

    if not skipSecondRound:
        # Run on the top 'round2Size' number of hits.
        # Loading the R output file.
        impF = open(impFile, "r")
        try:
            lines = impF.readlines()
        finally:
            impF.close()

        # The first line is skipped (presumably a header -- TODO confirm).
        # NOTE(review): line i is paired with snps[i]; if line 0 is a header
        # this looks off by one (line 1 would describe snps[0]) -- verify
        # against the file layout written by the R script.
        impList = []
        for i in range(1, len(lines)):
            line = lines[i].strip()
            if not line:
                continue
            fields = line.split(",")
            impList.append((float(fields[2]), fields[0], fields[1], snps[i]))
        impList.sort(reverse=True)

        # Writing genotype data to file; there may be fewer than round2Size rows.
        tmpFile = open(genotypeTempFile, "w")
        try:
            for i in range(0, min(round2Size, len(impList))):
                snp = util.valListToStrList(impList[i][3])
                tmpFile.write(str(impList[i][1]) + "," + str(impList[i][2]) +
                              "," + ",".join(snp) + "\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 topImpFile,
                                 topRDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()
        print("Running randomForest on the top importance scores:")
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)