Пример #1
0
def replaceRelations(relations, FASTAHeaders=None):
    """Swap the second element of each relation for its FASTA header.

    Parameters:
        relations: list of rows; row[0] is kept as is, row[1] is the
            identifier that is looked up in FASTAHeaders.
        FASTAHeaders: list of rows [identifier, header]. Optional; when it is
            omitted or empty no replacement takes place.

    Returns:
        When FASTAHeaders is empty: the relations sorted with
        stats.sortByIndex(relations, 1).
        Otherwise: a new list of [relation[0], header] rows. Relations whose
        identifier is not found in FASTAHeaders are dropped.
    """
    # None instead of a mutable [] default, so no list object is shared
    # between calls.
    if FASTAHeaders is None:
        FASTAHeaders = []

    relationsSorted = stats.sortByIndex(relations, 1)
    FASTAHeadersSorted = stats.sortByIndex(FASTAHeaders, 0)

    if len(FASTAHeadersSorted) == 0:
        return relationsSorted

    newRelations = []
    for relation in relationsSorted:
        # the headers were sorted just above, so filterByElement is told
        # not to sort them again
        searchResult = stats.filterByElement(FASTAHeadersSorted,
                                             relation[1],
                                             index=0,
                                             sort=False)
        if len(searchResult) > 0:
            # only the first match is used
            newRelations.append([relation[0], searchResult[0][1]])

    return newRelations
Пример #2
0
def associateElements(inStats="", uFile="", relFile=""):
    """Pair every higher-level element with its lower-level elements.

    Parameters:
        inStats: path of the stats data file (read with
            stats.loadStatsDataFile); column 3 of each row is taken as the
            lower-level identifier.
        uFile: path of the file listing the higher-level elements (read with
            stats.load2stringList). Recognised headers are
            ['id', 'Z', 'n'], ['id', 'n', 'Z', 'FDR'] and
            ['id', 'n', 'Z', 'FDR', 'X']; with any other first row, every
            row (including the first) is treated as a bare identifier.
        relFile: path of the relations file (read with
            stats.loadRelationsFile); column 0 is the higher-level id and
            column 1 the lower-level id.

    Returns:
        A tuple (results, elementList, ""):
        results: list of [higherId, lowerIds]. lowerIds holds the related
            lower-level ids that are present in the stats data, passed
            through stats.sortByIndex(..., 0); it is None when the higher id
            has no relations at all (ids that are blank are skipped).
        elementList: list of [id, n, Z, FDR, X] rows, with float("nan") for
            the values the input file does not provide.

    Exits the program (sys.exit with a message) when uFile yields no rows.
    """
    relations = stats.loadRelationsFile(relFile)
    relations = stats.sortByIndex(relations, 0)

    statsData = stats.loadStatsDataFile(inStats)

    higherElements = stats.load2stringList(uFile, removeCommas=True)

    # WARNING! higherElements must be a list of lists
    # with each sublist being id, n, Z, FDR, X
    if not higherElements:
        sms = "ERROR: higherElements is empty. The higherElements must be a list of lists with each sublist being id, n, Z, FDR, X"
        sys.exit(sms)

    fileHeader = higherElements[0]
    elementList = []
    if fileHeader == ['id', 'Z', 'n']:
        # this means the list comes from SanXoTSqueezer, so the header is
        # skipped and the columns are switched to id, n, Z, FDR, X
        for element in higherElements[1:]:
            elementList.append([element[0], element[2], element[1], float("nan"), float("nan")])
    elif fileHeader == ['id', 'n', 'Z', 'FDR']:
        # this means it does not contain X, so a nan is put on its place
        for element in higherElements[1:]:
            elementList.append([element[0], element[1], element[2], element[3], float("nan")])
    elif fileHeader == ['id', 'n', 'Z', 'FDR', 'X']:
        for element in higherElements[1:]:
            elementList.append([element[0], element[1], element[2], element[3], element[4]])
    else:
        # unknown layout: there is no header to skip and only the id is known
        for element in higherElements:
            elementList.append([element[0], float("nan"), float("nan"), float("nan"), float("nan")])

    relationsFirstColumn = stats.extractColumns(relations, 0)
    relationsSecondColumn = stats.extractColumns(relations, 1)
    # lower-level ids present in the stats data, in file order
    lowerIdsInStats = stats.extractColumns(statsData, 3)

    results = []
    for uElement in elementList:
        higherId = uElement[0]
        first = stats.firstIndex(relationsFirstColumn, higherId)

        if first > -1:  # -1 means it is not in the list
            last = stats.lastIndex(relationsFirstColumn, higherId)
            # relations are sorted by column 0, so every relation of this
            # higher id lies in first..last ("+ 1" includes the last one)
            candidates = relationsSecondColumn[first:last + 1]
            lowerElementList = [
                element for element in candidates
                if stats.firstIndex(lowerIdsInStats, element) > -1
            ]
            lowerElementList = stats.sortByIndex(lowerElementList, 0)
            results.append([higherId, lowerElementList])
        elif len(higherId.strip()) > 0:
            results.append([higherId, None])

    return results, elementList, ""
Пример #3
0
def getMADDistribution(nextIdX,
                       mergedData,
                       k,
                       variance,
                       alpha,
                       medianSide=100,
                       showGraph=False,
                       verbose=False):
    """Compute a MAD-based value for every scan, using a running median.

    For each row of nextIdX the scans of mergedData sharing its sequence are
    collected; sequences with a single scan are skipped. Every remaining scan
    produces a row
        [sequence, scan number, Xi - Xj, weight, degrees of freedom,
         |Xi - Xj| * sqrt(n / (n - 1)), running median, MAD value]
    The rows are sorted by weight, the median of column 5 is taken over a
    window of medianSide rows on each side, the first and last medianSide
    rows reuse the nearest computed median, and the MAD value is
    1 / (MADconstant * median) ** 2.

    Parameters:
        nextIdX: rows whose index 0 is the sequence and index 1 the value
            subtracted from each scan value.
        mergedData: rows with scan number at index 1, value at index 2 and
            weight at index 3; it must arrive already sorted, because
            stats.filterByElement is called with sort=False.
        k, variance: only printed when verbose is True.
        alpha, showGraph: not used by this function.
        medianSide: number of rows on each side of the median window.
        verbose: print diagnostics.

    Returns:
        (MADDistribution, distrWeight): two parallel lists with the MAD value
        and the weight of every row, in the order given by
        stats.sortByIndex(rows, 3).

    Calls sys.exit() when there are not enough rows to fill one window.
    NOTE(review): a window whose median is 0 still raises ZeroDivisionError.
    """
    MADconstant = 1.48260221850560  # *** 1 / DISTR.NORM.ESTAND.INV(3 / 4) get exact number

    newlist = []
    for orow in nextIdX:
        sequence = orow[0]
        # it is important to avoid sorting to keep it fast, so do not forget
        # sort=False; mergedData should arrive here already sorted
        scanListWithSequence = stats.filterByElement(mergedData,
                                                     sequence,
                                                     sort=False)
        degreesOfFreedom = len(scanListWithSequence)
        if degreesOfFreedom <= 1:
            # otherwise Xi = Xj --> Xi - Xj = 0 --> does not work
            continue

        correction = sqrt(
            float(degreesOfFreedom) / float(degreesOfFreedom - 1))
        for scanRow in scanListWithSequence:
            XiXj = scanRow[2] - orow[1]
            newlist.append([
                sequence,  # sequence = 0
                scanRow[1],  # scan number = 1
                XiXj,  # Xi - Xj = 2
                scanRow[3],  # weight (the V) = 3
                degreesOfFreedom,  # degrees of freedom = 4
                fabs(XiXj) * correction,  # = 5
                0,  # space to save the median = 6
                0,  # space to save the MAD formula = 7
            ])

    newlist = stats.sortByIndex(newlist, 3)  # sort by weight
    total = len(newlist)

    # One complete window needs 2 * medianSide + 1 rows. With exactly
    # 2 * medianSide rows no median would be computed, every median would
    # stay 0 and the MAD formula would divide by zero.
    rowsNeeded = 2 * medianSide + 1 if medianSide > 0 else 0
    if total < rowsNeeded:
        if verbose:
            print('Not enough data to perform statistics,')
            print('len(newlist) = %s, while medianSide = %s' %
                  (str(total), str(medianSide)))
        sys.exit()

    # get the running median
    for i in range(medianSide, total - medianSide):
        window = newlist[i - medianSide:i + medianSide + 1]
        newlist[i][6] = stats.medianByIndex(window, 5)

    # fill the borders with the nearest computed median: index medianSide is
    # the first one and total - medianSide - 1 the last one
    if medianSide > 0:
        firstMedian = newlist[medianSide][6]
        for row in newlist[:medianSide]:
            row[6] = firstMedian

        lastMedian = newlist[total - medianSide - 1][6]
        for row in newlist[total - medianSide:]:
            row[6] = lastMedian

    # fill MAD formula
    MADDistribution = []
    distrWeight = []
    for row in newlist:
        row[7] = 1 / (MADconstant * row[6])**2
        MADDistribution.append(row[7])
        distrWeight.append(row[3])

    if verbose:
        print('k = %f, var = %f' % (k, variance))

    return MADDistribution, distrWeight
Пример #4
0
def getMADDistribution(nextIdX,
                       mergedData,
                       k,
                       variance,
                       alpha,
                       medianSide=100,
                       showGraph=False,
                       verbose=False):
    """Return the MAD-based distribution and the weights of all scans.

    Scans of mergedData are grouped by the sequences listed in nextIdX
    (groups with a single scan are ignored). Each scan becomes a row
        0 sequence, 1 scan number, 2 Xi - Xj, 3 weight,
        4 degrees of freedom, 5 |Xi - Xj| * sqrt(n / (n - 1)),
        6 running median of column 5, 7 1 / (MADconstant * median) ** 2
    Rows are sorted by weight before the running median (medianSide rows on
    each side) is computed; rows closer than medianSide to either end take
    the median of the nearest row that has a complete window.

    Parameters:
        nextIdX: rows with the sequence at index 0 and the reference value
            at index 1.
        mergedData: rows with scan number, value and weight at indexes 1, 2
            and 3; expected to be already sorted (filterByElement is called
            with sort=False).
        k, variance: only used for the verbose printout.
        alpha, showGraph: accepted but unused here.
        medianSide: half-width of the median window.
        verbose: whether to print diagnostics.

    Returns:
        (MADDistribution, distrWeight), two lists of the same length.

    Calls sys.exit() when fewer rows than one complete window are available.
    NOTE(review): a median equal to 0 still ends in ZeroDivisionError.
    """
    MADconstant = 1.48260221850560  # *** 1 / DISTR.NORM.ESTAND.INV(3 / 4) get exact number

    rows = []
    for orow in nextIdX:
        sequence = orow[0]
        # do not forget sort=False: sorting here would be slow and the data
        # should arrive already sorted
        scans = stats.filterByElement(mergedData, sequence, sort=False)

        if len(scans) > 1:  # otherwise Xi = Xj --> Xi - Xj = 0 --> does not work
            degreesOfFreedom = len(scans)
            for scanRow in scans:
                weight = scanRow[3]  # the V
                XiXj = scanRow[2] - orow[1]
                corrected = fabs(XiXj) * sqrt(
                    float(degreesOfFreedom) / (float(degreesOfFreedom - 1)))
                # columns 6 and 7 are placeholders for median and MAD value
                rows.append([sequence, scanRow[1], XiXj, weight,
                             degreesOfFreedom, corrected, 0, 0])

    rows = stats.sortByIndex(rows, 3)  # sort by weight
    rowCount = len(rows)

    # A full window spans 2 * medianSide + 1 rows; with only 2 * medianSide
    # rows no median is ever computed and the MAD formula divides by zero.
    if medianSide > 0 and rowCount < 2 * medianSide + 1:
        if verbose:
            print('Not enough data to perform statistics,')
            print('len(newlist) = %s, while medianSide = %s' % (str(rowCount), str(medianSide)))
        sys.exit()

    firstFull = medianSide  # first row with a complete window
    lastFull = rowCount - medianSide - 1  # last row with a complete window

    for i in range(firstFull, lastFull + 1):
        window = rows[i - medianSide:i + medianSide + 1]
        rows[i][6] = stats.medianByIndex(window, 5)

    # fill the borders from the nearest complete window; the left border
    # must read index medianSide (not medianSide + 1), mirroring the right
    for i in range(firstFull):
        rows[i][6] = rows[firstFull][6]

    for i in range(lastFull + 1, rowCount):
        rows[i][6] = rows[lastFull][6]

    # fill MAD formula
    for row in rows:
        row[7] = 1 / (MADconstant * row[6]) ** 2

    if verbose:
        print('k = %f, var = %f' % (k, variance))

    MADDistribution = [row[7] for row in rows]
    distrWeight = [row[3] for row in rows]
    return MADDistribution, distrWeight
Пример #5
0
def main(options, programName, programVersion):
    """Build a relations file from a table of accession numbers and categories.

    Parameters:
        options: argument parser to which this function adds its options
            (via add_argument) and which it then uses to parse the command
            line (via parse_args).
        programName: program name; written to the log and, lower-cased, used
            as fallback analysis name.
        programVersion: version string written to the log.

    The relations obtained from the input table (getRelations) are sorted,
    appended to the rows of the optional previous relations file and saved,
    together with a log file. When a file cannot be written, the error is
    added to the log and the program exits with status 2.
    """

## REGION: DEFAULT VALUES AND VARIABLE ACQUISITION

    defaultAnalysisName = programName.lower()
    header = "idsup\tidinf"
    previousList = []

    # default extensions
    defaultTableExtension = ".tsv"
    defaultTextExtension = ".txt"

    # default file names
    defaultLogFile = "logFile"
    defaultRelFile = "rels"

    # basic log file
    logList = [[programName + " " + programVersion], ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

    # parsing arguments from commandline
    options.add_argument("-a", "--analysis", type = str, default = "", required = True, help = "Use a prefix for the output files.")
    options.add_argument("-p", "--place", type = str, default = "", required = True, help = "To use a different common folder for the output files. If this is not provided, the the folder used will be the same as the FASTA file folder.")
    options.add_argument("-L", "--logfile", type = str, default = "", required = False, help = "To use a non-default name for the log file.")
    options.add_argument("-d", "--dbfile", type = str, default = "", required = True, help = "The input file containing accession numbers and categories.")
    options.add_argument("-x", "--previousfile", type = str, default = "", required = False, help = "An optional relation file to which concatenate resulting relations (if omitted, a new file will be produced).")
    options.add_argument("-q", "--accnumcol", type = str, default = "1", required = False, help = "Column where accession numbers of genes/proteins are. First column is 1. Default is 1.")
    options.add_argument("-c", "--categorycol", type = str, default = "2", required = False, help = "Column where categories are. First column is 1. Default is 2.")
    options.add_argument("-f", "--prefix", type = str, default = "", required = False, help = "Prefix to add to all categories found in this parsing (such as \"GO-full_\", \"Panther_\", or \"KEGG=2017-01-10_\".")
    options.add_argument("--fasta", type = str, default = "", required = False, help = "FASTA file contaning the identifiers we want to replace by FASTA headers in the final file. Note that identifiers not appearing in this FASTA file will be removed from the final list.")

    arguments = options.parse_args()

    # copying parsed arguments (every string option defaults to "")
    analysisName = arguments.analysis
    analysisFolder = arguments.place
    logFile = arguments.logfile
    DBFile = arguments.dbfile
    FASTAFile = arguments.fasta
    previousFile = arguments.previousfile
    catPrefix = arguments.prefix
    # an explicitly empty column option falls back to the default column
    accNumCol = int(arguments.accnumcol) if len(arguments.accnumcol) > 0 else 1
    catCol = int(arguments.categorycol) if len(arguments.categorycol) > 0 else 2

## END REGION: DEFAULT VALUES AND VARIABLE ACQUISITION
## **********************************************************
## REGION: FILE NAMES SETUP

    if len(analysisName) == 0:
        if len(DBFile) > 0:
            analysisName = os.path.splitext(os.path.basename(DBFile))[0]
        else:
            analysisName = defaultAnalysisName

    # an analysis name containing a folder provides the folder when none
    # was given explicitly
    if len(os.path.dirname(analysisName)) > 0:
        analysisNameFirstPart = os.path.dirname(analysisName)
        analysisName = os.path.basename(analysisName)
        if len(analysisFolder) == 0:
            analysisFolder = analysisNameFirstPart

    # input: file names without a folder are looked up in the analysis folder
    if len(os.path.dirname(DBFile)) == 0:
        DBFile = os.path.join(analysisFolder, DBFile)

    if len(previousFile) > 0 and len(os.path.dirname(previousFile)) == 0:
        previousFile = os.path.join(analysisFolder, previousFile)

    if len(FASTAFile) > 0 and len(os.path.dirname(FASTAFile)) == 0:
        FASTAFile = os.path.join(analysisFolder, FASTAFile)

    # output
    if len(logFile) == 0:
        logFile = os.path.join(analysisFolder, analysisName + "_" + defaultLogFile + defaultTextExtension)
    if len(os.path.dirname(logFile)) == 0 and len(os.path.basename(logFile)) > 0:
        logFile = os.path.join(analysisFolder, logFile)

    # there is no option for the relations file, so it always gets the
    # default name
    relFile = os.path.join(analysisFolder, analysisName + "_" + defaultRelFile + defaultTableExtension)

    logList.append([""])
    logList.append(["Input table with categories and proteins: " + DBFile])
    if len(previousFile) > 0:
        logList.append(["Previous file to which new qc relations are added: " + previousFile])
    if len(FASTAFile) > 0:
        logList.append(["FASTA file to replace identifiers for FASTA headers: " + FASTAFile])
    logList.append(["Category column: %i, protein column: %i" % (catCol, accNumCol)])
    logList.append(["Prefix added to categories: " + catPrefix])
    logList.append(["Output relations file: " + relFile])
    logList.append(["Output log file: " + logFile])
    logList.append([""])

## END REGION: FILE NAMES SETUP
## **********************************************************
## REGION: PROGRAM BASIC STRUCTURE

    if len(previousFile) > 0:  # otherwise, previousList = []
        previousList = stats.load2stringList(previousFile, removeCommas = True)
        # no header is passed to saveFile when a previous file is loaded
        header = ""

    AccNum2FASTAHeader = getFASTAHeaders(FASTAFile)
    gc.collect()

    DBList = stats.load2stringList(DBFile, removeCommas = True)
    newRelations = getRelations(bigTable = DBList,
                                qCol = accNumCol,
                                cCol = catCol,
                                cPrefix = catPrefix,
                                FASTAHeaders = AccNum2FASTAHeader)
    newRelationsSorted = stats.sortByIndex(newRelations, 0)
    relationList = previousList + newRelationsSorted

    gc.collect()

## END REGION: PROGRAM BASIC STRUCTURE
## **********************************************************
## REGION: SAVING FILES

    try:
        stats.saveFile(relFile, relationList, header)

        logList.append(["Everything went fine."])
        stats.saveFile(logFile, logList, "LOG FILE")
    except (IOError, OSError) as error:
        # writing files fails with I/O errors, never with getopt.GetoptError,
        # so this is the exception that has to be reported
        logList.append(["Error."])
        logList.append([str(error)])
        stats.saveFile(logFile, logList, "LOG FILE")
        sys.exit(2)
Пример #6
0
def main(argv):
    """Pick the experiment with the median variance and copy its files.

    Parameters:
        argv: list of command line arguments (without the program name).

    The info files matching "<prefix>*_infoFile.txt" in the analysis folder
    are read, the one whose variance is the median is selected
    (getMedianIndex), and the files sharing its prefix (and, optionally, the
    extra prefix) are handed to copyFilesWithPrefix with the median tag.
    A log of the process is saved at the end.

    Exits with status 2 when the options cannot be parsed or when no info
    file with a usable variance is found.
    """
    version = "v0.05"
    analysisName = ""
    analysisFolder = ""
    logFile = ""

    # in data
    prefix = ""
    extraPrefix = ""
    medianTag = "med"

    # default filenames
    defaultInfoFileSuffix = "_infoFile.txt"
    defaultLogFile = "logFile"
    defaultAnalysisName = "medianSelection"

    # default extensions
    defaultTextExtension = ".txt"

    logList = [["Anselmo " + version], ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

    # "place=" and "advanced-help" are listed because the loop below handles
    # --place and --advanced-help; without them getopt rejects both
    longOptions = ["analysis=", "folder=", "place=", "prefix=",
                   "extraprefix=", "mediantag=", "logfile=", "help",
                   "advanced-help", "egg", "easteregg"]
    try:
        opts, args = getopt.getopt(argv, "a:p:f:g:m:L:hH", longOptions)
    except getopt.GetoptError as error:
        # no log file name is known yet, so the problem is reported on stderr
        sys.stderr.write("Error while getting parameters: %s\n" % str(error))
        sys.exit(2)

    if len(opts) == 0:
        printHelp(version, True)
        sys.exit()

    for opt, arg in opts:
        if opt in ("-a", "--analysis"):
            analysisName = arg
        elif opt in ("-p", "--place", "--folder"):
            analysisFolder = arg
        elif opt in ("-f", "--prefix"):
            prefix = arg
        elif opt in ("-g", "--extraprefix"):
            extraPrefix = arg
        elif opt in ("-L", "--logfile"):
            logFile = arg
        elif opt in ("-m", "--mediantag"):
            medianTag = arg
        elif opt in ("-h", "--help"):
            printHelp(version)
            sys.exit()
        elif opt in ("-H", "--advanced-help"):
            printHelp(version, advanced = True)
            sys.exit()
        elif opt in ("--egg", "--easteregg"):
            easterEgg()
            sys.exit()

# REGION: FILE NAMES SETUP

    if len(analysisName) == 0:
        analysisName = defaultAnalysisName

    # an analysis name containing a folder provides the folder when none
    # was given explicitly
    if len(os.path.dirname(analysisName)) > 0:
        analysisNameFirstPart = os.path.dirname(analysisName)
        analysisName = os.path.basename(analysisName)
        if len(analysisFolder) == 0:
            analysisFolder = analysisNameFirstPart

    # next "if" disables extra copy when extraPrefix is same as prefix
    if len(extraPrefix) > 0 and extraPrefix == prefix:
        extraPrefix = ""

    if len(logFile) == 0:
        logFile = os.path.join(analysisFolder, analysisName + "_" + defaultLogFile + defaultTextExtension)

# END REGION: FILE NAMES SETUP

    # get infoFile list
    infoFileList = glob.glob(os.path.join(analysisFolder, prefix + "*" + defaultInfoFileSuffix))

    logList.append([])
    logList.append(["Folder = " + analysisFolder])
    logList.append([])
    logList.append(["Info files with prefix \"%s\"" % prefix])

    varList = []
    for varFile in infoFileList:
        variance, varianceOk = stats.extractVarianceFromVarFile(varFile, verbose = False)
        if varianceOk:
            varList.append([varFile, variance])

    if len(varList) == 0:
        # nothing to choose from: indexing varList below would fail
        logList.append(["No info file with a valid variance was found."])
        stats.saveFile(logFile, logList, "INFO FILE")
        sys.exit(2)

    # get info file with median variance
    varList = stats.sortByIndex(varList, 1)
    medianVariance = stats.medianByIndex(varList, 1)
    medianIndex = getMedianIndex(varList = varList, variance = medianVariance)

    for element in varList:
        logLine = "%s, variance = %f" % (os.path.basename(element[0]), element[1])
        if element[0] == varList[medianIndex][0]:
            logLine += " [taken]"
        logList.append([logLine])

    # get prefix of median experiment: the part of the file name between the
    # prefix and the info file suffix identifies the experiment
    medianInfoFile = os.path.basename(varList[medianIndex][0])
    randTag = medianInfoFile[len(prefix):len(medianInfoFile) - len(defaultInfoFileSuffix)]
    medianPrefix = prefix + randTag
    extraMedianPrefix = ""
    if len(extraPrefix) > 0:
        extraMedianPrefix = extraPrefix + randTag

    # get file list with specific prefix
    medianExperimentFileList = glob.glob(os.path.join(analysisFolder, medianPrefix + "*.*"))
    extraPrefixFileList = []
    if len(extraMedianPrefix) > 0:
        extraPrefixFileList = glob.glob(os.path.join(analysisFolder, extraMedianPrefix + "*.*"))

    # copy files including median tag
    extraLogList = copyFilesWithPrefix(fileList = medianExperimentFileList,
                                       folder = analysisFolder,
                                       prefix = prefix,
                                       message = "Renamed files:",
                                       tag = medianTag)
    logList.extend(extraLogList)

    if len(extraPrefixFileList) > 0:
        extraLogList = copyFilesWithPrefix(fileList = extraPrefixFileList,
                                           folder = analysisFolder,
                                           prefix = extraPrefix,
                                           message = "Renamed extra files:",
                                           tag = medianTag)
        logList.extend(extraLogList)

    # save logFile
    stats.saveFile(logFile, logList, "INFO FILE")
Пример #7
0
def getRels(qcInputFile = "", listChangingCats = None, qcInputNoOutsFile = "", modeSanXoTSieve = "newWay", caseSensitive = True, outlierTag = "out"):
    """Count relations and outliers for changing and non-changing categories.

    Parameters:
        qcInputFile: stats data file with all the relations (read with
            stats.loadStatsDataFile including tags); columns 0, 3 and 9 of
            each row are used (9 being the tags).
        listChangingCats: list of category names regarded as changing.
            Optional; defaults to an empty list.
        qcInputNoOutsFile: stats data file without the outliers; only used
            in "oldWay" mode. When it is empty the mode is forced to
            "newWay".
        modeSanXoTSieve: "newWay" detects outliers through outlierTag in the
            tags column; "oldWay" regards as outlier any relation missing
            from qcInputNoOutsFile.
        caseSensitive: when False, categories and relations are compared in
            lower case.
        outlierTag: tag marking outliers in "newWay" mode.

    Returns:
        (numRelsChangingCats, numOutliersChangingCats,
        numOutliersNonChangingCats). All three are 0 for any other mode.
    """
    # None instead of a mutable [] default
    if listChangingCats is None:
        listChangingCats = []

    numRelsChangingCats = 0
    numOutliersChangingCats = 0
    numOutliersNonChangingCats = 0

    # when no qcInputFileNoOuts file is present, the newWay option is used
    # this has already been sorted previously, but just in case...
    if len(qcInputNoOutsFile) == 0: modeSanXoTSieve = "newWay"

    qcInputRawList = stats.loadStatsDataFile(qcInputFile, FDRasText = True, ZasText = True, includeTags = True)

    # each category is nested within a list to make it work with the
    # filterByElement method
    if caseSensitive:
        listChangingCatsList = [[cat] for cat in listChangingCats]
    else:
        listChangingCatsList = [[cat.lower()] for cat in listChangingCats]

    # important NOT to sort listChangingCats itself, as this is not a list of
    # lists and sorting would only affect the first character instead of the
    # first string
    listChangingCatsSortedList = stats.sortByIndex(listChangingCatsList, 0)

    # get list of rels, nested in the same way
    if caseSensitive:
        qcInput = [[[qc[0], qc[3], qc[9]]] for qc in qcInputRawList]
    else:
        qcInput = [[[qc[0].lower(), qc[3].lower(), qc[9]]] for qc in qcInputRawList]

    qcInputSortedList = stats.sortByIndex(qcInput, 0)

    if modeSanXoTSieve == "newWay":

        for qc in qcInputSortedList:
            relation = qc[0]
            pointsToChangingCat = len(stats.filterByElement(listChangingCatsSortedList, relation[0], sort = False)) > 0
            isOutlier = stats.tagIsPresent(relation[2], outlierTag)

            if pointsToChangingCat:
                numRelsChangingCats += 1
                if isOutlier:
                    numOutliersChangingCats += 1
            elif isOutlier:
                numOutliersNonChangingCats += 1

    elif modeSanXoTSieve == "oldWay":

        # qcInputNoOutsFile is never empty here: the mode would have been
        # switched to "newWay" above
        qcInputNoOutsRawList = stats.loadStatsDataFile(qcInputNoOutsFile, FDRasText = True, ZasText = True, includeTags = False)

        if caseSensitive:
            qcInputNoOuts = [[[qcno[0], qcno[3]]] for qcno in qcInputNoOutsRawList]
        else:
            qcInputNoOuts = [[[qcno[0].lower(), qcno[3].lower()]] for qcno in qcInputNoOutsRawList]

        qcInputNoOutsSortedList = stats.sortByIndex(qcInputNoOuts, 0)

        # print() calls that behave the same under Python 2 and Python 3
        print("")
        print("calculating with %i relations and %i changing categories..." % (len(qcInputSortedList), len(listChangingCats)))

        for qc in qcInputSortedList:
            relation = qc[0]
            # better do not use something like "if x in list..." because
            # that is quite slow
            pointsToChangingCat = len(stats.filterByElement(listChangingCatsSortedList, relation[0], sort = False)) > 0
            # a relation missing from the file without outliers is an
            # outlier; the [0:2] part is to remove the space for tags
            isOutlier = len(stats.filterByElement(qcInputNoOutsSortedList, relation[0:2], sort = False)) == 0

            if pointsToChangingCat:
                numRelsChangingCats += 1
                if isOutlier:
                    numOutliersChangingCats += 1
            elif isOutlier:
                numOutliersNonChangingCats += 1

    return numRelsChangingCats, numOutliersChangingCats, numOutliersNonChangingCats
Пример #8
0
def main(argv):
    """Filter the higher-level stats by number of elements and FDR (or Z).

    Parameters:
        argv: list of command line arguments (without the program name).

    The lower and higher stats files are loaded, filtered with
    filterNFDRorZ, sorted by column 1 and saved with the header
    "id, n, Z, FDR, X" (tab separated); a log file is written as well.

    Exits with status 2 when the options cannot be parsed or when loading,
    filtering or saving fails (the error is written to the log file first).
    """
    version = "v0.14"
    analysisName = ""
    defaultAnalysisName = "squeeze"
    analysisFolder = ""
    # parametres
    minimumElements = 2
    maximumElements = 1e6
    maximumFDR = 0.05
    minimumZ = 0.0  # take all by default
    filterByFDR = True  # if false, then it filters by abs(Z)
    # input files
    lowerStats = ""
    higherStats = ""
    defaultLowerStatsFile = "lower"
    defaultHigherStatsFile = "upper"
    defaultTableExtension = ".tsv"
    defaultTextExtension = ".txt"
    defaultOutputFile = "outList"
    defaultLogFile = "logFile"
    # output files
    logFile = ""
    outputFile = ""
    logList = [["SanXoTSqueezer " + version],
               ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

    # "upperstats=" and "folder=" are listed because the loop below handles
    # --upperstats and --folder; without them getopt rejects both
    longOptions = [
        "analysis=", "lowerstats=", "upperstats=", "logfile=", "outputfile=",
        "place=", "folder=", "minelements=", "maxelements=", "fdr=",
        "sigmas=", "help"
    ]
    try:
        opts, args = getopt.getopt(argv, "a:l:L:o:p:u:n:N:f:z:h", longOptions)
    except getopt.GetoptError as error:
        # no log file name is known yet, so the problem is reported on stderr
        sys.stderr.write("Error while getting parameters: %s\n" % str(error))
        sys.exit(2)

    if len(opts) == 0:
        printHelp(version)
        sys.exit()

    for opt, arg in opts:
        if opt in ("-a", "--analysis"):
            analysisName = arg
        elif opt in ("-p", "--place", "--folder"):
            analysisFolder = arg
        elif opt in ("-l", "--lowerstats"):
            lowerStats = arg
        elif opt in ("-u", "--upperstats"):
            higherStats = arg
        elif opt in ("-L", "--logfile"):
            logFile = arg
        elif opt in ("-o", "--outputfile"):
            outputFile = arg
        elif opt in ("-n", "--minelements"):
            minimumElements = int(arg)
        elif opt in ("-N", "--maxelements"):
            maximumElements = int(arg)
        elif opt in ("-f", "--fdr"):
            maximumFDR = float(arg)
        elif opt in ("-z", "--sigmas"):
            # giving a Z threshold switches the filter from FDR to abs(Z)
            filterByFDR = False
            minimumZ = float(arg)
        elif opt in ("-h", "--help"):
            printHelp(version)
            sys.exit()

# REGION: FILE NAMES SETUP

    if len(analysisName) == 0:
        if len(lowerStats) > 0:
            analysisName = os.path.splitext(os.path.basename(lowerStats))[0]
        else:
            analysisName = defaultAnalysisName

    # an analysis name containing a folder provides the folder when none
    # was given explicitly
    if len(os.path.dirname(analysisName)) > 0:
        analysisNameFirstPart = os.path.dirname(analysisName)
        analysisName = os.path.basename(analysisName)
        if len(analysisFolder) == 0:
            analysisFolder = analysisNameFirstPart

    # otherwise the folder of the lower stats file is used
    if len(lowerStats) > 0 and len(analysisFolder) == 0:
        if len(os.path.dirname(lowerStats)) > 0:
            analysisFolder = os.path.dirname(lowerStats)

    # input
    if len(lowerStats) == 0:
        lowerStats = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultLowerStatsFile + defaultTableExtension)

    if len(higherStats) == 0:
        higherStats = os.path.join(
            analysisFolder, analysisName + "_" + defaultHigherStatsFile +
            defaultTableExtension)

    if len(os.path.dirname(lowerStats)) == 0 and len(analysisFolder) > 0:
        lowerStats = os.path.join(analysisFolder, lowerStats)

    if len(os.path.dirname(higherStats)) == 0 and len(analysisFolder) > 0:
        higherStats = os.path.join(analysisFolder, higherStats)

    # output
    if len(outputFile) == 0:
        outputFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultOutputFile + defaultTableExtension)

    if len(logFile) == 0:
        logFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultLogFile + defaultTextExtension)

    if len(os.path.dirname(outputFile)) == 0 and len(
            os.path.basename(outputFile)) > 0:
        outputFile = os.path.join(analysisFolder, outputFile)

    if len(os.path.dirname(logFile)) == 0 and len(
            os.path.basename(logFile)) > 0:
        logFile = os.path.join(analysisFolder, logFile)

    logList.append([""])
    logList.append(["Lower input stats file: " + lowerStats])
    logList.append(["Higher input stats file: " + higherStats])
    logList.append(["Output list: " + outputFile])
    logList.append(["Output log file: " + logFile])
    logList.append(
        ["Minimum elements in higher category: " + str(minimumElements)])
    logList.append(
        ["Maximum elements in higher category: " + str(maximumElements)])
    logList.append(["Minimum z: " + str(minimumZ)])
    logList.append([""])

    # END REGION: FILE NAMES SETUP

    def abort(message, error):
        # record the failure in the log file and stop with status 2
        logList.append([message])
        logList.append([str(error)])
        stats.saveFile(logFile, logList, "LOG FILE")
        sys.exit(2)

    # The steps below never raise getopt.GetoptError, so catching that would
    # leave the error branches unreachable; any failure is logged instead.
    try:
        lowerData = stats.loadStatsDataFile(lowerStats)
    except Exception as error:
        abort("Error while getting lower data files.", error)
    logList.append(["Lower data files correctly loaded."])

    try:
        higherData = stats.loadStatsDataFile(higherStats)
    except Exception as error:
        abort("Error while getting higher data files.", error)
    logList.append(["Higher data files correctly loaded."])

    try:
        filteredList = filterNFDRorZ(lowerData,
                                     higherData,
                                     minN=minimumElements,
                                     maxN=maximumElements,
                                     minZ=minimumZ,
                                     maxFDR=maximumFDR,
                                     useFDR=filterByFDR)
        filteredList = stats.sortByIndex(filteredList, 1)
    except Exception as error:
        abort("Error while getting data filtered by N and Z.", error)
    logList.append(["Data correctly filtered."])

    try:
        stats.saveFile(outputFile, filteredList, "id\tn\tZ\tFDR\tX")
    except Exception as error:
        abort("Error while saving output data.", error)
    logList.append(["Output data correctly saved."])

    stats.saveFile(logFile, logList, "LOG FILE")