def calculatePrevalenceStatistic(data):
    """
    Populate a tabulated state variable with prevalence statistics.

    Walks every entry produced by generateCountList(data) and stores a
    "prevalence" value in ``data`` next to the counts it was derived from.
    Prevalence is the genotyped count of an entry divided by the sample
    size of its marker.

    Entries rejected by validateGenotypes are skipped and receive no
    "prevalence" key.  When either the genotyped count or the sample size
    is zero the prevalence is stored as 0 rather than dividing.

    ``data`` is modified in place; nothing is returned.
    """
    for dataElemList in generateCountList(data):
        # Elements 0-3 are used below as the nested keys into ``data``;
        # presumably (meta key, marker key, genotype, group label) followed
        # by the sample size and genotyped count -- confirm against
        # generateCountList.

        # If we are working with a 'genotype' or 'Genotyping failure' or
        # 'No data' we want to skip prevalence calculations
        # NOTE(review): the genotype is wrapped in list() here but passed
        # as-is by other callers of validateGenotypes -- confirm which form
        # the helper expects.
        if not validateGenotypes(list(dataElemList[2])):
            continue

        sampleSize = dataElemList[4]
        markerGenotyped = dataElemList[5]

        # Default to 0 and only divide when both the genotyped count and
        # the sample size are non-zero.  The divisor is the value that must
        # be guarded to avoid a ZeroDivisionError.
        markerPrevalence = 0
        if float(markerGenotyped) > 0 and sampleSize != 0:
            markerPrevalence = float(markerGenotyped) / sampleSize

        data[dataElemList[0]][dataElemList[1]][dataElemList[2]][dataElemList[3]]["prevalence"] = markerPrevalence
def incrementGenotypeCount(dict, metaKey, markerKey, genotype, groups, age):
    """
    Record one observation of a genotype for a marker in the state dictionary.

    Any missing level of dict[metaKey][markerKey][genotype]["All"] is
    created on demand and its "genotyped" counter goes up by one.  The
    marker's "All" sample size is created at 0 when absent and goes up by
    one only when validateGenotypes accepts the genotype.

    When a non-empty collection of age groups is supplied, the observation
    is additionally tallied per age group by incrementCountsByAgeGroup.
    """
    markerStats = dict.setdefault(metaKey, OrderedDict()).setdefault(markerKey, OrderedDict())

    overallCounts = markerStats.setdefault(genotype, OrderedDict()).setdefault("All", OrderedDict())
    overallCounts.setdefault("genotyped", 0)
    genotypedTotal = overallCounts["genotyped"] + 1

    # Make sure the sample size counter exists before it is read
    sampleSizes = markerStats.setdefault("sample_size", OrderedDict())
    sampleSizes.setdefault("All", 0)
    sampleTotal = sampleSizes["All"]

    # Only genotypes accepted by validateGenotypes count towards sample size
    if validateGenotypes(genotype):
        sampleTotal += 1

    # Both totals are written back only after validation has completed
    sampleSizes["All"] = sampleTotal
    overallCounts["genotyped"] = genotypedTotal

    # Break the same observation down by age group when groups were given
    if groups:
        incrementCountsByAgeGroup(dict, metaKey, markerKey, genotype, groups, age)
def incrementCountsByAgeGroup(dict, metaKey, markerKey, genotype, groups, age):
    """
    Tally one observation against the age group that contains ``age``.

    Every group label is first initialised to 0 in both the marker's
    "sample_size" entry and the genotype's per-label "genotyped" entry, so
    all groups are present even if no row ever falls into them.  Only the
    matching group is then incremented.

    ``groups`` is an iterable of (lower, upper, label) tuples:

        [ (lower, upper, label), (lower, upper, label), .... ]

    A group matches when lower <= age <= upper.  A lower bound of None
    makes the group open-ended below (age < upper) and an upper bound of
    None makes it open-ended above (age > lower).  ``label`` is the key
    used in the statistics dictionary.  If several groups match, the last
    one in ``groups`` is the one that gets incremented.

    dict[metaKey][markerKey]["sample_size"] and
    dict[metaKey][markerKey][genotype] must already exist.
    """
    matchedLabel = None

    for lower, upper, label in groups:
        markerStats = dict[metaKey][markerKey]
        markerStats["sample_size"].setdefault(label, 0)
        markerStats[genotype].setdefault(label, OrderedDict()).setdefault("genotyped", 0)

        # Without an age there is nothing to match; the group is still initialised
        if age is None:
            continue

        ageValue = float(age)
        if lower is None and ageValue < upper:
            matchedLabel = label
        if upper is None and ageValue > lower:
            matchedLabel = label
        if lower is not None and upper is not None and lower <= ageValue <= upper:
            matchedLabel = label

    if matchedLabel is None:
        return

    markerStats = dict[metaKey][markerKey]

    # Once again, hacky but we do not want to increment the sample size for a
    # given group if our genotype is 'Not genotyped' or 'Genotyping failure'
    if validateGenotypes(genotype):
        markerStats["sample_size"][matchedLabel] += 1

    markerStats[genotype][matchedLabel]["genotyped"] += 1