def data(Arguments):
    '''
    Get info for the files in your MOCA.data directory.
    '''

    Path = get_path("MOCA.data") + "/"
    Files = os.listdir(Path)
    if not len(Files):
        print "You set 'Reports = Data', yet your MOCA.data directory is empty"
        print "Try processing some data first. Exiting..."
        exit()

    for File in Files:
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print " "
        print "Processed data file =", Path + File
        print " "
        Report = cPickle.load(open(Path + File, "rb"))["Report"]
        Entries = Report["Entries"]
        for Entry in Entries:
            if type(Report[Entry]) == list:
                print "\t", Entry, "\t", " ".join(map(str, Report[Entry]))
            else:
                print "\t", Entry, "\t", Report[Entry]
        print " "
        print " "

    return
def results(Arguments):
    '''
    Get info for the files in your MOCA.results directory.
    '''

    Path = get_path("MOCA.results") + "/"
    Files = os.listdir(Path)
    if not len(Files):
        print "You set 'Reports = Results', yet your MOCA.results directory is empty"
        print "Try running some MOCA calculations first. Exiting..."
        exit()

    for File in Files:
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print " "
        print "MOCA results file =", Path + File
        print " "
        Report = cPickle.load(open(Path + File, "rb"))["Report"]
        Entries = Report["Entries"]
        for Entry in Entries:
            if type(Report[Entry]) == list:
                print "\t", Entry, "\t", " ".join(map(str, Report[Entry]))
            else:
                print "\t", Entry, "\t", Report[Entry]
        print " "
        print " "

    return
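#A minimal sketch (illustrative only; not part of the original module or called by
#MOCA) of the pickle layout that data() and results() walk above: each file in
#MOCA.data or MOCA.results is a cPickled dict whose "Report" entry maps an ordered
#"Entries" list to the values printed for each entry. The filename is hypothetical.
def example_read_report():

    Pickle = get_path("MOCA.results") + "/SomeResultsFile" #hypothetical file
    Report = cPickle.load(open(Pickle, "rb"))["Report"]
    for Entry in Report["Entries"]:
        print Entry, "=", Report[Entry]

    return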
def cross_validation(Arguments, \
                     Labels, Features, Variates, \
                     Markers, Phenotype, CrossValidations, Cases, Controls):
    '''
    Labels, Features, and Variates have the usual meanings. Markers are any data you gave
    MOCA that you didn't designate as the Phenotype. Phenotype is the thing you're trying
    to predict (i.e., select markers for)--only one Phenotype per LeaveSomeOut run, please.
    CrossValidations is the integer number of cross-validations to do (e.g., leave ONE out
    or TEN-fold cross-validation). Cases and Controls are dictionaries specifying which
    labels go with which CrossValidation.

    For each cross-validation this pickles a feature matrix of the following format:

                                  Label1 Label2 Label3.....
    Phenotype                       0      0      1   .....
    (Setwork1 InteractionType)      1      0      1   .....
    (Setwork2 InteractionType)      1      1      0   .....
    (Setwork3 InteractionType)      0      0      0   .....
        .                           .      .      .
        .                           .      .      .
    '''

    #Read in the setwork optimization parameters
    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    #Read in the Boolean set parameters
    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #MultiProcessMode support if called. Cross-validation is so compute intensive that you
    #can only do this for one Phenotype at a time, and the different cross-validations are
    #distributed to different processors if MultiProcessMode is called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for CrossValidation in range(CrossValidations)[Node::TotalNodes]:

        #Split the data as directed
        TrainLabels = list(set(Labels) - set(Cases[CrossValidation] + Controls[CrossValidation]))
        TrainVariates = get_ordered_matrix(TrainLabels, Labels, Variates)

        #Get setworks from the training data
        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments, \
                         Features, TrainVariates, Phenotype, \
                         Markers, Markers, Markers, \
                         Trials, RepopulateFrequency, PercentToRepopulate, \
                         UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #Make the cross-validation matrix. First row is the case-control-label header
        CrossValidationFeatureMatrix = [Cases[CrossValidation] + Controls[CrossValidation]]
        PhenotypeVector = [1 for Case in Cases[CrossValidation]] + [0 for Control in Controls[CrossValidation]]
        CrossValidationFeatureMatrix.append(PhenotypeVector) #second row is the phenotype vector

        #We only need the intersection of unique setworks passing the FDR threshold
        QValues = p_adjust(PValues, Arguments.CorrectionMethod)
        QValues = dict([(Barcode, QValues[PValue]) for Barcode, PValue in PValues.items() \
                        if QValues[PValue] < Arguments.FDR])
        Barcodes = list(set.intersection(set(Setworks.keys()), set(QValues.keys())))

        #Finally, if we desire, we can filter by performance at this stage.
        #We could do it later, but we'd get a bigger Pickle now.
        Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

        Results = {}
        Results["PValues"] = dict([(Barcode, PValues[Barcode]) for Barcode in Barcodes])
        Results["QValues"] = dict([(Barcode, QValues[Barcode]) for Barcode in Barcodes])
        Results["Performances"] = dict([(Barcode, Performances[Barcode]) for Barcode in Barcodes])
        Results["Interactions"] = dict([(Barcode, Interactions[Barcode]) for Barcode in Barcodes])
        Results["FeatureVectors"] = dict([(Barcode, FeatureVectors[Barcode]) for Barcode in Barcodes])
        Results["UnionFeatures"] = dict([(Barcode, Setworks[Barcode][0]) for Barcode in Barcodes])
        Results["IntersectionFeatures"] = dict([(Barcode, Setworks[Barcode][1]) for Barcode in Barcodes])
        Results["DifferenceFeatures"] = dict([(Barcode, Setworks[Barcode][2]) for Barcode in Barcodes])
        Results["Report"] = make_report(Cases, Controls, Phenotype, CrossValidation, Arguments)
        Results["Barcodes"] = Barcodes

        if Arguments.Filename.lower() == "default":
            DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
            Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")], "_".join(sorted(DataTypes)),
                               str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                               "".join(map(str, Arguments.BooleanSets)), "".join(map(str, Arguments.Optimization)),
                               ".Validation" + str(CrossValidation)])
        else:
            Pickle = Arguments.Filename + ".Validation" + str(CrossValidation)

        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
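#A minimal sketch (illustrative only; not part of the original module) of the Cases
#and Controls dictionaries that cross_validation expects: each key is a
#cross-validation index, and each value is the list of sample labels held out of
#training for that fold. The labels and the two-fold split below are hypothetical.
def example_cross_validation_split():

    Cases = {0: ["Tumor1", "Tumor2"], 1: ["Tumor3", "Tumor4"]} #hypothetical labels
    Controls = {0: ["Normal1"], 1: ["Normal2"]}
    for CrossValidation in range(2):
        #Held-out labels for this fold, in the header-row order used above
        print Cases[CrossValidation] + Controls[CrossValidation]

    return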
def setworks(Arguments):
    '''
    Default implementation for building the MOCA Boolean set networks (setworks).
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    Trials = int(Arguments.Optimization[0])
    RepopulateFrequency = int(Arguments.Optimization[1])
    PercentToRepopulate = float(Arguments.Optimization[2])

    UnionFeatures = int(Arguments.BooleanSets[0])
    IntersectionFeatures = int(Arguments.BooleanSets[1])
    DifferenceFeatures = int(Arguments.BooleanSets[2])

    #MultiProcessMode support if called. Each Phenotype gets its own node
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:
        PValues, Performances, Interactions, FeatureVectors, Setworks, SampleCounts, CaseCounts, EffectSizes = \
            get_setworks(Arguments, \
                         Features, Variates, Phenotype, \
                         Markers, Markers, Markers, \
                         Trials, RepopulateFrequency, PercentToRepopulate, \
                         UnionFeatures, IntersectionFeatures, DifferenceFeatures)

        #We only need the intersection of unique setworks passing the FDR threshold
        QValues = p_adjust(PValues, Arguments.CorrectionMethod)
        QValues = dict([(Barcode, QValues[PValue]) for Barcode, PValue in PValues.items() \
                        if QValues[PValue] < Arguments.FDR])
        Barcodes = list(set.intersection(set(Setworks.keys()), set(QValues.keys())))

        #Finally, if we desire, we can filter by performance at this stage.
        #We could do it later, but we'd get a bigger Pickle now.
        Barcodes = [Barcode for Barcode in Barcodes if minimum_performance(Performances[Barcode], Arguments)]

        if Arguments.PermutePhenotype:
            try:
                QValue = min(QValues.values())
                print "Permutation test failed: you ran with 'PermutePhenotype = True' and setworks could be generated that passed your filters!!!",
                print "This means that your current FDR cutoff is not sufficient for this data. The minimum FDR observed during",
                print "this permutation test was " + str(QValue) + ". You should do this a minimum of 10 times and set your FDR",
                print "threshold (i.e., 'FDR = threshold' in your Arguments file) AT LEAST one order of magnitude lower than the",
                print "lowest observed during permutation testing. This conservative threshold will help ensure that results",
                print "observed during your 'real' setworks run are statistically reliable. The setworks that passed your filters",
                print "for this permutation test have been saved; if you care to see which features made it through, you can use",
                print "the standard 'Mode = PostProcess' to view them. Exiting..."
            except ValueError:
                print "You ran with 'PermutePhenotype = True' and no setworks could be generated that passed your filters --",
                print "this is a great start! You should do this a minimum of 10 times and set your FDR threshold (i.e., 'FDR = threshold'",
                print "in your Arguments file) AT LEAST one order of magnitude lower than the lowest observed during permutation testing.",
                print "This conservative threshold will help ensure that results observed during your 'real' setworks run are statistically",
                print "reliable. Exiting..."

            exit()

        if len(Barcodes):
            Results = {}
            Results["PValues"] = dict([(Barcode, PValues[Barcode]) for Barcode in Barcodes])
            Results["QValues"] = dict([(Barcode, QValues[Barcode]) for Barcode in Barcodes])
            Results["Performances"] = dict([(Barcode, Performances[Barcode]) for Barcode in Barcodes])
            Results["Interactions"] = dict([(Barcode, Interactions[Barcode]) for Barcode in Barcodes])
            Results["FeatureVectors"] = dict([(Barcode, FeatureVectors[Barcode]) for Barcode in Barcodes])
            Results["UnionFeatures"] = dict([(Barcode, Setworks[Barcode][0]) for Barcode in Barcodes])
            Results["IntersectionFeatures"] = dict([(Barcode, Setworks[Barcode][1]) for Barcode in Barcodes])
            Results["DifferenceFeatures"] = dict([(Barcode, Setworks[Barcode][2]) for Barcode in Barcodes])
            Results["SampleCounts"] = dict([(Barcode, SampleCounts[Barcode]) for Barcode in Barcodes])
            Results["CaseCounts"] = dict([(Barcode, CaseCounts[Barcode]) for Barcode in Barcodes])
            Results["EffectSizes"] = dict([(Barcode, EffectSizes[Barcode]) for Barcode in Barcodes])
            Results["Report"] = make_report(Labels, Phenotype, Barcodes, Arguments)
            Results["Labels"] = Labels
            Results["Barcodes"] = Barcodes
            Results["Phenotype"] = Variates[Features.index(Phenotype)]

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Phenotype=" + Phenotype[:Phenotype.index(":")], "_".join(sorted(DataTypes)),
                                   str(Arguments.FeatureMin), Arguments.CorrectionMethod,
                                   "".join(map(str, Arguments.BooleanSets)), "".join(map(str, Arguments.Optimization))])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]

            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

        else:
            print "No setworks were generated. This could mean your data set is not sufficiently powered for deriving setworks,",
            print "or that you set your filters unreasonably strict. Exiting..."

    return
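#A minimal sketch (illustrative only; not part of the original module) of the
#q-value filter used in setworks() and cross_validation() above: p_adjust returns a
#mapping from p-value to corrected q-value, so each setwork barcode is kept only if
#the q-value of its p-value beats the FDR cutoff. The barcodes and p-values below
#are hypothetical.
def example_fdr_filter(Arguments):

    PValues = {"BarcodeA": 0.0001, "BarcodeB": 0.2} #hypothetical setwork p-values
    QValues = p_adjust(PValues, Arguments.CorrectionMethod)
    Passing = [Barcode for Barcode, PValue in PValues.items()
               if QValues[PValue] < Arguments.FDR]

    return Passing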
def unsupervised(Arguments):
    '''
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default).
    Similar to so-called 'supervised' pairwise mode, except that no performance metrics are
    calculated (sens, spec, PPV, NPV, etc.). In unsupervised mode, you can compare all
    inter-datatype pairs for two datatypes, or all intra-datatype pairs for a single datatype.
    '''

    if len(Arguments.Data) > 2:
        print "Unsupervised pairwise calculations can consider no more than two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)
    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))

    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features

    if len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {} #just the positive class here
    Performances = {}
    EffectSizes = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                a, b, c, d = contingency_table(Variates[Features.index(Feature1)],
                                               Variates[Features.index(Feature2)], NA=Arguments.NA)
                PValue = fisher(a, b, c, d)
                PValues[tuple([Feature1, Feature2])] = PValue.two_tail
                Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
                SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
                CaseCounts[tuple([Feature1, Feature2])] = a + c
                #A placeholder solely to make pairwise post-processing generalizable
                Performances[tuple([Feature1, Feature2])] = "NA"
                EffectSizes[tuple([Feature1, Feature2])] = "NA"

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Interactions.pop(Pair, None)
            SampleCounts.pop(Pair, None)
            CaseCounts.pop(Pair, None)
            Performances.pop(Pair, None)
            EffectSizes.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)),
                           str(Arguments.FeatureMin), Arguments.CorrectionMethod])
    else:
        Pickle = Arguments.Filename

    cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
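#A minimal sketch (illustrative only; not part of the original module) of the
#per-pair test done in unsupervised() above: binarized variate vectors for two
#features are collapsed into a 2x2 contingency table (a, b, c, d) and scored with
#Fisher's exact test. The variate vectors below are hypothetical.
def example_pair_test(Arguments):

    Variates1 = [1, 1, 0, 0, 1] #hypothetical binary feature vectors
    Variates2 = [1, 0, 0, 0, 1]
    a, b, c, d = contingency_table(Variates1, Variates2, NA=Arguments.NA)
    PValue = fisher(a, b, c, d)

    return PValue.two_tail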
def pairwise_continuous(Arguments):
    '''
    Pairwise MOCA calculations for continuous-valued data. For every feature pair (or every
    marker-phenotype pair, if a Phenotype is provided), computes a correlation and a
    correlation p-value, then filters the pairs by FDR.
    '''

    if len(Arguments.Data) > 2:
        print "Pairwise continuous calculations can consider no more than two datatypes at a time."
        print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
        print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
        print "'Data = ' field. Exiting..."
        exit()

    Data = load_data(Arguments)
    Features = list(chain(*Data.Features.values()))
    Variates = list(chain(*Data.Variates.values()))

    if Arguments.Phenotype:
        Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
        Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]
    else:
        if len(Arguments.Data) == 1:
            Features1 = Features
            Features2 = Features

        if len(Arguments.Data) == 2:
            Features1 = Data.Features[Arguments.Data[0]]
            Features2 = Data.Features[Arguments.Data[1]]

    PValues = {}
    Correlations = {}
    Tested = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                PValues[tuple([Feature1, Feature2])] = correlation_pvalue(Variates[Features.index(Feature1)],
                                                                          Variates[Features.index(Feature2)])
                Correlations[tuple([Feature1, Feature2])] = correlation(Variates[Features.index(Feature1)],
                                                                        Variates[Features.index(Feature2)])

    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    for Pair, PValue in PValues.items():
        if FDRs[PValue] < Arguments.FDR:
            pass
        else:
            PValues.pop(Pair, None)
            Correlations.pop(Pair, None)

    if len(PValues.keys()):
        Results = {}
        Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
        Results["PValues"] = PValues
        Results["Correlations"] = Correlations
        Results["FDRs"] = FDRs

        if Arguments.Filename.lower() == "default":
            Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
        else:
            Pickle = Arguments.Filename

        cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
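#A minimal sketch (illustrative only; not part of the original module) of the
#'Tested' bookkeeping used in both pairwise loops above: appending Feature1 before
#the inner loop ensures each unordered pair is scored exactly once and self-pairs
#are skipped. The feature names below are hypothetical.
def example_unique_pairs():

    Features1 = ["GeneA", "GeneB", "GeneC"] #hypothetical features
    Features2 = ["GeneA", "GeneB", "GeneC"]
    Tested = []
    Pairs = []
    for Feature1 in Features1:
        Tested.append(Feature1)
        for Feature2 in Features2:
            if Feature2 not in Tested:
                Pairs.append((Feature1, Feature2))

    return Pairs #[('GeneA', 'GeneB'), ('GeneA', 'GeneC'), ('GeneB', 'GeneC')]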
def supervised(Arguments):
    '''
    MOCA pairwise calculations executed if a 'Phenotype' is provided in the Arguments file.
    Not technically supervised 'learning', as there is no optimization (every possible
    pairwise comparison is tested). Output includes performance metrics such as sensitivity,
    specificity, PPV, and NPV, for each feature's ability to predict the phenotype.
    '''

    Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)

    #MultiProcessMode support if called
    Node = int(Arguments.MultiProcessMode[0])
    TotalNodes = int(Arguments.MultiProcessMode[1])

    for Phenotype in Phenotypes[Node::TotalNodes]:
        PValues = {}
        Interactions = {}
        Performances = {}
        SampleCounts = {}
        CaseCounts = {} #just the positive class here
        EffectSizes = {}
        for Marker in Markers:
            TP, FP, FN, TN = contingency_table(Variates[Features.index(Marker)],
                                               Variates[Features.index(Phenotype)], NA=Arguments.NA)
            PValue = fisher(TP, FP, FN, TN)
            PValues[Marker] = PValue.two_tail
            Interaction = interaction(PValue)
            Interactions[Marker] = Interaction
            Performances[Marker] = Performance(Interaction, TP, FP, FN, TN)
            EffectSizes[Marker] = EffectSize(Interaction, TP, FP, FN, TN)
            SampleCounts[Marker] = TP + FP + FN + TN
            CaseCounts[Marker] = TP + FN

        FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
        for Marker in Markers:
            FDR = FDRs[PValues[Marker]]
            if FDR < Arguments.FDR:
                pass
            else:
                PValues.pop(Marker, None)
                Interactions.pop(Marker, None)
                Performances.pop(Marker, None)
                SampleCounts.pop(Marker, None)
                CaseCounts.pop(Marker, None)
                EffectSizes.pop(Marker, None)

        if len(PValues.keys()):
            Results = {}
            Results["Report"] = make_report(Labels, PValues.keys(), Arguments,
                                            Supervised=Phenotype[:Phenotype.index(":")])
            Results["PValues"] = PValues
            Results["Interactions"] = Interactions
            Results["Performances"] = Performances
            Results["FDRs"] = FDRs
            Results["SampleCounts"] = SampleCounts
            Results["CaseCounts"] = CaseCounts
            Results["EffectSizes"] = EffectSizes

            if Arguments.Filename.lower() == "default":
                DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
                Pickle = "_".join(["Pairwise", "Phenotype=" + Phenotype[:Phenotype.index(":")],
                                   "_".join(sorted(DataTypes)), str(Arguments.FeatureMin),
                                   Arguments.CorrectionMethod])
            else:
                Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]

            cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)

    return
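#A minimal sketch (illustrative only; not part of the original module) of the
#MultiProcessMode striding used in supervised(), setworks(), and cross_validation()
#above: node N of M processes every M-th phenotype (or fold), so launching M MOCA
#runs covers the whole list with no overlap. The phenotype list is hypothetical.
def example_multiprocess_stride():

    Phenotypes = ["Drug1:Response", "Drug2:Response", "Drug3:Response"] #hypothetical
    Node, TotalNodes = 0, 2 #this process is node 0 of 2
    for Phenotype in Phenotypes[Node::TotalNodes]:
        print Phenotype #node 0 gets Drug1 and Drug3; node 1 would get Drug2

    return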