def make_pose_input(Arguments):
    '''
    Collect every input needed for (e)POSE derivation.

    Arguments is handed unchanged to each loader; its Structure and
    Annotation attributes decide whether the two optional inputs are loaded.

    Returns the tuple (Mutations, Sequences, ReferenceGene, Identities,
    ResidueBurial, Annotation). ResidueBurial and Annotation are empty
    dictionaries when the matching option is not set.
    '''
    # Amino acid substitutions with their binary phenotypes or endophenotypes
    Mutations = get_mutations(Arguments)
    # Contents of the fasta formatted sequence file
    Sequences = get_sequences(Arguments)
    # The gene being scored
    ReferenceGene = get_reference_gene(Arguments)
    # %ID of all sequences relative to the reference
    Identities = get_identities(ReferenceGene, Sequences, Arguments)

    # Optional inputs: fall back to an empty dictionary when not requested
    ResidueBurial = normalized_residue_burial(Arguments) if Arguments.Structure else {}
    Annotation = get_annotation(Arguments) if Arguments.Annotation else {}

    return (Mutations, Sequences, ReferenceGene, Identities,
            ResidueBurial, Annotation)
def leave_some_out(Arguments):
    '''
    Divide up the mutations into hold-out sets and send them to cross validation.

    Arguments must provide LeaveSomeOut (how many mutations to hold out per
    round) and Mode ("POSE" or "ePOSE"). It is also passed on to get_mutations
    and cross_validation.

    Nothing is returned. The hold-out sets are handed to cross_validation as a
    dictionary keyed by round index (0 .. rounds - 1) whose values are lists
    of mutations.

    Raises ValueError when Arguments.Mode is neither "POSE" nor "ePOSE".
    '''
    Mutations = get_mutations(Arguments)
    CrossValidations = int(ceil(len(Mutations) / float(Arguments.LeaveSomeOut)))
    Folds = range(CrossValidations)

    if Arguments.Mode == "POSE":
        # For a standard POSE, the trick is keeping the classes as balanced as
        # possible within each data split.
        Positive = [Mutation for Mutation, Phenotype in Mutations.items() if Phenotype]
        Negative = [Mutation for Mutation, Phenotype in Mutations.items() if not Phenotype]

        # Get rid of bias that MIGHT be inherent in the original order of the
        # mutations.
        shuffle(Positive)
        shuffle(Negative)

        # Deal each class out round-robin so it is spread as evenly as
        # possible among the cross-validations.
        PositiveFolds = dict((Fold, Positive[Fold::CrossValidations]) for Fold in Folds)
        NegativeFolds = dict((Fold, Negative[Fold::CrossValidations]) for Fold in Folds)

        # Only for the leave ONE out case do we do cases and controls in
        # series. Both classes were dealt starting from round 0, so without
        # this the early rounds would hold two mutations and the late rounds
        # none. Mirroring the negative side fills the rounds from the opposite
        # end, leaving exactly one mutation per round. The mirror is computed
        # by index so it does not depend on dictionary iteration order.
        if Arguments.LeaveSomeOut == 1:
            NegativeFolds = dict((Fold, NegativeFolds[CrossValidations - 1 - Fold])
                                 for Fold in Folds)

        HoldoutMutations = dict((Fold, PositiveFolds[Fold] + NegativeFolds[Fold])
                                for Fold in Folds)

    elif Arguments.Mode == "ePOSE":
        # For ePOSEs it is simpler. Everyone is endophenotype positive, so
        # there is only one class to split.
        # Copy the keys into a list: shuffle needs a mutable sequence.
        Positive = list(Mutations)
        shuffle(Positive)
        HoldoutMutations = dict((Fold, Positive[Fold::CrossValidations]) for Fold in Folds)

    else:
        raise ValueError("Unsupported Mode %r: expected 'POSE' or 'ePOSE'" % (Arguments.Mode,))

    cross_validation(HoldoutMutations, Arguments)
def leave_some_out(Arguments):
    '''
    Divide up the mutations into hold-out sets and send them to cross validation.

    Arguments must provide LeaveSomeOut (how many mutations to hold out per
    round) and Mode ("POSE" or "ePOSE"). It is also passed on to get_mutations
    and cross_validation.

    Nothing is returned. The hold-out sets are handed to cross_validation as a
    dictionary keyed by round index (0 .. rounds - 1) whose values are lists
    of mutations.

    Raises ValueError when Arguments.Mode is neither "POSE" nor "ePOSE".
    '''
    Mutations = get_mutations(Arguments)
    CrossValidations = int(ceil(len(Mutations) / float(Arguments.LeaveSomeOut)))
    Folds = range(CrossValidations)

    if Arguments.Mode == "POSE":
        # For a standard POSE, the trick is keeping the classes as balanced as
        # possible within each data split.
        Positive = [Mutation for Mutation, Phenotype in Mutations.items() if Phenotype]
        Negative = [Mutation for Mutation, Phenotype in Mutations.items() if not Phenotype]

        # Get rid of bias that MIGHT be inherent in the original order of the
        # mutations.
        shuffle(Positive)
        shuffle(Negative)

        # Deal each class out round-robin so it is spread as evenly as
        # possible among the cross-validations.
        PositiveFolds = dict((Fold, Positive[Fold::CrossValidations]) for Fold in Folds)
        NegativeFolds = dict((Fold, Negative[Fold::CrossValidations]) for Fold in Folds)

        # Only for the leave ONE out case do we do cases and controls in
        # series. Both classes were dealt starting from round 0, so without
        # this the early rounds would hold two mutations and the late rounds
        # none. Mirroring the negative side fills the rounds from the opposite
        # end, leaving exactly one mutation per round. The mirror is computed
        # by index so it does not depend on dictionary iteration order.
        if Arguments.LeaveSomeOut == 1:
            NegativeFolds = dict((Fold, NegativeFolds[CrossValidations - 1 - Fold])
                                 for Fold in Folds)

        HoldoutMutations = dict((Fold, PositiveFolds[Fold] + NegativeFolds[Fold])
                                for Fold in Folds)

    elif Arguments.Mode == "ePOSE":
        # For ePOSEs it is simpler. Everyone is endophenotype positive, so
        # there is only one class to split.
        # Copy the keys into a list: shuffle needs a mutable sequence.
        Positive = list(Mutations)
        shuffle(Positive)
        HoldoutMutations = dict((Fold, Positive[Fold::CrossValidations]) for Fold in Folds)

    else:
        raise ValueError("Unsupported Mode %r: expected 'POSE' or 'ePOSE'" % (Arguments.Mode,))

    cross_validation(HoldoutMutations, Arguments)
def make_pose_input(Arguments):
    '''
    Collect every input needed for (e)POSE derivation.

    Arguments is handed unchanged to each loader; its Structure and
    Annotation attributes decide whether the two optional inputs are loaded.

    Returns the tuple (Mutations, Sequences, ReferenceGene, Identities,
    ResidueBurial, Annotation). ResidueBurial and Annotation are empty
    dictionaries when the matching option is not set.
    '''
    # Amino acid substitutions with their binary phenotypes or endophenotypes
    Mutations = get_mutations(Arguments)
    # Contents of the fasta formatted sequence file
    Sequences = get_sequences(Arguments)
    # The gene being scored
    ReferenceGene = get_reference_gene(Arguments)
    # %ID of all sequences relative to the reference
    Identities = get_identities(ReferenceGene, Sequences, Arguments)

    # Optional inputs: fall back to an empty dictionary when not requested
    ResidueBurial = normalized_residue_burial(Arguments) if Arguments.Structure else {}
    Annotation = get_annotation(Arguments) if Arguments.Annotation else {}

    return (Mutations, Sequences, ReferenceGene, Identities,
            ResidueBurial, Annotation)