def test_logistics():
    """Check the shape and value range of logistic contact features.

    Transforms the cached MinimalFsPeptide trajectories with a
    default-constructed ``LogisticContactFeaturizer`` and inspects the
    features of the first trajectory only.
    """
    trajs = MinimalFsPeptide().get_cached().trajectories
    features = LogisticContactFeaturizer().transform(trajs)
    first = features[0]

    # The second axis of the first feature array must have 171 entries.
    assert first.shape[1] == 171
    # Every value must lie strictly between 0 and 1.
    assert np.max(first) < 1.0
    assert np.min(first) > 0.0
def test_binary_to_logistics():
    """Thresholding logistic contacts at 0.5 must reproduce binary contacts.

    The binary featurizer's ``cutoff`` and the logistic featurizer's
    ``center`` are set to the same randomly drawn non-negative value; the
    logistic ``steepness`` is drawn independently.
    """
    trajs = MinimalFsPeptide().get_cached().trajectories

    # Draw order (steepness first, then center) is kept so that a seeded
    # run consumes the random stream exactly as before.
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())

    binary_features = BinaryContactFeaturizer(cutoff=center).transform(trajs)
    logistic_features = LogisticContactFeaturizer(
        center=center, steepness=steepness
    ).transform(trajs)

    # Only the first trajectory is compared: the binary feature array must
    # match the mask of logistic values that exceed 1/2.
    thresholded = logistic_features[0] > 0.5
    np.testing.assert_array_almost_equal(binary_features[0], thresholded)
def test_distance_to_logistic():
    """Spot-check that logistic contacts fall on the correct side of 1/2.

    Ten random (frame, feature) entries of the first trajectory are sampled.
    For each, a raw contact value above ``center`` must map to a logistic
    value below 0.5, and a raw value below ``center`` to one above 0.5.

    NOTE(review): another function with this same name is defined later in
    the file and shadows this one at import time.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())

    contacts = ContactFeaturizer().transform(trajectories)
    logistics = LogisticContactFeaturizer(
        center=center, steepness=steepness
    ).transform(trajectories)

    n_rows, n_cols = contacts[0].shape[0], contacts[0].shape[1]
    for _ in range(10):
        # ``randint``'s upper bound is exclusive, so pass the full axis
        # length; the previous ``length - 1`` could never select the last
        # row/column and raised ValueError for an axis of length 1.
        i = np.random.randint(0, n_rows)
        j = np.random.randint(0, n_cols)
        x = contacts[0][i][j]
        y = logistics[0][i][j]
        if x > center:
            assert y < 0.5
        if x < center:
            assert y > 0.5
def test_distance_to_logistic():
    """Verify the logistic transform is on the right side of 0.5 vs. center.

    Samples ten random entries from the first trajectory's contact array
    and compares each raw value with the matching logistic value: raw
    values greater than ``center`` must give a logistic value under 0.5,
    raw values smaller than ``center`` must give one over 0.5.

    NOTE(review): an earlier function in this file has the same name; this
    later definition is the one that remains bound after import.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories
    steepness = np.absolute(10 * np.random.randn())
    center = np.absolute(np.random.randn())

    contactfeaturizer = ContactFeaturizer()
    contacts = contactfeaturizer.transform(trajectories)
    logisticcontactfeaturizer = LogisticContactFeaturizer(center=center,
                                                          steepness=steepness)
    logistics = logisticcontactfeaturizer.transform(trajectories)

    raw = contacts[0]
    transformed = logistics[0]
    for _ in range(10):
        # The upper bound of ``np.random.randint`` is exclusive. Using the
        # full axis length makes every index reachable (the old
        # ``shape - 1`` bound skipped the final row and column and failed
        # outright on a length-1 axis).
        i = np.random.randint(0, raw.shape[0])
        j = np.random.randint(0, raw.shape[1])
        x = raw[i][j]
        y = transformed[i][j]
        if x > center:
            assert y < 0.5
        if x < center:
            assert y > 0.5
def _contact_featurizer_class(transform):
    """Return the contact featurizer class selected by ``transform``."""
    if transform == 'logistic':
        return LogisticContactFeaturizer
    if transform == 'binary':
        return BinaryContactFeaturizer
    if transform is None or transform == "none":
        return ContactFeaturizer
    raise ValueError("transform needs to be one of logistic, binary, none")


def _conserved_positions(alignment, positions, same_residue):
    """Return the ``positions`` that are usable across every aligned sequence.

    A position is dropped when any sequence has a gap (``"-"``) there, or,
    if ``same_residue`` is true, when the sequences do not all share the
    same code at that position.
    """
    kept = []
    for position in positions:
        codes = {alignment[name][position] for name in alignment}
        if "-" in codes:
            continue
        if same_residue and len(codes) != 1:
            continue
        kept.append(position)
    return kept


def create_equivalent_contact_featurizer(yaml_file, alignment_file,
                                         protein_list=None, pairs=None,
                                         same_residue=True, transform=None,
                                         **kwargs):
    """
    Create an equivalent contacts featurizer for a set of proteins.

    :param yaml_file: yaml file location
    :param alignment_file: alignment file location
    :param protein_list: proteins to build featurizers for. Defaults to
        the ``"protein_list"`` entry of the loaded yaml file.
    :param pairs: wanted pairs of sequence index positions in the
        alignment. You only need to figure out the wanted location for one
        residue; ``_map_residue_ind_seq_ind`` can help with this. Defaults
        to every 2-combination of ``range(longest aligned sequence)``.
    :param same_residue: if True, only keep positions where every aligned
        sequence has the same residue code.
    :param transform: ``'logistic'``, ``'binary'``, or ``None``/``"none"``;
        selects LogisticContactFeaturizer, BinaryContactFeaturizer or
        ContactFeaturizer respectively.
    :param kwargs: kwargs for the contact featurizer
    :return: dictionary of contact featurizers, one for each protein
    :raises ValueError: if ``transform`` is not one of the accepted values.
    """
    # Load the project description and the alignment.
    yaml_file = load_yaml_file(yaml_file)
    alignment_file = _parse_alignment_file(alignment_file)

    if protein_list is None:
        protein_list = yaml_file["protein_list"]
    if pairs is None:
        # Use the max length (probably a horrible idea).
        max_seq_len = max(len(alignment_file[name]) for name in alignment_file)
        pairs = list(itertools.combinations(range(max_seq_len), 2))

    # Reject an unknown ``transform`` before doing any per-protein work.
    featurizer_cls = _contact_featurizer_class(transform)

    # Normalise the wanted pairs to a set of tuples: membership tests are
    # O(1), and list-of-lists / 2-D array inputs compare correctly against
    # the tuples produced by ``itertools.combinations``.
    wanted_pairs = {tuple(pair) for pair in pairs}

    # Which positions survive depends only on the alignment, not on the
    # protein, so compute them once instead of once per protein.
    kept_positions = _conserved_positions(alignment_file, np.unique(pairs),
                                          same_residue)

    featurizer_dict = {}
    for protein in protein_list:
        print(protein)
        prt_mapping, _ = _map_residue_ind_seq_ind(yaml_file, protein,
                                                  alignment_file[protein])
        # Invert the mapping so kept alignment positions can be looked up.
        inv_map = {v: k for k, v in prt_mapping.items()}

        # Sorted so the generated combinations come out in a stable order.
        can_keep = np.sort([inv_map[position] for position in kept_positions])

        # NOTE(review): ``can_keep`` holds values from the inverted mapping
        # (residue indices, presumably) while ``wanted_pairs`` is documented
        # as sequence-index pairs; this filter is unchanged from the
        # original -- confirm the two index spaces are meant to be compared.
        actual_pairs = np.array([pair
                                 for pair in itertools.combinations(can_keep, 2)
                                 if pair in wanted_pairs])

        featurizer_dict[protein] = featurizer_cls(contacts=actual_pairs,
                                                  **kwargs)
    return featurizer_dict