def test_binaries_zero_cutoff():
    """A binary contact featurizer with ``cutoff=0`` must report no contacts.

    Transforms the cached MinimalFsPeptide trajectories and inspects the
    first resulting feature array: it is expected to have 171 columns and
    to sum to zero.
    """
    dataset = MinimalFsPeptide().get_cached()
    featurizer = BinaryContactFeaturizer(cutoff=0)
    first_features = featurizer.transform(dataset.trajectories)[0]

    n_columns = first_features.shape[1]
    assert n_columns == 171
    # Every entry is expected to be zero, so the total is zero as well.
    assert np.sum(first_features) == 0
def test_binaries_zero_cutoff():
    """Check the binary contact featurizer output when the cutoff is zero.

    The first feature array produced from the cached MinimalFsPeptide
    trajectories should have 171 columns and contain no set entries
    (its sum is zero).
    """
    trajs = MinimalFsPeptide().get_cached().trajectories
    transformed = BinaryContactFeaturizer(cutoff=0).transform(trajs)
    head = transformed[0]

    assert head.shape[1] == 171
    assert np.sum(head) == 0
def test_binary_to_logistics():
    """Binary contacts should match logistic contacts thresholded at 1/2.

    A random non-negative ``center`` is used both as the binary cutoff and
    as the logistic center, together with a random non-negative
    ``steepness``. The first binary feature array is then compared with
    ``logistic > 0.5`` for the first logistic feature array.
    """
    trajs = MinimalFsPeptide().get_cached().trajectories

    # Draw steepness before center so the sequence of random draws is stable
    # under a fixed seed.
    steepness = np.abs(10 * np.random.randn())
    center = np.abs(np.random.randn())

    binary_features = BinaryContactFeaturizer(cutoff=center).transform(trajs)
    logistic_featurizer = LogisticContactFeaturizer(center=center,
                                                    steepness=steepness)
    logistic_features = logistic_featurizer.transform(trajs)

    # The binary value is expected to be set exactly where the logistic
    # value exceeds 1/2.
    above_half = logistic_features[0] > 0.5
    np.testing.assert_array_almost_equal(binary_features[0], above_half)
def test_binary_to_logistics():
    """Compare binary contact features against thresholded logistic ones.

    Uses one random non-negative value as both the binary ``cutoff`` and the
    logistic ``center`` (plus a random non-negative ``steepness``), and
    asserts that the first binary feature array equals the first logistic
    feature array compared against 0.5.
    """
    trajectories = MinimalFsPeptide().get_cached().trajectories

    # Order of the two draws is kept: steepness first, then center.
    steepness = np.abs(10 * np.random.randn())
    center = np.abs(np.random.randn())

    as_binary = BinaryContactFeaturizer(cutoff=center)
    binary_out = as_binary.transform(trajectories)

    as_logistic = LogisticContactFeaturizer(steepness=steepness, center=center)
    logistic_out = as_logistic.transform(trajectories)

    # Wherever the logistic feature is above 1/2 the binary feature is
    # expected to be set, and nowhere else.
    np.testing.assert_array_almost_equal(binary_out[0], logistic_out[0] > 0.5)
# Example #5
# 0
def create_equivalent_contact_featurizer(yaml_file, alignment_file,
                                         protein_list=None,
                                         pairs=None,
                                         same_residue=True,
                                         transform=None,
                                         **kwargs):
    """
    Create an equivalent-contacts featurizer for each protein in a set.

    :param yaml_file: yaml file location
    :param alignment_file: alignment file location
    :param protein_list: proteins to build featurizers for. Defaults to the
    "protein_list" entry of the loaded yaml file.
    :param pairs: wanted pairs of sequence index positions in the alignment.
    Defaults to every pair of positions below the longest aligned sequence
    length. You only need to figure out the wanted locations once; the
    _map_residue_ind_seq_ind function can help with this. Pairs may be given
    as tuples, lists or rows of a 2-D array.
    :param same_residue: True if you want to restrict to positions where every
    aligned sequence has the same residue code.
    :param transform: which featurizer to build: 'logistic'
    (LogisticContactFeaturizer), 'binary' (BinaryContactFeaturizer), or
    None / 'none' (ContactFeaturizer).
    :param kwargs: kwargs for the contact featurizer
    :return: dictionary of contact featurizers, one for each protein
    :raises ValueError: if transform is not one of the supported values
    """
    # Resolve the featurizer class first so that an unsupported `transform`
    # fails before any file is loaded or any protein is processed.
    if transform == 'logistic':
        featurizer_cls = LogisticContactFeaturizer
    elif transform == 'binary':
        featurizer_cls = BinaryContactFeaturizer
    elif transform is None or transform == "none":
        featurizer_cls = ContactFeaturizer
    else:
        raise ValueError("transform needs to be one of logistic, binary, none")

    featurizer_dict = {}

    # load the project description and the alignment
    yaml_file = load_yaml_file(yaml_file)
    alignment_file = _parse_alignment_file(alignment_file)
    if protein_list is None:
        protein_list = yaml_file["protein_list"]

    if pairs is None:
        # use the max length (probably a horrible idea)
        max_seq_len = max(len(seq) for seq in alignment_file.values())
        pairs = list(itertools.combinations(range(max_seq_len), 2))

    # Normalise the wanted pairs to a set of tuples. This gives O(1)
    # membership tests and makes the filter below behave the same whether
    # the pairs were supplied as tuples, lists or array rows (a plain
    # `tuple in pairs` never matches lists and over-matches on ndarrays).
    wanted_pairs = {tuple(pair) for pair in pairs}

    # Which alignment positions are usable depends only on the alignment,
    # not on the protein, so work it out once instead of once per protein.
    keep_positions = []
    for position in np.unique(pairs):
        # the possible codes at this position across all aligned sequences
        possible_codes = {seq[position] for seq in alignment_file.values()}
        # skip positions where any sequence has a missing residue
        if "-" in possible_codes:
            continue
        if same_residue and len(possible_codes) != 1:
            continue
        keep_positions.append(position)

    for protein in protein_list:
        print(protein)
        # get the mapping for this protein and invert it so it can be
        # looked up by alignment position
        prt_mapping, _ = _map_residue_ind_seq_ind(yaml_file, protein,
                                                  alignment_file[protein])
        inv_map = {v: k for k, v in prt_mapping.items()}

        # sorted so the resulting pair order is deterministic
        can_keep = np.sort([inv_map[position] for position in keep_positions])

        # NOTE(review): can_keep holds the values looked up through inv_map
        # (presumably residue indices) while wanted_pairs holds alignment
        # positions; they are compared directly here, as in the original
        # code -- confirm this is intended.
        actual_pairs = np.array([pair
                                 for pair in itertools.combinations(can_keep, 2)
                                 if pair in wanted_pairs])
        featurizer_dict[protein] = featurizer_cls(contacts=actual_pairs,
                                                  **kwargs)
    return featurizer_dict