Пример #1
0
def compare_by_register_shifts(s1, s2):
    """Tell whether two structures share most base pairs up to small index shifts.

    Every base pair of ``s1`` is compared against every base pair of ``s2``.
    A combination counts as a match when the difference of the first indices
    and the difference of the second indices (``s1`` pair minus ``s2`` pair)
    is one of the offsets listed in ``accepted_shifts``. Each match
    contributes 0.5 to the score.

    Args:
        s1: First structure, passed as ``dbn`` to ``ss.SecondaryStructure``.
        s2: Second structure, passed the same way.

    Returns:
        True when the score exceeds 0.8 times the number of base pairs of
        ``s1`` and also 0.8 times the number of base pairs of ``s2``.
    """
    pairs_a = ss.SecondaryStructure(dbn=s1).base_pairs()
    pairs_b = ss.SecondaryStructure(dbn=s2).base_pairs()

    # (first-index delta, second-index delta) combinations treated as "the same pair".
    accepted_shifts = ((0, 0), (-1, 0), (1, 0), (1, -1), (1, 1))

    n_matches = sum(
        1
        for pair_a in pairs_a
        for pair_b in pairs_b
        if (pair_a[0] - pair_b[0], pair_a[1] - pair_b[1]) in accepted_shifts
    )
    score = 0.5 * n_matches

    threshold = 0.8
    return score > threshold * len(pairs_a) and score > threshold * len(pairs_b)
Пример #2
0
def get_contact_sites(structures, mutpos, nmeas, npos, c_size, restrict_range=None):
    """Mark positions close to each mutation site and to its pairing partner.

    Args:
        structures: Structures, each passed as ``dbn`` to
            ``ss.SecondaryStructure`` to obtain its base-pair dictionary.
        mutpos: Per-measurement collections of mutation positions; entry ``j``
            belongs to measurement ``j``.
        nmeas: Number of measurements (rows of each output matrix).
        npos: Number of positions (columns before any range restriction).
        c_size: Window size; offsets run over
            ``range(-(c_size - 1) // 2, (c_size - 1) // 2 + 1)``.
        restrict_range: Optional ``(start, end)``. When truthy, positions
            greater than zero are shifted by ``start`` before marking; when
            not None, the output columns are cut to ``start:end``.

    Returns:
        Dict mapping structure index to an array whose entry ``[j, p]`` is 1
        when ``p`` lies inside the window around a mutation site of
        measurement ``j`` or around that site's base-pairing partner.
    """
    if restrict_range:
        shifted_mutpos = [
            [site + restrict_range[0] if site > 0 else site for site in sites]
            for sites in mutpos
        ]
    else:
        shifted_mutpos = mutpos

    pair_maps = [ss.SecondaryStructure(dbn=dbn).base_pair_dict() for dbn in structures]
    n_structures = len(structures)
    contact_sites = dict((idx, zeros([nmeas, npos])) for idx in range(n_structures))

    for row in range(nmeas):
        row_sites = shifted_mutpos[row]
        if len(row_sites) == 0:
            continue
        for site in row_sites:
            for idx in range(n_structures):
                partners = pair_maps[idx]
                marks = contact_sites[idx]
                for delta in range(-(c_size - 1) // 2, (c_size - 1) // 2 + 1):
                    near_site = site + delta
                    if 0 <= near_site < npos:
                        marks[row, near_site] = 1
                    if site in partners:
                        near_partner = partners[site] + delta
                        if 0 <= near_partner < npos:
                            marks[row, near_partner] = 1

    if restrict_range is not None:
        for idx in range(n_structures):
            contact_sites[idx] = contact_sites[idx][:, restrict_range[0]:restrict_range[1]]
    return contact_sites
Пример #3
0
def bpp_matrix_from_structures(structures, weights, weight_err=None, signal_to_noise_cutoff=0, flip=False, symmetric=True):
    """Accumulate a weighted base-pair matrix over a set of structures.

    For every base pair ``(n1, n2)`` of structure ``i`` the weight
    ``weights[i]`` is added to entry ``[n1, n2]``; squared weight errors are
    accumulated the same way and square-rooted at the end.

    Args:
        structures: Structures, each passed as ``dbn`` to
            ``ss.SecondaryStructure``; the matrix side length is
            ``len(structures[0])``.
        weights: One weight per structure.
        weight_err: Optional per-structure weight errors. When given, an
            error matrix is returned alongside the weighted matrix.
        signal_to_noise_cutoff: Only used with ``weight_err``. Entries whose
            value and error are both nonzero and whose value/error ratio is
            below this cutoff are set to zero.
        flip: When True, the two indices of every base pair are swapped
            before accumulation.
        symmetric: When True, each pair is also accumulated at ``[n2, n1]``.

    Returns:
        ``bppm`` alone when ``weight_err`` is None, otherwise the tuple
        ``(bppm, bppm_err)``.
    """
    npos = len(structures[0])
    bppm = zeros([npos, npos])
    with_errors = weight_err is not None
    if with_errors:
        bppm_err = zeros([npos, npos])

    for i, s in enumerate(structures):
        for n1, n2 in ss.SecondaryStructure(dbn=s).base_pairs():
            if flip:
                # Tuple assignment swaps both indices at once; a sequential
                # swap through a temporary would lose n2.
                n1, n2 = n2, n1
            bppm[n1, n2] += weights[i]
            if symmetric:
                bppm[n2, n1] += weights[i]
            if with_errors:
                err_sq = weight_err[i]**2
                bppm_err[n1, n2] += err_sq
                if symmetric:
                    bppm_err[n2, n1] += err_sq

    if not with_errors:
        return bppm

    bppm_err = sqrt(bppm_err)
    for i in range(bppm.shape[0]):
        for j in range(bppm.shape[1]):
            # Zero errors are skipped so the ratio is never a division by zero.
            if bppm[i, j] != 0 and bppm_err[i, j] != 0 and (bppm[i, j] / bppm_err[i, j] < signal_to_noise_cutoff):
                bppm[i, j] = 0
    return bppm, bppm_err
Пример #4
0
        def efn_fun(seq):
            """Return a 1 x len(structures) array of energies for ``seq``, capped at 200.

            Each structure from the enclosing scope is first passed through
            ``remove_non_cannonical`` for this sequence, then all of them are
            handed to ``ss.get_structure_energies`` with the enclosing
            ``algorithm``.
            """
            print('Calculating structure energies for sequence %s' % seq)
            n_structures = len(structures)
            energy = zeros([1, n_structures])
            cleaned = [ss.SecondaryStructure(dbn=remove_non_cannonical(s, seq)) for s in structures]
            energy[0, :] = array(ss.get_structure_energies(seq, cleaned, algorithm=algorithm))
            # Clip every energy from above at 200.
            for col in range(n_structures):
                if energy[0, col] > 200:
                    energy[0, col] = 200
            return energy
Пример #5
0
def remove_non_cannonical(structure, sequence):
    """Rebuild a structure string keeping only base pairs with allowed residues.

    Args:
        structure: Structure passed as ``dbn`` to ``ss.SecondaryStructure``.
        sequence: Sequence indexed by the same positions as the structure.

    Returns:
        A string of ``len(sequence)`` characters: ``'('`` and ``')'`` at the
        lower and higher index of every pair whose two residues form one of
        G-C, C-G, G-U, U-G, A-U or U-A, and ``'.'`` everywhere else.
    """
    allowed_pairs = set([
        ('G', 'C'), ('C', 'G'),
        ('G', 'U'), ('U', 'G'),
        ('A', 'U'), ('U', 'A'),
    ])
    partner_of = ss.SecondaryStructure(dbn=structure).base_pair_dict()
    dbn_chars = ['.'] * len(sequence)
    for idx, partner in partner_of.items():
        if (sequence[idx], sequence[partner]) not in allowed_pairs:
            continue
        if idx < partner:
            open_at, close_at = idx, partner
        else:
            open_at, close_at = partner, idx
        dbn_chars[close_at] = ')'
        dbn_chars[open_at] = '('
    return ''.join(dbn_chars)
Пример #6
0
def get_structure_distance_matrix_2(structures, struct_types, njobs):
    """Build a symmetric matrix of pairwise ``bpdist`` values, one row per job.

    Args:
        structures: Structures, each passed as ``dbn`` to
            ``ss.SecondaryStructure`` to obtain its base pairs.
        struct_types: Indexable whose first element's length is used as the
            number of structures (presumably equal to ``len(structures)``
            -- TODO confirm against callers).
        njobs: Forwarded to ``joblib.Parallel`` as ``n_jobs``.

    Returns:
        An ``nstructs x nstructs`` array ``D`` with ``D[i, j] == D[j, i]`` set
        to ``bpdist`` of the base pairs of structures ``i`` and ``j``; the
        diagonal is zero.
    """
    nstructs = len(struct_types[0])
    structures_bp = [ss.SecondaryStructure(dbn=struct).base_pairs() for struct in structures]
    D = zeros([nstructs, nstructs])

    def dist_fun(i):
        """Return row ``i`` of the strict upper triangle as a 1 x nstructs array."""
#        print 'Calculating dist for row %s' % i
        dd = zeros([1, nstructs])
        # Only columns after i are filled; earlier columns stay zero.
        for j in xrange(i+1, nstructs):
            dd[0, j] = bpdist(structures_bp[i], structures_bp[j])
        return dd
    # The closure is also published as an attribute of this module
    # (presumably so joblib can resolve it by name -- TODO confirm).
    sys.modules[__name__].dist_fun = dist_fun
    res = joblib.Parallel(n_jobs=njobs)(joblib.delayed(dist_fun)(i) for i in xrange(nstructs))
    for k in xrange(nstructs):
        D[k,:] = res[k]
    # Mirror the upper triangle into the lower one; the diagonal is subtracted
    # once so it is not counted twice.
    D = D + D.T - diag(diag(D))
    print 'Dist calculation done'
    return D
Пример #7
0
def get_structure_distance_matrix(structures, struct_types, distance='mutinf', cstart=0, cend=-1):
    """Build a symmetric matrix of pairwise distances between structures.

    Args:
        structures: Structure strings. Used by the ``'basepair'`` and
            ``'acc'`` distances.
        struct_types: Sequence of rows; ``len(struct_types[0])`` is taken as
            the number of structures and column ``i`` of every row describes
            structure ``i``. Used by the ``'mutinf'`` distance.
        distance: One of ``'mutinf'`` (``_mutinf`` on columns of
            ``struct_types``), ``'basepair'`` (``bpdist`` on base pairs) or
            ``'acc'`` (``_acc`` on the structures).
        cstart: Start of the region of interest.
        cend: End of the region of interest. For ``'basepair'``, ``-1`` means
            "up to the end of each structure"; for the other distances the
            value is forwarded unchanged to the helper.

    Returns:
        An ``nstructs x nstructs`` array with zero diagonal.

    Raises:
        ValueError: If ``distance`` is not a supported option. The check is
            done before any work, independent of the number of structures.
    """
    if distance not in ('mutinf', 'basepair', 'acc'):
        raise ValueError('Distance %s not recognized: options are "mutinf", "basepair" and "acc"' % distance)

    nstructs = len(struct_types[0])
    if distance == 'basepair':
        # -1 is the "no upper bound" sentinel; slicing with it literally
        # would silently drop the last position of every structure.
        stop = None if cend == -1 else cend
        structures_bp = [ss.SecondaryStructure(dbn=struct[cstart:stop]).base_pairs() for struct in structures]

    D = zeros([nstructs, nstructs])
    for i in range(nstructs):
        if distance == 'mutinf':
            st1 = [s[i] for s in struct_types]
        for j in range(i + 1, nstructs):
            if distance == 'mutinf':
                D[i, j] = _mutinf(st1, [s[j] for s in struct_types], cstart, cend)
            elif distance == 'basepair':
                D[i, j] = bpdist(structures_bp[i], structures_bp[j])
            else:
                D[i, j] = _acc(structures[i], structures[j], cstart, cend)
            D[j, i] = D[i, j]
    return D
Пример #8
0
def get_minimal_overlapping_motif_decomposition(structures, bytype=False, offset=0):
    """Decompose structures into motifs and index which structures use them.

    Each structure is exploded into motifs (``explode()`` returns a dict of
    motif type -> list of position lists). Positions not covered by any motif
    are grouped into maximal consecutive runs and added under the extra type
    ``'sstrand'``, replacing any existing entry with that key.

    Args:
        structures: Either strings (converted with
            ``ss.SecondaryStructure(dbn=...)``) or objects that provide
            ``explode()`` and ``len()``. The type of the first element
            decides which.
        bytype: When False, a motif id is ``'<type>_<p1;p2;...>'`` built from
            the motif's positions. When True, every position gets its own id
            ``'<type>_<position + offset>'`` and ``motif_dist`` is filled.
        offset: Added to every position used in the returned keys.

    Returns:
        Tuple ``(pos_motif_map, motif_ids, motif_dist)``:

        * ``pos_motif_map`` maps ``(position + offset, motif index)`` to the
          list of indices of the structures containing that motif there.
        * ``motif_ids`` lists the motif ids; a motif's index is its position
          in this list.
        * ``motif_dist`` is a square array over the motifs. It is all zeros
          unless ``bytype`` is True; then motifs of different type get 1.0
          and motifs of the same type get
          ``max(1e-5, shared positions / larger position count)``.
    """
    if isinstance(structures[0], str):
        struct_objs = [ss.SecondaryStructure(dbn=s) for s in structures]
    else:
        struct_objs = structures

    def get_motif_id(k, ntlist, pos):
        if bytype:
            return '%s_%s' % (k, pos)
        return '%s_%s' % (k, ';'.join([str(x) for x in ntlist]))

    def get_type_and_ntlist(motif_id):
        # Split on the last underscore so a type name may itself contain '_'.
        typ, ntliststr = motif_id.rsplit('_', 1)
        return typ, [int(x) for x in ntliststr.split(';')]

    pos_motif_map = {}
    motif_ids = []
    # motif id -> index in motif_ids, so lookups do not scan the list.
    motif_index = {}

    elems = [s.explode() for s in struct_objs]
    for i, s1 in enumerate(struct_objs):
        covered = [False] * len(s1)
        for ntlists in elems[i].values():
            for ntlist in ntlists:
                for pos in ntlist:
                    covered[pos] = True

        # Single stranded regions that were not covered by a motif
        # are collapsed into a "motif" we call sstrand
        runs = []
        current = []
        for j in range(len(s1)):
            if covered[j]:
                continue
            if current and current[-1] != j - 1:
                runs.append(current)
                current = []
            current.append(j)
        # Flush the trailing run unconditionally; this also keeps a run that
        # starts at position 0 and is the only one in the structure.
        if current:
            runs.append(current)
        elems[i]['sstrand'] = runs

        for k, ntlists in elems[i].items():
            for ntlist in ntlists:
                for pos in ntlist:
                    motif_id = get_motif_id(k, ntlist, pos + offset)
                    m_idx = motif_index.get(motif_id)
                    if m_idx is None:
                        m_idx = len(motif_ids)
                        motif_index[motif_id] = m_idx
                        motif_ids.append(motif_id)
                    members = pos_motif_map.setdefault((pos + offset, m_idx), [])
                    if i not in members:
                        members.append(i)

    motif_dist = zeros([len(motif_ids), len(motif_ids)])
    if bytype:
        MAX_DIST = 1.
        MIN_DIST = 1e-5
        parsed = [get_type_and_ntlist(mid) for mid in motif_ids]
        for i, (typ1, ntlist1) in enumerate(parsed):
            for j in range(i, len(parsed)):
                typ2, ntlist2 = parsed[j]
                if typ1 == typ2:
                    nposoverlap = float(sum(1 for nt1 in ntlist1 if nt1 in ntlist2))
                    value = max(MIN_DIST, nposoverlap / max(len(ntlist1), len(ntlist2)))
                else:
                    value = MAX_DIST
                # With bytype every id carries a single position, so the
                # value is symmetric and can be mirrored.
                motif_dist[i, j] = value
                motif_dist[j, i] = value

    return pos_motif_map, motif_ids, motif_dist

    """
Пример #9
0
def mock_data(sequences, structures=None, energy_mu=0.5, energy_sigma=0.5, obs_sigma=0.01, paired_sampling=lambda : SHAPE_paired_sample(), unpaired_sampling= lambda : SHAPE_unpaired_sample(), contact_sampling=lambda : SHAPE_contacts_diff_sample(), mutpos=None, c_size=3, return_steps=False, correlate_regions=False):
    """Generate simulated reactivity data for a set of sequences.

    With ``structures`` given, per-structure reactivities are sampled, mixed
    with weights derived from noised energies, optionally perturbed around
    mutation sites and their pairing partners, and finally corrupted with
    Gaussian noise of scale ``obs_sigma``. Without ``structures``, paired and
    unpaired samples are mixed per position using ``ss.partition``.

    Args:
        sequences: Sequences; all are indexed up to ``len(sequences[0])``.
        structures: Optional structures, one string per structure, with
            ``'.'`` marking unpaired positions.
        energy_mu: Mean of the Gaussian noise added to the energies.
        energy_sigma: Standard deviation of that noise.
        obs_sigma: Scale of the Gaussian observational noise.
        paired_sampling: Zero-argument callable sampling a paired reactivity.
        unpaired_sampling: Zero-argument callable sampling an unpaired one.
        contact_sampling: Zero-argument callable sampling a contact
            perturbation.
        mutpos: Optional per-measurement mutation positions; each entry is
            either a single position or a list/tuple of positions.
        c_size: Window size around a mutation site; offsets run over
            ``range(-(c_size - 1) // 2, (c_size - 1) // 2 + 1)``.
        return_steps: When True (and ``structures`` is given) return the
            intermediate quantities as well.
        correlate_regions: When True, consecutive positions with the same
            structure character share one sampled base value.

    Returns:
        The noised data matrix, or, with ``return_steps``, the tuple
        ``(dot(weights, reacts), data, data_noised, true_energies,
        weights_noised, reacts, obs_noise_sigmas)``.
    """
    if structures is None:
        # zeros() takes the shape as one sequence; a second positional
        # argument would be interpreted as the dtype.
        data = zeros([len(sequences), len(sequences[0])])
        for j, seq in enumerate(sequences):
            bppm = ss.partition(seq)
            unpaired_probs = 1 - bppm.sum(axis=0)
            for i, up in enumerate(unpaired_probs):
                data[j, i] = up*unpaired_sampling() + (1 - up) * paired_sampling()
        return data + obs_sigma * randn(data.shape[0], data.shape[1])

    MIN_REACT = 1e-5
    MAX_REACT = 4.5
    P_CONTACT = 1
    npos = len(sequences[0])

    print('Generating mock data')
    print('Getting "true" free energies (from RNAstructure)')
    true_energies = get_free_energy_matrix(structures, sequences)

    print('Energies')
    print(true_energies)

    # Add some noise to the energies
    noise = energy_sigma*randn(true_energies.shape[0], true_energies.shape[1]) + energy_mu
    noised_energies = true_energies + noise
    weights_noised = calculate_weights(noised_energies)
    weights = calculate_weights(true_energies)
    print('Weights')
    print(weights)
    print('Noised Weights')
    print(weights_noised)

    print('Generating mock reactivities')
    reacts = zeros([len(structures), npos])
    if correlate_regions:
        # prev_s is deliberately not reset between structures, matching the
        # established behaviour of this generator.
        prev_s = ''
        for j, s in enumerate(structures):
            for i in range(npos):
                if s[i] != prev_s:
                    if s[i] == '.':
                        curr_val = unpaired_sampling()
                    else:
                        curr_val = paired_sampling()
                # NOTE(review): this jitter calls SHAPE_contacts_diff_sample
                # directly rather than the contact_sampling argument -- confirm
                # whether that is intended.
                reacts[j, i] = max(MIN_REACT, curr_val + 0.01 * SHAPE_contacts_diff_sample())
                prev_s = s[i]
    else:
        prevstate = '.'
        for j, s in enumerate(structures):
            for i in range(npos):
                if s[i] == '.':
                    reacts[j, i] = unpaired_sampling()
                    prevstate = '.'
                else:
                    # Paired positions next to an unpaired one are scaled up,
                    # the remaining paired positions are scaled down.
                    next_to_unpaired = prevstate == '.' or (i < npos - 1 and s[i + 1] == '.')
                    reacts[j, i] = paired_sampling() * (1.5 if next_to_unpaired else 0.3)
                    prevstate = '('

    data = dot(weights_noised, reacts)
    data_orig = data.copy()

    if mutpos is not None and len(mutpos) > 0:
        print('Simulate diagonal and off-diagonal contact sites')
        max_tries = 1000

        def add_local_perturb(reactivity, weight):
            """Sample a perturbation that keeps reactivity within [MIN_REACT, MAX_REACT]."""
            dd = weight*contact_sampling()
            tries = 0
            while reactivity + dd < MIN_REACT or reactivity + dd > MAX_REACT:
                dd = weight*contact_sampling() * 0.1
                tries += 1
                if tries > max_tries:
                    # Give up sampling and return the delta that lands exactly
                    # on the violated bound.
                    if reactivity + dd > MAX_REACT:
                        return MAX_REACT - reactivity
                    if reactivity + dd < MIN_REACT:
                        return MIN_REACT - reactivity
            return dd

        bp_dicts = [ss.SecondaryStructure(dbn=struct).base_pair_dict() for struct in structures]
        ncols = data.shape[1]
        offsets = range(-(c_size - 1) // 2, (c_size - 1) // 2 + 1)
        for j in range(data.shape[0]):
            # Treat a single position and a collection of positions uniformly.
            if isinstance(mutpos[j], (list, tuple)):
                sites = mutpos[j]
            else:
                sites = [mutpos[j]]
            for k in offsets:
                for s in range(len(structures)):
                    for m in sites:
                        dd = None
                        site = m + k
                        # The lower bound stops negative indices from
                        # wrapping around to the end of the row.
                        if 0 <= site < ncols and rand() < P_CONTACT:
                            scale = 0.1 if structures[s][site] == '.' else 0.3
                            dd = scale * add_local_perturb(data_orig[j, site], weights_noised[j, s])
                            if k != 0:
                                dd *= 0.2
                            data[j, site] += dd
                        # The partner receives the same perturbation, and only
                        # when one was drawn for this site and offset.
                        if dd is not None and m in bp_dicts[s]:
                            partner = bp_dicts[s][m] + k
                            if 0 <= partner < ncols and rand() < P_CONTACT:
                                data[j, partner] += dd

    print('Adding observational noise')
    params = (0.10524313598815455, 0.034741986764665007)
    sigma_dist = stats.distributions.cauchy(loc=params[0], scale=params[1])
    # NOTE(review): these per-position sigmas are only reported through
    # return_steps; the noise actually applied below uses obs_sigma.
    obs_noise_sigmas = [max(sigma_dist.rvs(), 0.001) * 0.2 for _ in range(data.shape[1])]
    data_noised = data + randn(data.shape[0], data.shape[1]) * obs_sigma
    if return_steps:
        return dot(weights, reacts), data, data_noised, true_energies, weights_noised, reacts, obs_noise_sigmas
    return data_noised