Exemplo n.º 1
0
def euclidean_distance(sites_a, sites_b):
    """Sum of per-column Euclidean distances between two sets of sites.

    A motif is built from each set of sites with ``bioutils.build_motif``.
    The columns returned by the two motifs' ``pwm()`` are paired with ``zip``
    (so pairing stops at the shorter motif).  For each pair the Euclidean
    distance over the bases A, C, G and T is computed, and the distances of
    all paired columns are added up.
    """
    motif_a = bioutils.build_motif(sites_a)
    motif_b = bioutils.build_motif(sites_b)

    total = 0
    for col_a, col_b in zip(motif_a.pwm(), motif_b.pwm()):
        squared_diffs = [(col_a[base] - col_b[base]) ** 2 for base in "ACGT"]
        total += math.sqrt(sum(squared_diffs))
    return total
Exemplo n.º 2
0
def euclidean_distance(sites_a, sites_b):
    """Euclidean distance between the motifs of two sets of sites.

    Both site sets are turned into motifs via ``bioutils.build_motif``; the
    columns yielded by each motif's ``pwm()`` are zipped together (pairing
    ends with the shorter motif).  The result is the sum, over the paired
    columns, of the Euclidean distance computed across A, C, G and T.
    """
    def column_distance(pair):
        # ``pair`` is one (column of motif A, column of motif B) tuple.
        left, right = pair
        return math.sqrt(sum((left[nt] - right[nt]) ** 2 for nt in "ACGT"))

    first = bioutils.build_motif(sites_a)
    second = bioutils.build_motif(sites_b)
    column_pairs = zip(first.pwm(), second.pwm())
    return sum(map(column_distance, column_pairs))
Exemplo n.º 3
0
def pearson_correlation_coefficient(sites_a, sites_b):
    """Pearson correlation coefficient between two sets of sites.

    A motif is built from each set of sites with ``bioutils.build_motif``.
    The columns returned by the two motifs' ``pwm()`` are paired with ``zip``
    (pairing stops at the shorter motif), the Pearson correlation coefficient
    of each pair is computed over the four bases, and the per-column
    coefficients are summed.

    A column in which all four values are equal has zero variance, so its
    coefficient is undefined.  Such a pair contributes 0.0 to the sum instead
    of raising ``ZeroDivisionError``.

    Returns:
        The sum of the per-column coefficients.
    """
    def pcc(cola, colb):
        # Mean of each column over the four bases.
        cola_avg = sum(cola[l] for l in "ACTG") / 4.0
        colb_avg = sum(colb[l] for l in "ACTG") / 4.0
        covariance = sum((cola[l] - cola_avg) * (colb[l] - colb_avg)
                         for l in "ACTG")
        spread = math.sqrt(sum((cola[l] - cola_avg) ** 2 for l in "ACTG") *
                           sum((colb[l] - colb_avg) ** 2 for l in "ACTG"))
        if spread == 0:
            # At least one column is constant: the coefficient is undefined,
            # so treat the pair as uncorrelated rather than dividing by zero.
            return 0.0
        return covariance / spread

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    return sum(pcc(cola, colb) for (cola, colb) in zip(ma.pwm(), mb.pwm()))
Exemplo n.º 4
0
def kullback_leibler_divergence(sites_a, sites_b):
    """Symmetrised Kullback-Leibler divergence between two sets of sites.

    Motifs are built from both site sets with ``bioutils.build_motif`` and
    the columns from their ``pwm()`` are paired with ``zip`` (pairing stops
    at the shorter motif).  For each pair the divergence is computed in both
    directions (base-2 logarithm) and averaged; the averages of all paired
    columns are summed.

    A ratio equal to zero contributes 0.0 instead of ``log(0)``.  The ratio
    itself is still computed with a plain division, so a zero value in the
    denominator is not guarded against here.
    """
    def log2_or_zero(value):
        if value != 0:
            return math.log(value, 2)
        return 0.0

    def directed(p, q):
        # Divergence of column ``p`` from column ``q``.
        return sum(p[base] * log2_or_zero(p[base] / q[base])
                   for base in "ACTG")

    def symmetric(col_a, col_b):
        return (directed(col_a, col_b) + directed(col_b, col_a)) / 2.0

    motif_a = bioutils.build_motif(sites_a)
    motif_b = bioutils.build_motif(sites_b)
    return sum(symmetric(col_a, col_b)
               for col_a, col_b in zip(motif_a.pwm(), motif_b.pwm()))
Exemplo n.º 5
0
def pearson_correlation_coefficient(sites_a, sites_b):
    """Pearson correlation coefficient between two sets of sites.

    Each set of sites is converted to a motif with ``bioutils.build_motif``;
    the columns from the motifs' ``pwm()`` are zipped together (pairing ends
    with the shorter motif).  The Pearson correlation coefficient of every
    column pair is computed over the four bases and the coefficients are
    summed.

    When either column of a pair is constant across the four bases the
    denominator is zero and the coefficient is undefined; that pair then
    contributes 0.0 rather than raising ``ZeroDivisionError``.

    Returns:
        The sum of the per-column coefficients.
    """
    def pcc(cola, colb):
        cola_avg = sum(cola[l] for l in "ACTG") / 4.0
        colb_avg = sum(colb[l] for l in "ACTG") / 4.0
        numerator = sum(((cola[l] - cola_avg) * (colb[l] - colb_avg))
                        for l in "ACTG")
        denominator = math.sqrt(
            sum((cola[l] - cola_avg)**2 for l in "ACTG") *
            sum((colb[l] - colb_avg)**2 for l in "ACTG"))
        # A zero denominator means a zero-variance column; report "no
        # correlation" for it instead of dividing by zero.
        return numerator / denominator if denominator != 0 else 0.0

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    return sum(pcc(cola, colb) for (cola, colb) in zip(ma.pwm(), mb.pwm()))
Exemplo n.º 6
0
def kullback_leibler_divergence(sites_a, sites_b):
    """Kullback-Leibler divergence between the motifs of two sets of sites.

    Both site sets are turned into motifs via ``bioutils.build_motif``.  The
    columns from each motif's ``pwm()`` are zipped (pairing ends with the
    shorter motif); for every pair the base-2 divergence is evaluated in
    both directions and the two values are averaged.  The function returns
    the sum of these averages.

    A ratio of exactly zero is mapped to a log of 0.0.  The division that
    forms the ratio is unguarded.
    """
    def safe_log2(ratio):
        return 0.0 if ratio == 0 else math.log(ratio, 2)

    motif_a = bioutils.build_motif(sites_a)
    motif_b = bioutils.build_motif(sites_b)

    total = 0
    for p, q in zip(motif_a.pwm(), motif_b.pwm()):
        forward = sum(p[nt] * safe_log2(p[nt] / q[nt]) for nt in "ACTG")
        backward = sum(q[nt] * safe_log2(q[nt] / p[nt]) for nt in "ACTG")
        total += (forward + backward) / 2.0
    return total
Exemplo n.º 7
0
def average_log_likelihood_ratio(sites_a, sites_b):
    """Average log-likelihood ratio (ALLR) between two sets of sites.

    A motif is built from each set of sites with ``bioutils.build_motif``.
    For every pair of aligned columns the counts of one motif are weighted
    by the base-2 log of the other motif's column value divided by 0.25
    (presumably a uniform background over the four bases), and the weighted
    sum is divided by the total count of both columns.  The per-column
    values are summed; ``zip`` stops at the shortest of the inputs.

    Returns:
        The sum of the per-column ALLR values.
    """
    def safe_log2(x):
        # A ratio of zero contributes 0.0 instead of raising on log(0).
        return math.log(x, 2) if x != 0 else 0.0

    def allr(cola, colb, cnta, cntb):
        return (sum((cnta[l] * safe_log2(colb[l] / 0.25) +
                     cntb[l] * safe_log2(cola[l] / 0.25))
                    for l in "ACTG") /
                sum(cnta[l] + cntb[l] for l in 'ACTG'))

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    # reformat biopython count matrices: one {base: count} dict per position.
    # ``range`` (not the Python 2-only ``xrange``) keeps this runnable on
    # both Python 2 and Python 3.
    counts_a = [dict((l, ma.counts[l][i]) for l in "ACTG")
                for i in range(ma.length)]
    counts_b = [dict((l, mb.counts[l][i]) for l in "ACTG")
                for i in range(mb.length)]
    return sum(allr(cola, colb, cnta, cntb)
               for (cola, colb, cnta, cntb)
               in zip(ma.pwm(), mb.pwm(), counts_a, counts_b))
Exemplo n.º 8
0
def average_log_likelihood_ratio(sites_a, sites_b):
    """Average log-likelihood ratio (ALLR) distance between two site sets.

    Both site sets are converted to motifs with ``bioutils.build_motif``.
    Each pair of aligned columns is scored by weighting the counts of one
    motif with the base-2 log of the other motif's column value over 0.25
    (presumably a uniform background frequency), then dividing by the
    combined count of the two columns.  The column scores are summed, with
    ``zip`` stopping at the shortest input.

    Returns:
        The sum of the per-column ALLR scores.
    """
    def safe_log2(x):
        # log2 with log(0) replaced by 0.0.
        return math.log(x, 2) if x != 0 else 0.0

    def allr(cola, colb, cnta, cntb):
        weighted = sum((cnta[l] * safe_log2(colb[l] / 0.25) +
                        cntb[l] * safe_log2(cola[l] / 0.25))
                       for l in "ACTG")
        total_count = sum(cnta[l] + cntb[l] for l in 'ACTG')
        return weighted / total_count

    def column_counts(motif):
        # reformat biopython count matrices into a list with one
        # {base: count} dict per motif position.  Uses ``range`` rather than
        # ``xrange`` so the code also runs on Python 3.
        return [
            dict((l, motif.counts[l][i]) for l in "ACTG")
            for i in range(motif.length)
        ]

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    counts_a = column_counts(ma)
    counts_b = column_counts(mb)
    return sum(
        allr(cola, colb, cnta, cntb)
        for (cola, colb, cnta, cntb)
        in zip(ma.pwm(), mb.pwm(), counts_a, counts_b))
Exemplo n.º 9
0
def export_PSFM(meta_sites, **kwargs):
    """Export a position-specific frequency matrix as text.

    The site instances (``m[0].site_instance`` of every meta-site) are
    aligned with ``bioutils.run_lasagna`` and turned into a motif with
    ``bioutils.build_motif``.  The header is made of the distinct TF names
    and organism names (whitespace replaced by ``_``) found in the rows
    returned by ``export_base(meta_sites)``, each joined with commas.

    ``kwargs['format']`` is required and selects the layout: ``'JASPAR'``,
    ``'TRANSFAC'`` or ``'raw_fasta'``.  Any other value produces an empty
    string.

    Returns:
        The exported matrix as one newline-joined string.
    """
    fmt = kwargs['format']
    rows = export_base(meta_sites)
    site_instances = [meta_site[0].site_instance for meta_site in meta_sites]
    aligned_sites = bioutils.run_lasagna(site_instances)
    motif = bioutils.build_motif(aligned_sites)
    consensus = bioutils.degenerate_consensus(motif)

    tf_names = set(row['curation__TF__name'] for row in rows)
    TF_name = ','.join(tf_names)
    organisms = set('_'.join(row['site_instance__genome__organism'].split())
                    for row in rows)
    sp = ','.join(organisms)

    output = []
    if fmt == 'JASPAR':
        output.append('> CollecTF_%s_%s' % (TF_name, sp))
        output.extend([
            'A [ %s ]' % ' '.join(str(n) for n in motif.counts['A']),
            'C [ %s ]' % ' '.join(str(n) for n in motif.counts['C']),
            'G [ %s ]' % ' '.join(str(n) for n in motif.counts['G']),
            'T [ %s ]' % ' '.join(str(n) for n in motif.counts['T']),
        ])
    elif fmt == 'TRANSFAC':
        output.append('ID %s' % TF_name)
        output.append('BF %s' % sp)
        output.append('PO\tA\tC\tG\tT')
        for pos in range(motif.length):
            # One row per position: 1-based index, four counts, consensus.
            output.append('%02d\t%d\t%d\t%d\t%d\t%s' % (
                pos + 1,
                motif.counts['A'][pos],
                motif.counts['C'][pos],
                motif.counts['G'][pos],
                motif.counts['T'][pos],
                consensus[pos]))
        output.append('XX')
    elif fmt == 'raw_fasta':
        output.append('>CollecTF_%s_%s' % (TF_name, sp))
        for pos in range(motif.length):
            output.append('%d\t%d\t%d\t%d' % (
                motif.counts['A'][pos],
                motif.counts['C'][pos],
                motif.counts['G'][pos],
                motif.counts['T'][pos]))

    return '\n'.join(output)
Exemplo n.º 10
0
def export_PSFM(meta_sites, **kwargs):
    """Render the motif of the given meta-sites as a frequency-matrix text.

    Site instances are taken from ``m[0].site_instance`` of each meta-site,
    aligned through ``bioutils.run_lasagna`` and converted to a motif with
    ``bioutils.build_motif``.  TF names and organism names (with whitespace
    collapsed to ``_``) are collected from ``export_base(meta_sites)`` rows,
    de-duplicated and comma-joined for the header lines.

    The mandatory ``format`` keyword argument chooses between the
    ``'JASPAR'``, ``'TRANSFAC'`` and ``'raw_fasta'`` layouts; an unrecognised
    value yields an empty string.

    Returns:
        All output lines joined by newlines.
    """
    fmt = kwargs['format']
    rows = export_base(meta_sites)
    aligned = bioutils.run_lasagna(
        [entry[0].site_instance for entry in meta_sites])
    motif = bioutils.build_motif(aligned)
    consensus = bioutils.degenerate_consensus(motif)

    TF_name = ','.join(set(row['curation__TF__name'] for row in rows))
    organism_names = set(
        '_'.join(row['site_instance__genome__organism'].split())
        for row in rows)
    sp = ','.join(organism_names)

    def counts_at(position):
        # Counts for one motif position in A, C, G, T order.
        return (motif.counts['A'][position], motif.counts['C'][position],
                motif.counts['G'][position], motif.counts['T'][position])

    def joined_counts(base):
        return ' '.join(map(str, motif.counts[base]))

    result = []
    if fmt == 'JASPAR':
        result.append('> CollecTF_%s_%s' % (TF_name, sp))
        result.append('A [ %s ]' % joined_counts('A'))
        result.append('C [ %s ]' % joined_counts('C'))
        result.append('G [ %s ]' % joined_counts('G'))
        result.append('T [ %s ]' % joined_counts('T'))
    elif fmt == 'TRANSFAC':
        result.append('ID %s' % TF_name)
        result.append('BF %s' % sp)
        result.append('PO\tA\tC\tG\tT')
        for index in range(motif.length):
            fields = (index + 1,) + counts_at(index) + (consensus[index],)
            result.append('%02d\t%d\t%d\t%d\t%d\t%s' % fields)
        result.append('XX')
    elif fmt == 'raw_fasta':
        result.append('>CollecTF_%s_%s' % (TF_name, sp))
        for index in range(motif.length):
            result.append('%d\t%d\t%d\t%d' % counts_at(index))

    return '\n'.join(result)