示例#1
0
def score_sequences(sequences, pssms, bg_types, p_binding_sites):
    """
    Score the positive and negative sequence sets of every fold with that
    fold's PSSMs.

    `pssms` is iterated as ((dataset, cross_fold_index), pssms_for_fold)
    pairs. For each pair the positive sequence set, and one negative sequence
    set per entry of `bg_types`, are looked up in `sequences` using the keys
    built by positive_sequences_key() / negative_sequences_key().

    Returns a cookbook.DictOfLists keyed by those sequence keys; each value
    holds one score_sequence_set() result per entry of `p_binding_sites`, in
    the order the entries are given.
    """
    results = cookbook.DictOfLists()

    def score_for_each_prior(key, sequence_set, fold_pssms):
        # One result per binding-site prior, appended in the given order.
        for prior in p_binding_sites:
            results[key].append(
                score_sequence_set(sequence_set, fold_pssms, prior))

    for fold_id, fold_pssms in pssms.iteritems():
        dataset, cross_fold_index = fold_id

        # Positive sequences for this dataset/fold.
        pos_key = positive_sequences_key(dataset, cross_fold_index)
        pos_seqs = sequences[pos_key]
        logging.info(
            'Analysing %d positive sequences (%d bases) for %s-%d',
            len(pos_seqs),
            sum(len(seq) for seq in pos_seqs),
            dataset,
            cross_fold_index)
        score_for_each_prior(pos_key, pos_seqs, fold_pssms)

        # Negative sequences: one set per background type.
        for bg_type in bg_types:
            neg_key = negative_sequences_key(
                dataset, cross_fold_index, bg_type)
            neg_seqs = sequences[neg_key]
            logging.info(
                'Analysing %d negative sequences (%d bases) for %s-%d-%s',
                len(neg_seqs),
                sum(len(seq) for seq in neg_seqs),
                dataset,
                cross_fold_index,
                bg_type)
            score_for_each_prior(neg_key, neg_seqs, fold_pssms)

    return results
示例#2
0
文件: go.py 项目: pombredanne/biopsy
def get_all_ensembl_go_annotations():
    """
    Query Ensembl biomart for the GO annotations of every gene in the
    'mmusculus_gene_ensembl' dataset.

    One query is made per GO id / linkage-type attribute pair. Rows whose
    linkage type is in options.go_evidence_codes_to_ignore, or whose GO id is
    empty, are skipped.

    @return: A map from ensembl gene ids to lists of GO ids (built with
    append(), in the order the rows are returned).
    """
    import biopsy.identifiers.biomart as biomart
    logging.info('Querying Ensembl biomart for all GO annotations')
    annotations = cookbook.DictOfLists()
    attribute_pairs = (
        ('go_biological_process_id', 'go_biological_process_linkage_type'),
        ('go_cellular_component_id', 'go_cellular_component_linkage_type'),
        ('go_molecular_function_id', 'go_molecular_function_linkage_type'),
    )
    for go_id_attr, linkage_attr in attribute_pairs:
        rows = biomart.quick_query(
            dataset='mmusculus_gene_ensembl',
            attributes=['ensembl_gene_id', go_id_attr, linkage_attr]
        )
        for row in rows:
            # Columns follow the attribute order: gene id, GO id, linkage type.
            if row[2] in options.go_evidence_codes_to_ignore:
                continue
            if not row[1]:
                continue
            annotations[row[0]].append(row[1])
    total = sum(len(go_ids) for go_ids in annotations.values())
    logging.info('Found %d go annotations', total)
    return annotations
示例#3
0
文件: go.py 项目: pombredanne/biopsy
def get_ensembl_go_annotations(genes):
    """
    Query Ensembl biomart for the GO annotations of the given genes.

    One query is built per GO id / linkage-type attribute pair against the
    'mmusculus_gene_ensembl' dataset. The genes are sent in chunks of 50 by
    rewriting the value of the 'ensembl_gene_id' filter for each chunk.

    Rows whose linkage type is in options.go_evidence_codes_to_ignore are
    skipped. Rows with an empty GO id are skipped as well, matching
    get_all_ensembl_go_annotations(), so that empty strings are neither
    stored as annotations nor counted in the logged total.

    @param genes: A sized collection of genes; each is converted with str()
    to form the filter value.
    @return: A map from the given genes to lists of GO ids (built with
    append(), in the order the rows are returned).
    """
    import biopsy.identifiers.biomart as biomart
    logging.info('Querying Ensembl biomart for GO annotations of %d genes', len(genes))
    result = cookbook.DictOfLists()
    for id_attr, evidence_attr in [
        ('go_biological_process_id', 'go_biological_process_linkage_type'),
        ('go_cellular_component_id', 'go_cellular_component_linkage_type'),
        ('go_molecular_function_id', 'go_molecular_function_linkage_type'),
    ]:
        query = biomart.new_query()
        dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
        biomart.add_attribute(dataset, 'ensembl_gene_id')
        biomart.add_attribute(dataset, id_attr)
        biomart.add_attribute(dataset, evidence_attr)
        # Named gene_filter rather than filter to avoid shadowing the builtin.
        gene_filter = biomart.add_filter(dataset, name='ensembl_gene_id', value='')
        for chunk in biomart.split_big_list((str(g) for g in genes), 50):
            # Reuse the same query, swapping in this chunk's gene ids.
            gene_filter.set('value', ','.join(chunk))
            for row in biomart.yield_csv_query_results(query):
                # Columns follow the attribute order: gene id, GO id, linkage type.
                if row[2] not in options.go_evidence_codes_to_ignore and row[1]:
                    result[row[0]].append(row[1])
    logging.info('Found %d go annotations', sum(len(v) for v in result.values()))
    return result
示例#4
0
        f = f_ref.link.entry
        if f_ref.species.startswith('mouse') or (f.gene and is_mouse_gene(f.gene.entry)):
            for r in refs_for_factor(f):
                yield r

def names_for_pssm(m):
    """
    Yield every name associated with the factors of m.

    For each reference in m.factors, the linked entry's name is yielded
    first, followed by each of the entry's synonyms in order.
    """
    linked_entries = (factor_ref.link.entry for factor_ref in m.factors)
    for entry in linked_entries:
        yield entry.name
        for synonym in entry.synonyms:
            yield synonym

if '__main__' == __name__:
    import biopsy.transfac as T

    # Per-matrix accumulators, both keyed by the matrix accession (m.acc).
    matrix_references = cookbook.DictOfLists()
    matrix_names = cookbook.DictOfLists()

    # Count the matrices during the single pass instead of evaluating
    # T.Matrix.all() a second time just to take its length.
    num_matrices = 0
    for m in T.Matrix.all():
        num_matrices += 1
        acc = m.acc
        for r in refs_for_pssm(m):
            matrix_references[acc].append(r)
        for n in names_for_pssm(m):
            matrix_names[acc].append(n)

    # Disabled: the equivalent collection of references for sites.
    #for m in T.Site.all():
    #       refs = [r for r in refs_for_pssm(m)]
    #       if len(refs):
    #               references[m] = refs

    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('Found references for %4d of %4d matrices' % (len(matrix_references), num_matrices))