def score_sequences(sequences, pssms, bg_types, p_binding_sites):
    """
    Score the positive and negative sequence sets of every fold with that fold's PSSMs.

    For each ((dataset, cross_fold_index), pssms_for_fold) item of pssms the positive
    sequence set is scored first, then one negative sequence set per background type.
    Every sequence set is scored once per entry of p_binding_sites, in order.

    @arg sequences: Indexable by the keys made by positive_sequences_key() and
        negative_sequences_key(); each value is a collection of sequences.
    @arg pssms: Mapping from (dataset, cross_fold_index) to the PSSMs for that fold.
    @arg bg_types: The background types that select the negative sequence sets.
    @arg p_binding_sites: The values passed as last argument to score_sequence_set().
        It is iterated once per sequence set, so it must be re-iterable.
    @return: A cookbook.DictOfLists that maps each sequence set key to the list of
        score_sequence_set() results, one per entry of p_binding_sites.
    """
    all_scores = cookbook.DictOfLists()

    def score_for_each_prior(key, sequence_set, fold_pssms):
        # One result per binding site value, appended in the order given.
        for p_binding_site in p_binding_sites:
            all_scores[key].append(
                score_sequence_set(sequence_set, fold_pssms, p_binding_site))

    for fold_key, pssms_for_fold in pssms.iteritems():
        dataset, cross_fold_index = fold_key

        # The positive sequences for this fold.
        positive_key = positive_sequences_key(dataset, cross_fold_index)
        positives = sequences[positive_key]
        num_sequences = len(positives)
        num_bases = sum(len(s) for s in positives)
        logging.info(
            'Analysing %d positive sequences (%d bases) for %s-%d',
            num_sequences, num_bases, dataset, cross_fold_index)
        score_for_each_prior(positive_key, positives, pssms_for_fold)

        # The negative sequences for this fold, one set per background type.
        for bg_type in bg_types:
            negative_key = negative_sequences_key(dataset, cross_fold_index, bg_type)
            negatives = sequences[negative_key]
            num_sequences = len(negatives)
            num_bases = sum(len(s) for s in negatives)
            logging.info(
                'Analysing %d negative sequences (%d bases) for %s-%d-%s',
                num_sequences, num_bases, dataset, cross_fold_index, bg_type)
            score_for_each_prior(negative_key, negatives, pssms_for_fold)

    return all_scores
def get_all_ensembl_go_annotations():
    """
    Query biomart for every GO annotation in the 'mmusculus_gene_ensembl' dataset.

    One query is made per (GO id attribute, evidence attribute) pair. A row is dropped
    when its evidence code is in options.go_evidence_codes_to_ignore or when its GO id
    is empty.

    @return: A map from ensembl genes to lists of go annotations (a cookbook.DictOfLists
        keyed by the first column of each row).
    """
    import biopsy.identifiers.biomart as biomart

    logging.info('Querying Ensembl biomart for all GO annotations')

    # The GO id attribute and its matching evidence code attribute for each ontology.
    attribute_pairs = [
        ('go_biological_process_id', 'go_biological_process_linkage_type'),
        ('go_cellular_component_id', 'go_cellular_component_linkage_type'),
        ('go_molecular_function_id', 'go_molecular_function_linkage_type'),
    ]

    annotations = cookbook.DictOfLists()
    for go_id_attr, linkage_attr in attribute_pairs:
        rows = biomart.quick_query(
            dataset='mmusculus_gene_ensembl',
            attributes=['ensembl_gene_id', go_id_attr, linkage_attr]
        )
        for row in rows:
            # Columns come back in the order the attributes were requested.
            if row[2] in options.go_evidence_codes_to_ignore or not row[1]:
                continue
            annotations[row[0]].append(row[1])

    num_annotations = sum(len(go_ids) for go_ids in annotations.values())
    logging.info('Found %d go annotations', num_annotations)
    return annotations
def get_ensembl_go_annotations(genes):
    """
    Query biomart for the GO annotations of the given genes.

    One query is built per (GO id attribute, evidence attribute) pair on the
    'mmusculus_gene_ensembl' dataset. Each query is run once per chunk of at most 50
    genes, with the chunk passed as a comma-separated 'ensembl_gene_id' filter value.

    A row is dropped when its evidence code is in options.go_evidence_codes_to_ignore
    or when its GO id is empty. The empty-id check matches
    get_all_ensembl_go_annotations() and keeps '' from being stored as an annotation.

    @arg genes: A sized collection of genes; each is converted with str() to build the
        filter value.
    @return: A map from the given genes to lists of go annotations (a
        cookbook.DictOfLists keyed by the first column of each row). Genes with no
        retained annotation have no entry.
    """
    import biopsy.identifiers.biomart as biomart

    logging.info('Querying Ensembl biomart for GO annotations of %d genes', len(genes))
    result = cookbook.DictOfLists()
    for id_attr, evidence_attr in [
        ('go_biological_process_id', 'go_biological_process_linkage_type'),
        ('go_cellular_component_id', 'go_cellular_component_linkage_type'),
        ('go_molecular_function_id', 'go_molecular_function_linkage_type'),
    ]:
        # Build the query once; only the filter value changes between chunks.
        query = biomart.new_query()
        dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
        biomart.add_attribute(dataset, 'ensembl_gene_id')
        biomart.add_attribute(dataset, id_attr)
        biomart.add_attribute(dataset, evidence_attr)
        # Named gene_filter so the builtin filter() is not shadowed.
        gene_filter = biomart.add_filter(dataset, name='ensembl_gene_id', value='')

        # Query in chunks of 50 genes rather than sending one huge filter value.
        for chunk in biomart.split_big_list((str(g) for g in genes), 50):
            gene_filter.set('value', ','.join(chunk))
            for row in biomart.yield_csv_query_results(query):
                # Columns come back in the order the attributes were added.
                if row[2] not in options.go_evidence_codes_to_ignore and row[1]:
                    result[row[0]].append(row[1])

    logging.info('Found %d go annotations', sum(len(v) for v in result.values()))
    return result
f = f_ref.link.entry if f_ref.species.startswith('mouse') or (f.gene and is_mouse_gene(f.gene.entry)): for r in refs_for_factor(f): yield r def names_for_pssm(m): for f_ref in m.factors: f = f_ref.link.entry yield f.name for s in f.synonyms: yield s if '__main__' == __name__: import biopsy.transfac as T matrix_references = cookbook.DictOfLists() matrix_names = cookbook.DictOfLists() for m in T.Matrix.all(): acc = m.acc for r in refs_for_pssm(m): matrix_references[acc].append(r) for n in names_for_pssm(m): matrix_names[acc].append(n) #for m in T.Site.all(): # refs = [r for r in refs_for_pssm(m)] # if len(refs): # references[m] = refs print 'Found references for %4d of %4d matrices' % (len(matrix_references), len(T.Matrix.all()))