def genes_from_sites(sites, rho):
    "Convert sampled sites to a data structure ready to pass to class Data."
    result = []
    for (_v_g, _z_g, x_g), rho_g in zip(sites, rho):
        _tmp = DictOf(list)
        for x_gi, rho_gi in zip(x_g, rho_g):
            _tmp[x_gi].append(rho_gi)
        result.append(list(_tmp.items()))
    return result
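# DictOf is a helper used throughout this code but not defined in this
# section. The sketch below is an assumption about its behaviour (a
# defaultdict-style mapping whose factory can optionally receive the missing
# key); it is illustrative only, not the project's actual implementation.
class DictOf(dict):
    "Dict that builds missing values on first access via a factory callable."

    def __init__(self, factory, take_key_as_arg=False):
        dict.__init__(self)
        self.factory = factory
        self.take_key_as_arg = take_key_as_arg

    def __missing__(self, key):
        # pass the key to the factory when requested,
        # e.g. DictOf(initial_counts, take_key_as_arg=True)
        value = self.factory(key) if self.take_key_as_arg else self.factory()
        self[key] = value
        return value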
def count_joint_features(data, num_features, num_values, pseudo_count=.35):
    """
    Counts the co-occurrence of values of pairs of features in the data.

    @arg data: Each datum in data is a list of integers. Each element in each list is
    the value of the corresponding feature or None for missing data.
    @return: A map from tuples of features (f1, f2) to arrays of counts: the entry for
    (f, f) holds the marginal counts for feature f, and the entry for (f1, f2) with
    f2 < f1 holds the joint counts indexed by (value of f1, value of f2).
    """
    def initial_counts(key):
        f1, f2 = key
        if f1 == f2:
            return N.zeros(num_values) + pseudo_count
        else:
            return N.zeros((num_values, num_values)) + 4. * pseudo_count
    result = DictOf(initial_counts, take_key_as_arg=True)
    for datum in data:
        if num_features != len(datum):
            raise RuntimeError(
                'Datum has wrong number (%d) of features (should be %d)'
                % (len(datum), num_features))
        for f1, v1 in enumerate(datum):
            if v1 is not None:
                assert v1 < num_values
                # update marginal counts for feature f1
                result[f1, f1][v1] += 1.
                for f2, v2 in enumerate(datum[:f1]):
                    if v2 is not None:
                        assert v2 < num_values
                        result[f1, f2][v1, v2] += 1.
    return result
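# A small, illustrative check of count_joint_features on toy data. This demo
# is not part of the original module; it assumes numpy is imported as N at
# module level, as the function above already requires.
def _demo_count_joint_features():
    "Hypothetical demo: three features, two possible values, None = missing."
    toy_data = [
        [0, 1, None],
        [1, 1, 0],
        [0, None, 0],
    ]
    counts = count_joint_features(toy_data, num_features=3, num_values=2)
    print counts[1, 1]   # marginal pseudo-counts for feature 1, shape (2,)
    print counts[1, 0]   # joint counts of feature 1 against feature 0, shape (2, 2)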
def __init__(self, options):
    "Construct."
    self.options = options
    "Options for the test harness."

    # put defaults in if not specified
    if not len(options.fragments):
        self.options.fragments = default_fragments
    if not len(options.backgrounds):
        self.options.backgrounds = default_backgrounds

    self.lazy_sequences = DictOf(load_sequences, take_key_as_arg=True)
    "Reads sequences lazily."

    self.lazy_sequences_for_hmm = DictOf(self.get_sequence_for_hmm, take_key_as_arg=True)
    "Converts sequences to HMM format lazily."
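# Note (not in the original source): both DictOf members above act as lazy,
# memoised caches: the loader is called with the key on first access and the
# result is reused afterwards, e.g. (hypothetical key)
#
#   seqs = harness.lazy_sequences[dataset]             # loads once, then cached
#   hmm_seqs = harness.lazy_sequences_for_hmm[dataset]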
def do_analysis(ucsc_promoters):
    "Analyse the promoters."
    logging.info('Analysing %d UCSC promoters with %d bases',
                 len(ucsc_promoters), num_bases_in_promoters(ucsc_promoters))
    sequence_analyser = analysis.get_sequence_analyser()
    ucsc_analysis = DictOf(list)
    for ensembl, promoters in ucsc_promoters.iteritems():
        for promoter in promoters:
            ucsc_analysis[ensembl].append(sequence_analyser(promoter))
    return ucsc_analysis
def group_output_files(directory):
    "Find all glam2 output files in a directory and group them by run."
    all_files = glob.glob(os.path.join(directory, 'glam2*.out'))
    basenames = map(os.path.basename, all_files)
    #logging.info('\n'.join(basenames))
    result = DictOf(set)
    for filename in basenames:
        fragment, cross_fold_index, seed = interpret_output_filename(filename)
        #logging.info(fragment, cross_fold_index, seed)
        result[(fragment, cross_fold_index)].add(seed)
    return result
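# Illustrative (hypothetical) consumer of group_output_files, showing how the
# grouped runs might be reported; it relies only on names already used above.
def log_seed_counts(directory):
    "Log how many seeds produced output for each (fragment, cross-fold) run."
    for (fragment, cross_fold_index), seeds in group_output_files(directory).iteritems():
        logging.info('%s fold %s: %d seeds', fragment, cross_fold_index, len(seeds))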
def get_promoters():
    "Load the promoters into a dict keyed by Ensembl gene."
    logging.info('Loading UCSC promoters')
    ucsc_promoters = DictOf(list)
    for fasta_file in get_fasta_files(options.ucsc_use_masked_seqs):
        for seq in corebio.seq_io.fasta_io.iterseq(open(fasta_file), dna_alphabet):
            ucsc, ensembl = seq.description.split()
            ucsc_promoters[ensembl].append(seq)
    logging.info('Loaded %d promoters', len(ucsc_promoters))
    return ucsc_promoters
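# Hypothetical driver sketch: the surrounding script presumably chains the two
# helpers above in roughly this way (get_promoters from this snippet,
# do_analysis from the earlier one).
def analyse_ucsc_promoters():
    "Load the UCSC promoters and run the sequence analyser over each one."
    ucsc_promoters = get_promoters()
    return do_analysis(ucsc_promoters)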
mouse_promoters = transpro.get_mouse_promoters()

for ref_type in ['MGI', 'ENTREZGENE', 'REFSEQ', 'UNIGENE', 'ENSMUSG']:
    print ref_type, len(filter(has_ref_fn(ref_type), mouse_promoters))
print map(mgi_id_for, mouse_promoters[:20])

mgi_to_ensembl = get_mgi_to_ensembl_map()

ensembl_promoters = DictOf(list)
for p in mouse_promoters:
    ensembl = ensembl_for(p)
    if ensembl:
        ensembl_promoters[ensembl].append(p)

sequence_analyser = get_sequence_analyser()
analysis = DictOf(list)
for ensembl, remos in ensembl_promoters.iteritems():
    for remo in remos:
        analysis[ensembl].append(sequence_analyser(remo.sequence))

fasta = open('mouse-promoters.fa', 'w')
for ensembl, remos in ensembl_promoters.iteritems():
    for i, remo in enumerate(remos):
numpy_seqs = map(seq_to_numpy, sequences)
logging.info('Loaded %d sequences', len(sequences))

logging.info('Parsing PSSMs: %s', model_file)
pssms = list(parse_models(open(model_file)))

logging.info('Building models')
models = [build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site) for parsed in pssms]

def nucleotide_dist():
    return numpy.zeros(4) + .25
base_dists = DictOf(nucleotide_dist)
min_site_length = 20

logging.info('Analysing sequences')
for hmm, traits in models:
    sites = []
    for sequence in numpy_seqs:
        # analyse the sequence for its most likely state sequence
        LL, states = hmm.viterbi(sequence)
        # for each site
        for site_seq, site_states in sites_from_states(states, sequence, traits.background_states):
            # is it long enough?
# Parse arguments
#
methods = args
logging.info('Methods are: %s', ' '.join(methods))
if not len(methods):
    raise RuntimeError('No methods specified on command line')

#
# Set up the test harness
#
harness = TestHarness(options)

#
# Merge scores
#
scores = DictOf(list)  # indexed by (method, fragment) or (method, fragment, bg)

def add_results(key, results):
    scores[key] += results
    scores[key].sort()

for method in methods:
    for fragment in harness.options.fragments:
        for fold in harness.folds():
            dataset = (fragment, fold)
            results = harness.results(dataset, method)
            add_results((method, fragment), results)
            add_results((method, 'Overall'), results)
            for bg in harness.options.backgrounds:
def get_matrix_by_name(name):
    for m in T.Matrix.all():
        if -1 != m.name.lower().find(name.lower()):
            return m
    return None

pastaa_matrices = {
    'muscle'    : ['SRF_Q5_01', 'SRF_01', 'SRF_Q5_02', 'SRF_C', 'MTATA_B'],
    'heart'     : ['MEF2_Q6_01', 'SRF_C', 'RSRFC4_01', 'MTATA_B', 'MEF2_02'],
    'liver'     : ['HNF4_Q6_01', 'HNF1_01', 'HNF4_01', 'HNF1_Q6', 'HNF1_C'],
    'retina'    : ['CRX_Q4', 'CHX10_01'],
    'leukocyte' : ['NFKAPPAB65_01', 'NFKAPPAB_01', 'NFKB_Q6_01', 'CREL_01', 'ETS_Q6'],
}

pssm_map = get_pssm_to_ensembl_map_min_range()
pastaa_gene_sets = DictOf(set)
for group, names in pastaa_matrices.iteritems():
    for name in names:
        m = get_matrix_by_name(name)
        if m:
            ensembl = pssm_map.get(str(m.acc), '<unknown>')
            if str(m.acc) in pssm_map:
                pastaa_gene_sets[group].add(ensembl)
            logging.info(
                '%20s : %16s = %s (%16s) - %s (%s)',
                group, name, m.acc, m.name, ensembl,