示例#1
0
def genes_from_sites(sites, rho):
    """
    Convert sampled sites to the data structure ready to pass to class Data.

    @arg sites: One 3-element entry per gene; only the third element (a sequence
    of x values) is used here.
    @arg rho: One sequence per gene, paired element-wise with that gene's x values.
    @return: One list per gene of (x, [rho values seen with that x]) pairs.
    """
    genes = []
    for site_triple, gene_rho in zip(sites, rho):
        # Unpack all three elements so an entry of the wrong length still fails.
        _unused_v, _unused_z, gene_x = site_triple
        rho_by_x = DictOf(list)
        for x_value, rho_value in zip(gene_x, gene_rho):
            rho_by_x[x_value].append(rho_value)
        genes.append(list(rho_by_x.items()))
    return genes
示例#2
0
def count_joint_features(data, num_features, num_values, pseudo_count=.35):
    """
    Counts the co-occurrence of values of pairs of features in the data.

    @arg data: Each datum in data is a list of integers. Each element in each list is the value
    of the corresponding feature or None for missing data.
    @arg num_features: The number of elements every datum must have.
    @arg num_values: Exclusive upper bound on feature values; also the size of each
    axis of the count arrays.
    @arg pseudo_count: Starting value of every marginal count. Every joint count
    starts at 4 times this value.
    @return: A map keyed by feature pairs. Key (f, f) holds a 1-d array of marginal
    counts indexed by value. Key (f1, f2) with f2 < f1 holds a 2-d array of joint
    counts indexed by [v1, v2]. Only keys touched while counting are populated here.
    @raise RuntimeError: If a datum does not have num_features elements.
    """

    def initial_counts(key):
        # Marginal counts live on the diagonal keys, joint counts on the others.
        f1, f2 = key
        if f1 == f2:
            return N.zeros(num_values) + pseudo_count
        return N.zeros((num_values, num_values)) + 4. * pseudo_count

    result = DictOf(initial_counts, take_key_as_arg=True)

    for datum in data:
        if num_features != len(datum):
            raise RuntimeError('Datum has wrong number (%d) of features (should be %d)' % (len(datum), num_features))
        for f1, v1 in enumerate(datum):
            if v1 is None:
                # missing value: contributes to neither marginal nor joint counts
                continue
            # The lower bound matters: a negative value would otherwise wrap
            # around and silently be counted in the wrong bin.
            assert 0 <= v1 < num_values
            # update marginal prob of f1 feature
            result[f1, f1][v1] += 1.
            # pair f1 with every earlier feature, so each unordered pair is counted once
            for f2, v2 in enumerate(datum[:f1]):
                if v2 is None:
                    continue
                assert 0 <= v2 < num_values
                result[f1, f2][v1, v2] += 1.

    return result
示例#3
0
    def __init__(self, options):
        "Construct the harness from its options, filling in defaults and lazy lookups."

        self.options = options
        "Options for the test harness."

        # an empty fragments or backgrounds option is replaced by its default
        if len(options.fragments) == 0:
            options.fragments = default_fragments
        if len(options.backgrounds) == 0:
            options.backgrounds = default_backgrounds

        # both lookups below hand the requested key to their factory function
        self.lazy_sequences = DictOf(load_sequences, take_key_as_arg=True)
        "Reads sequences lazily."

        self.lazy_sequences_for_hmm = DictOf(self.get_sequence_for_hmm, take_key_as_arg=True)
        "Converts sequences to HMM format lazily."
示例#4
0
def do_analysis(ucsc_promoters):
    """
    Analyse the promoters.

    Runs the analyser returned by analysis.get_sequence_analyser() over every
    promoter and returns the results in a DictOf(list) under the same keys as
    ucsc_promoters. A key whose promoter collection is empty gets no entry.
    """
    logging.info('Analysing %d UCSC promoters with %d bases',
                 len(ucsc_promoters), num_bases_in_promoters(ucsc_promoters))
    analyse = analysis.get_sequence_analyser()
    results_by_gene = DictOf(list)
    for gene_id, gene_promoters in ucsc_promoters.iteritems():
        for single_promoter in gene_promoters:
            outcome = analyse(single_promoter)
            results_by_gene[gene_id].append(outcome)
    return results_by_gene
示例#5
0
def group_output_files(directory):
    """
    Find all glam2 output files in a directory and group them by run.

    @arg directory: Directory searched (non-recursively) for 'glam2*.out' files.
    @return: A DictOf(set) mapping (fragment, cross_fold_index) to the set of
    seeds, as parsed from each file name by interpret_output_filename.
    """
    pattern = os.path.join(directory, 'glam2*.out')
    seeds_by_run = DictOf(set)
    for path in glob.glob(pattern):
        # only the file name, not its directory, encodes the run parameters
        parsed = interpret_output_filename(os.path.basename(path))
        fragment, cross_fold_index, seed = parsed
        seeds_by_run[(fragment, cross_fold_index)].add(seed)
    return seeds_by_run
示例#6
0
def get_promoters():
    """
    Load the promoters into a dict keyed by ensembl gene.

    Reads every file returned by get_fasta_files(options.ucsc_use_masked_seqs).
    Each sequence description must consist of exactly two whitespace-separated
    tokens; the second one is used as the key.

    @return: A DictOf(list) mapping the key to the list of sequences read for it.
    @raise ValueError: If a description does not split into exactly two tokens.
    """
    logging.info('Loading UCSC promoters')
    ucsc_promoters = DictOf(list)
    for fasta_file in get_fasta_files(options.ucsc_use_masked_seqs):
        handle = open(fasta_file)
        try:
            for seq in corebio.seq_io.fasta_io.iterseq(handle, dna_alphabet):
                # the first token is not needed here; only its presence is checked
                _ucsc, ensembl = seq.description.split()
                ucsc_promoters[ensembl].append(seq)
        finally:
            # close each file as soon as it has been read, even if parsing fails
            handle.close()
    # NOTE: this logs the number of distinct keys, not the number of sequences.
    logging.info('Loaded %d promoters', len(ucsc_promoters))
    return ucsc_promoters
示例#7
0
    mouse_promoters = transpro.get_mouse_promoters()

    for ref_type in [
        'MGI',
        'ENTREZGENE',
        'REFSEQ',
        'UNIGENE',
        'ENSMUSG'
    ]:
        print ref_type, len(filter(has_ref_fn(ref_type), mouse_promoters))

    print map(mgi_id_for, mouse_promoters[:20])

    mgi_to_ensembl = get_mgi_to_ensembl_map()

    ensembl_promoters = DictOf(list)
    for p in mouse_promoters:
        ensembl = ensembl_for(p)
        if ensembl:
            ensembl_promoters[ensembl].append(p)

    sequence_analyser = get_sequence_analyser()
    analysis = DictOf(list)
    for ensembl, remos in ensembl_promoters.iteritems():
        for remo in remos:
            analysis[ensembl].append(sequence_analyser(remo.sequence))


    fasta = open('mouse-promoters.fa', 'w')
    for ensembl, remos in ensembl_promoters.iteritems():
        for i, remo in enumerate(remos):
示例#8
0
    numpy_seqs = map(seq_to_numpy, sequences)
    logging.info('Loaded %d sequences', len(sequences))

    logging.info('Parsing PSSMs: %s', model_file)
    pssms = list(parse_models(open(model_file)))

    logging.info('Building models')
    models = [
        build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site)
        for parsed in pssms
    ]

    def nucleotide_dist():
        "Return a fresh length-4 float array with every entry equal to 0.25."
        uniform = numpy.ones(4)
        uniform *= .25
        return uniform

    base_dists = DictOf(nucleotide_dist)

    min_site_length = 20
    logging.info('Analysing sequences')
    for hmm, traits in models:
        sites = []
        for sequence in numpy_seqs:

            # analyse the sequence for its most likely state sequence
            LL, states = hmm.viterbi(sequence)

            # for each site
            for site_seq, site_states in sites_from_states(
                    states, sequence, traits.background_states):

                # is it long enough?
示例#9
0
# Parse arguments
#
methods = args
logging.info('Methods are: %s', ' '.join(methods))
if len(methods) == 0:
    raise RuntimeError('No methods specified on command line')

#
# Set up the test harness
#
harness = TestHarness(options)

#
# Merge scores
#
# indexed by (method, fragment) or (method, fragment, bg)
scores = DictOf(list)


def add_results(key, results):
    "Append results to the scores stored under key and keep that list sorted."
    merged = scores[key]
    merged.extend(results)
    merged.sort()


for method in methods:
    for fragment in harness.options.fragments:
        for fold in harness.folds():
            dataset = (fragment, fold)
            results = harness.results(dataset, method)
            add_results((method, fragment), results)
            add_results((method, 'Overall'), results)
            for bg in harness.options.backgrounds:
示例#10
0
def get_matrix_by_name(name):
    """
    Return the first matrix from T.Matrix.all() whose name contains the given
    name, ignoring case, or None if there is no such matrix.
    """
    for candidate in T.Matrix.all():
        haystack = candidate.name.lower()
        if name.lower() in haystack:
            return candidate
    return None

# Matrix names to look up for each group. Each name is resolved with
# get_matrix_by_name, i.e. by case-insensitive substring match, so it does
# not have to be a complete matrix name.
pastaa_matrices = {
    'muscle'    : [ 'SRF_Q5_01', 'SRF_01', 'SRF_Q5_02', 'SRF_C', 'MTATA_B' ],
    'heart'     : [ 'MEF2_Q6_01', 'SRF_C', 'RSRFC4_01', 'MTATA_B', 'MEF2_02' ],
    'liver'     : [ 'HNF4_Q6_01', 'HNF1_01', 'HNF4_01', 'HNF1_Q6', 'HNF1_C', ],
    'retina'    : [ 'CRX_Q4', 'CHX10_01' ],
    'leukocyte' : [ 'NFKAPPAB65_01', 'NFKAPPAB_01', 'NFKB_Q6_01', 'CREL_01', 'ETS_Q6' ]
}

# Queried below with str(m.acc) as the key.
pssm_map = get_pssm_to_ensembl_map_min_range()
# group name -> set of the pssm_map values found for that group's matrices
pastaa_gene_sets = DictOf(set)

for group, names in pastaa_matrices.iteritems():
    for name in names:
        m = get_matrix_by_name(name)
        if m:
            ensembl = pssm_map.get(str(m.acc), '<unknown>')
            if str(m.acc) in pssm_map:
                pastaa_gene_sets[group].add(ensembl)
            logging.info(
                '%20s : %16s = %s (%16s) - %s (%s)',
                group,
                name,
                m.acc,
                m.name,
                ensembl,