def fit_sample_noabstr(sequences, argz, random_state=random.random()):
    '''
    Fit a sampler on named sequences and return freshly sampled ones.

    sequences: iterable of (NAME, SEQUENCE) pairs; it is copied into a list
        because it is walked twice (once for fitting, once for seeding).
    argz: mapping that provides 'radius_list', 'thickness_list',
        'mincip_count' and 'min_interfacecount'.
    random_state: accepted for interface compatibility; this function never
        reads it.

    Returns a list holding element [1] of every item the sampler produced.
    '''
    named_sequences = list(sequences)

    # with nu=.5 it also works for the fewer ones..
    one_class_estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=1)

    # Read the grammar settings up front, in the order the sampler takes them.
    radii = argz['radius_list']
    thicknesses = argz['thickness_list']
    min_cips = argz['mincip_count']
    min_interfaces = argz['min_interfacecount']

    sampler = rna.AbstractSampler(
        radius_list=radii,
        thickness_list=thicknesses,
        min_cip_count=min_cips,
        min_interface_count=min_interfaces,
        preprocessor=rnana.PreProcessor(base_thickness_list=[1],
                                        ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=one_class_estimator)
    sampler.fit(named_sequences, grammar_n_jobs=1, grammar_batch_size=1)

    # The grammar statistics are queried but deliberately not logged here.
    n_instances, n_interfaces, n_cores, n_cips = sampler.grammar().size()

    # Only the sequence part of each (name, sequence) pair seeds the sampler.
    seeds = [sequence for _name, sequence in named_sequences]
    sampling_options = dict(
        n_samples=5,
        batch_size=1,
        n_steps=55,
        n_jobs=1,
        quick_skip_orig_cip=True,
        probabilistic_core_choice=False,
        burnin=6,
        improving_threshold=0.5,
        improving_linear_start=0.15,
        max_size_diff=6,
        accept_min_similarity=0.55,
        select_cip_max_tries=30,
        keep_duplicates=False,
        include_seed=False,
        backtrack=2,
        monitor=False)
    sampled_batches = sampler.sample(seeds, **sampling_options)

    # Flatten the per-seed batches and keep the second field of each item.
    return [item[1] for batch in sampled_batches for item in batch]
# Example no. 2
0
def oneclasstest_fraction(fraction=0.1, repeats=2):
    '''
    Train a one-class estimator on a random fraction of the data and print
    score statistics, split by the shape of each training instance.

    fraction: share of the drawn sequences handed to random_bipartition_iter.
    repeats: number of independent repetitions; each prints its own report.

    For every repetition the scores of the fitted instances are grouped by
    whether their shape string equals "[[][][]]" (good) or not (bad), and the
    mean/std of the bad, good and combined scores are printed.
    Nothing is returned.
    '''
    # Number of sequences drawn per repetition; also used in the report line.
    dataset_size = 923

    for i in range(repeats):
        badscores = []
        goodscores = []
        graphs = get_sequences_with_names(size=dataset_size)
        # NOTE: for i == 0 the seed expression evaluates to 0.
        graphs, _unused = random_bipartition_iter(
            graphs, fraction, random_state=random.random() * i * 1000)

        estimator = Wrapper(nu=.27, cv=3, n_jobs=-1)
        sampler = rna.AbstractSampler(radius_list=[0, 1],
                                      thickness_list=[2],
                                      min_cip_count=1,
                                      min_interface_count=2,
                                      preprocessor=rna.PreProcessor(
                                          base_thickness_list=[1],
                                          ignore_inserts=True),
                                      postprocessor=rna.PostProcessor(),
                                      estimator=estimator)
        # Only the preprocessor and the estimator are used; sampler.fit() is
        # never called, so no grammar is built here.
        sampler.preprocessor.set_param(sampler.vectorizer)
        graphmanagers = sampler.preprocessor.fit_transform(graphs)
        sampler.estimatorobject.fit(graphmanagers,
                                    vectorizer=sampler.vectorizer,
                                    random_state=sampler.random_state)

        # test: score the very instances the estimator was fitted on
        for graphman in graphmanagers:
            struct = evaltools.dotbracket_to_shape(graphman.structure,
                                                   shapesversion=SHAPEVERSION)
            score = sampler.estimatorobject.score(graphman)
            if struct == "[[][][]]":
                goodscores.append(score)
            else:
                badscores.append(score)

        # Single-argument print(...) produces the same text under Python 2
        # and Python 3, unlike the multi-item print statement.
        print("afraction=%f , instances=%f, good=%d , bad=%d" % (
            fraction, fraction * dataset_size, len(goodscores),
            len(badscores)))

        # The separators reproduce "label", value, ' ', value printed with
        # single spaces between items: one space, then three spaces.
        report = (('bad:mean/std ', badscores),
                  ('cgood:mean/std ', goodscores),
                  ('dbad+good:mean/std ', goodscores + badscores))
        for label, scores in report:
            a = numpy.array(scores)
            print('%s %s   %s' % (label, numpy.mean(a, axis=0),
                                  numpy.std(a, axis=0)))
        print('')
def fit_sample(graphs, random_state=random.random()):
    '''
    Fit a sampler on the given graphs, log the grammar size, and sample.

    graphs: iterable of 2-tuples; it is copied into a list because it is
        walked twice (once for fitting, once for seeding).
    random_state: accepted for interface compatibility; this function never
        reads it.

    Returns one flat list with the items of every batch the sampler yielded
    (original author's note: this is a list [('',sequ),..]).
    '''
    training_set = list(graphs)

    one_class_estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=-1)
    sampler = rna.AbstractSampler(
        radius_list=[0, 1],
        thickness_list=[2],
        min_cip_count=1,
        min_interface_count=2,
        preprocessor=rna.PreProcessor(base_thickness_list=[1],
                                      ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=one_class_estimator)
    sampler.fit(training_set, grammar_n_jobs=4, grammar_batch_size=1)

    logger.info('graph grammar stats:')
    grammar_stats = sampler.grammar().size()
    dataset_size, interface_counts, core_counts, cip_counts = grammar_stats
    stats_line = (
        '#instances:%d   #interfaces: %d   #cores: %d   #core-interface-pairs: %d'
        % (dataset_size, interface_counts, core_counts, cip_counts))
    logger.info(stats_line)

    # Only the second field of each input pair seeds the sampler.
    seeds = [second for _first, second in training_set]
    sampling_options = dict(
        n_samples=3,
        batch_size=1,
        n_steps=50,
        n_jobs=4,
        quick_skip_orig_cip=True,
        probabilistic_core_choice=True,
        burnin=10,
        improving_threshold=0.9,
        improving_linear_start=0.3,
        max_size_diff=20,
        accept_min_similarity=0.65,
        select_cip_max_tries=30,
        keep_duplicates=False,
        include_seed=False,
        backtrack=10,
        monitor=False)
    sampled_batches = sampler.sample(seeds, **sampling_options)

    # Flatten the per-seed batches into a single list.
    return [item for batch in sampled_batches for item in batch]
# Example no. 4
0
def fit_sample(graphs, random_state=random.random()):
    '''
    Fit a sampler on the given graphs and sample new ones, with the tunable
    settings taken from the module-level `arguments` mapping.

    graphs: iterable of 2-tuples; it is copied into a list because it is
        walked twice (once for fitting, once for seeding).
    random_state: accepted for interface compatibility; this function never
        reads it.

    Keys read from `arguments`: 'mincipcount', 'mininterfacecount',
    'n_samples', 'n_steps', 'quick_skip', 'core_choice', 'burnin',
    'imp_thresh', 'imp_lin_start', 'maxsizediff', 'acc_min_sim'.

    Returns one flat list with the items of every batch the sampler yielded
    (original author's note: this is a list [('',sequ),..]).
    '''
    training_set = list(graphs)

    one_class_estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=NJOBS)

    # Grammar thresholds come from the module-level settings.
    min_cips = arguments['mincipcount']
    min_interfaces = arguments['mininterfacecount']
    sampler = rna.AbstractSampler(
        radius_list=[0, 1],
        thickness_list=[2],
        min_cip_count=min_cips,
        min_interface_count=min_interfaces,
        preprocessor=rna.PreProcessor(base_thickness_list=[1],
                                      ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=one_class_estimator)
    sampler.fit(training_set, grammar_n_jobs=NJOBS, grammar_batch_size=1)

    # Only the second field of each input pair seeds the sampler.
    seeds = [second for _first, second in training_set]
    sampling_options = dict(
        n_samples=arguments['n_samples'],
        batch_size=1,
        n_steps=arguments['n_steps'],
        n_jobs=NJOBS,
        quick_skip_orig_cip=arguments['quick_skip'],
        probabilistic_core_choice=arguments['core_choice'],
        burnin=arguments['burnin'],
        improving_threshold=arguments['imp_thresh'],
        improving_linear_start=arguments['imp_lin_start'],
        max_size_diff=arguments['maxsizediff'],
        accept_min_similarity=arguments['acc_min_sim'],
        select_cip_max_tries=30,
        keep_duplicates=False,
        include_seed=False,
        backtrack=2,
        monitor=False)
    sampled_batches = sampler.sample(seeds, **sampling_options)

    # Flatten the per-seed batches into a single list.
    return [item for batch in sampled_batches for item in batch]
# Example no. 5
0
    sequences = itertools.islice(fasta_to_sequence("../example/RF00005.fa"),
                                 size)
    return [b for (a, b) in sequences]


def get_sequences_with_names(size=9999):
    '''
    Return a lazy iterator over at most `size` items produced by
    fasta_to_sequence for the "../example/RF00005.fa" file.
    '''
    fasta_items = fasta_to_sequence("../example/RF00005.fa")
    return itertools.islice(fasta_items, size)


'''
learning a grammar
'''
import graphlearn.abstract_graphs.learned_RNA as learned
import graphlearn.abstract_graphs.RNA as rna
from graphlearn import feasibility
# Feasibility checker combining the library's default check with rna.is_rna.
feas = feasibility.FeasibilityChecker(
    checklist=[feasibility.default_check, rna.is_rna])
# At most 150 items from the example fasta file (lazy iterator).
graphs = get_sequences_with_names(150)
# Preprocessor from the learned_RNA module, configured with 3 k-means
# clusters and structure_mod disabled.
pp = learned.RnaPreProcessor(base_thickness_list=[2],
                             kmeans_clusters=3,
                             structure_mod=False)
# Sampler wired with the custom preprocessor and feasibility checker; no
# estimator or postprocessor is passed here.
sampler = rna.AbstractSampler(radius_list=[0, 1],
                              thickness_list=[1],
                              min_cip_count=2,
                              min_interface_count=2,
                              feasibility_checker=feas,
                              preprocessor=pp)
# Fit on the sequences using a single job and a batch size of 1.
sampler.fit(graphs, grammar_n_jobs=1, grammar_batch_size=1)