def fit_sample_noabstr(sequences, argz, random_state=random.random()):
    '''
    Fit a sampler on the given sequences and return newly sampled ones.

    sequences: iterable of 2-tuples (the original note describes them as
        (NAME, SEQUENCE) pairs).  All pairs are used for fitting; only the
        second element of each pair is handed to the sampling step.
    argz: mapping that must provide 'radius_list', 'thickness_list',
        'mincip_count' and 'min_interfacecount'.
    random_state: accepted but never read inside this function.

    Returns a list containing element [1] of every item the sampler
    produced, flattened over all per-seed results.
    '''
    training_pairs = list(sequences)

    # original author's note: with nu=.5 it also works for the fewer ones
    estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=1)
    sampler = rna.AbstractSampler(
        radius_list=argz['radius_list'],
        thickness_list=argz['thickness_list'],
        min_cip_count=argz['mincip_count'],
        min_interface_count=argz['min_interfacecount'],
        preprocessor=rnana.PreProcessor(base_thickness_list=[1],
                                        ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=estimator)
    sampler.fit(training_pairs, grammar_n_jobs=1, grammar_batch_size=1)

    # The grammar statistics are still queried (and unpacked into four
    # values) exactly as before, although nothing is reported here.
    n_instances, n_interfaces, n_cores, n_cips = sampler.grammar().size()

    # Sampling is seeded with the second element of each pair only.
    seeds = [seq for _name, seq in training_pairs]
    sample_options = {
        'n_samples': 5,
        'batch_size': 1,
        'n_steps': 55,
        'n_jobs': 1,
        'quick_skip_orig_cip': True,
        'probabilistic_core_choice': False,
        'burnin': 6,
        'improving_threshold': 0.5,
        'improving_linear_start': 0.15,
        'max_size_diff': 6,
        'accept_min_similarity': 0.55,
        'select_cip_max_tries': 30,
        'keep_duplicates': False,
        'include_seed': False,
        'backtrack': 2,
        'monitor': False,
    }
    sampled = sampler.sample(seeds, **sample_options)

    # Flatten the per-seed results first, then keep element [1] of each.
    flattened = [item for batch in sampled for item in batch]
    return [item[1] for item in flattened]
예제 #2
0
def oneclasstest_fraction(fraction=0.1, repeats=2, size=923):
    '''
    Fit the estimator on a random fraction of the data and print score stats.

    For each repetition a fresh set of sequences is fetched, a `fraction`
    of it is selected with random_bipartition_iter, the sampler's
    preprocessor and estimator are fitted on that selection, and every
    fitted instance is scored again.  Scores are grouped by whether the
    instance's shape string (evaltools.dotbracket_to_shape) equals
    "[[][][]]" ('good') or not ('bad'); mean and standard deviation of
    each group and of both together are printed.

    fraction: share of the fetched sequences passed on to fitting.
    repeats: number of independent repetitions.
    size: number of sequences requested per repetition; also used for the
        'instances' figure in the report.  Defaults to 923, the value that
        was previously hard-coded in both places.

    Returns None; all results go to standard output.
    '''
    for i in range(repeats):
        graphs = get_sequences_with_names(size=size)
        # NOTE: because of the factor i the seed is 0 in the first repetition.
        graphs, _ = random_bipartition_iter(
            graphs, fraction, random_state=random.random() * i * 1000)

        estimator = Wrapper(nu=.27, cv=3, n_jobs=-1)
        sampler = rna.AbstractSampler(radius_list=[0, 1],
                                      thickness_list=[2],
                                      min_cip_count=1,
                                      min_interface_count=2,
                                      preprocessor=rna.PreProcessor(
                                          base_thickness_list=[1],
                                          ignore_inserts=True),
                                      postprocessor=rna.PostProcessor(),
                                      estimator=estimator)
        # Only the preprocessor and the estimator are fitted here;
        # sampler.fit itself is not called.
        sampler.preprocessor.set_param(sampler.vectorizer)
        graphmanagers = sampler.preprocessor.fit_transform(graphs)
        sampler.estimatorobject.fit(graphmanagers,
                                    vectorizer=sampler.vectorizer,
                                    random_state=sampler.random_state)

        # Score the training instances again and split them by shape.
        goodscores = []
        badscores = []
        for graphman in graphmanagers:
            struct = evaltools.dotbracket_to_shape(graphman.structure,
                                                   shapesversion=SHAPEVERSION)
            score = sampler.estimatorobject.score(graphman)
            if struct == "[[][][]]":
                goodscores.append(score)
            else:
                badscores.append(score)

        # Every print below receives one pre-formatted string, so the
        # output is the same whether print is a statement or a function.
        print("afraction=%f , instances=%f, good=%d , bad=%d" % (
            fraction, fraction * size, len(goodscores), len(badscores)))
        report = (('bad:mean/std ', badscores),
                  ('cgood:mean/std ', goodscores),
                  ('dbad+good:mean/std ', goodscores + badscores))
        for label, scores in report:
            a = numpy.array(scores)
            # spacing mirrors the former comma-separated print items
            print('%s %s   %s' % (label, numpy.mean(a, axis=0),
                                  numpy.std(a, axis=0)))
        print('')
def fit_sample(graphs, random_state=random.random()):
    '''
    Train a sampler on `graphs`, log grammar statistics, return new samples.

    graphs: iterable of 2-tuples; all of them are used for fitting, and the
        second element of each tuple is handed to the sampling step.
    random_state: accepted but not read anywhere in this function.

    Returns one flat list with the items of every per-seed result; the
    original author notes it looks like [('', sequ), ...].
    '''
    training_pairs = list(graphs)

    estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=-1)
    sampler = rna.AbstractSampler(
        radius_list=[0, 1],
        thickness_list=[2],
        min_cip_count=1,
        min_interface_count=2,
        preprocessor=rna.PreProcessor(base_thickness_list=[1],
                                      ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=estimator)
    sampler.fit(training_pairs, grammar_n_jobs=4, grammar_batch_size=1)

    # Report the size of the learned grammar.
    logger.info('graph grammar stats:')
    n_instances, n_interfaces, n_cores, n_cips = sampler.grammar().size()
    stats_line = (
        '#instances:%d   #interfaces: %d   #cores: %d   #core-interface-pairs: %d'
        % (n_instances, n_interfaces, n_cores, n_cips))
    logger.info(stats_line)

    # Sampling is seeded with the second element of each pair only.
    seeds = [graph for _name, graph in training_pairs]
    sample_options = {
        'n_samples': 3,
        'batch_size': 1,
        'n_steps': 50,
        'n_jobs': 4,
        'quick_skip_orig_cip': True,
        'probabilistic_core_choice': True,
        'burnin': 10,
        'improving_threshold': 0.9,
        'improving_linear_start': 0.3,
        'max_size_diff': 20,
        'accept_min_similarity': 0.65,
        'select_cip_max_tries': 30,
        'keep_duplicates': False,
        'include_seed': False,
        'backtrack': 10,
        'monitor': False,
    }
    sampled = sampler.sample(seeds, **sample_options)

    # Concatenate the per-seed result lists into a single list.
    return [item for batch in sampled for item in batch]
예제 #4
0
def fit_sample(graphs, random_state=random.random()):
    '''
    Train a sampler on `graphs` and return newly sampled items.

    The sampler settings come from the module-level `arguments` mapping
    (keys read here: 'mincipcount', 'mininterfacecount', 'n_samples',
    'n_steps', 'quick_skip', 'core_choice', 'burnin', 'imp_thresh',
    'imp_lin_start', 'maxsizediff', 'acc_min_sim') and the module-level
    NJOBS value.  Both are only read, never rebound.

    graphs: iterable of 2-tuples; all of them are used for fitting, and the
        second element of each tuple is handed to the sampling step.
    random_state: accepted but not read anywhere in this function.

    Returns one flat list with the items of every per-seed result; the
    original author notes it looks like [('', sequ), ...].
    '''
    training_pairs = list(graphs)

    estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=NJOBS)
    sampler = rna.AbstractSampler(
        radius_list=[0, 1],
        thickness_list=[2],
        min_cip_count=arguments['mincipcount'],
        min_interface_count=arguments['mininterfacecount'],
        preprocessor=rna.PreProcessor(base_thickness_list=[1],
                                      ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=estimator)
    sampler.fit(training_pairs, grammar_n_jobs=NJOBS, grammar_batch_size=1)

    # Sampling is seeded with the second element of each pair only.
    seeds = [graph for _name, graph in training_pairs]
    sampled = sampler.sample(
        seeds,
        n_samples=arguments['n_samples'],
        batch_size=1,
        n_steps=arguments['n_steps'],
        n_jobs=NJOBS,
        quick_skip_orig_cip=arguments['quick_skip'],
        probabilistic_core_choice=arguments['core_choice'],
        burnin=arguments['burnin'],
        improving_threshold=arguments['imp_thresh'],
        improving_linear_start=arguments['imp_lin_start'],
        max_size_diff=arguments['maxsizediff'],
        accept_min_similarity=arguments['acc_min_sim'],
        select_cip_max_tries=30,
        keep_duplicates=False,
        include_seed=False,
        backtrack=2,
        monitor=False)

    # Concatenate the per-seed result lists into a single list.
    return [item for batch in sampled for item in batch]
예제 #5
0
def fit_sample_infernal(seques, dummy):
    """
    Produce new sequences by running a chain of external command-line tools.

    Steps, all working on fixed file names in the current directory:
      1. write the second element of every input pair to tmp.fa
      2. run muscle on tmp.fa, producing museld.fa
      3. pipe museld.fa through RNAalifold and pass its output to getstr
      4. call to_stockholm with museld.fa and that string, producing sto.sto
      5. run cmbuild on sto.sto, producing mod3l
      6. run cmemit on mod3l, asking for twice as many sequences as were
         given, producing out.fa

    seques: iterable of 2-tuples; only the second element is used.
    dummy: ignored; presumably present so the signature matches the other
        fit/sample callables (not verifiable from here).

    Returns whatever fasta_to_list yields for out.fa.
    """
    sequences = [seq for _name, seq in seques]
    rna.write_fasta(sequences, "tmp.fa")

    shell_exec('muscle -in tmp.fa -out museld.fa')

    # shell_exec is unpacked into three values; only the last one is used.
    _first, _second, alifold_output = shell_exec(
        'cat museld.fa | RNAalifold -f F --noPS')
    structure = getstr(alifold_output)
    to_stockholm('museld.fa', structure, 'sto.sto')

    shell_exec("cmbuild -F mod3l sto.sto")
    emit_command = "cmemit -N %d --exp 3.92  mod3l > out.fa" % (
        len(sequences) * 2)
    shell_exec(emit_command)
    return fasta_to_list('out.fa')
예제 #6
0
def eval(repeats, size):
    """
    Run fit_sample `repeats` times and report mean/std of the check results.

    In repetition i, sequences are fetched with
    get_sequences_with_names(size=size, rand=(i + 3) * 10), fit_sample is
    applied, and the second element of each returned pair is passed to
    rna.infernal_checker together with the model file '../<RFAM>.cm'.
    The results of all repetitions are pooled before the statistics are
    computed.

    Prints one summary line and returns the tuple (mean, std).
    """
    pooled = []
    for run in range(repeats):
        named = get_sequences_with_names(size=size, rand=(run + 3) * 10)
        sampled_pairs = fit_sample(named)
        sampled_seqs = [seq for _name, seq in sampled_pairs]
        model_file = '../%s.cm' % RFAM
        pooled.extend(
            rna.infernal_checker(sampled_seqs,
                                 cmfile=model_file,
                                 cmsearchbinarypath='../toolsdata/cmsearch'))

    values = numpy.array(pooled)
    mean = numpy.mean(values, axis=0)
    std = numpy.std(values, axis=0)

    # single pre-formatted argument: same output as the print statement
    print('size:%d mean:%f std:%f' % (size, mean, std))
    return mean, std
예제 #7
0
def eval(repeats, size):
    """
    Pool infernal check results over several fit/sample runs and summarise.

    Each of the `repeats` runs fetches sequences via
    get_sequences_with_names (rand is (i + 3) * 10 for run i), feeds them
    to fit_sample, keeps the second element of every returned pair and
    checks those with rna.infernal_checker against '../<RFAM>.cm'.

    Prints 'size:.. mean:.. std:..' and returns (mean, std) computed over
    the pooled results of all runs.
    """
    collected = []
    for i in range(repeats):
        seed_value = (i + 3) * 10
        source_graphs = get_sequences_with_names(size=size, rand=seed_value)
        generated = [b for _a, b in fit_sample(source_graphs)]
        cm_file = '../%s.cm' % RFAM
        checked = rna.infernal_checker(
            generated,
            cmfile=cm_file,
            cmsearchbinarypath='../toolsdata/cmsearch')
        collected.extend(checked)

    as_array = numpy.array(collected)
    mean, std = numpy.mean(as_array, axis=0), numpy.std(as_array, axis=0)

    # one formatted string keeps the output identical to the statement form
    summary = 'size:%d mean:%f std:%f' % (size, mean, std)
    print(summary)
    return mean, std
예제 #8
0
def evaluate(repeats, size, fitsample, RFAM, inputdict, debug):
    """
    Repeat fit/sample/check and return the median mean and median std.

    Each repetition reads sequences with
    utils.get_seq_tups(RFAM + '.fa', size, 1), calls
    fitsample(sequences, inputdict), keeps the second element of every
    returned pair and scores those with rna.infernal_checker against
    '../toolsdata/<RFAM>.cm'.  Mean and standard deviation of each
    repetition's result are collected.

    repeats: number of repetitions; must be at least 1, otherwise the
        final indexing raises IndexError.
    size: passed through to utils.get_seq_tups.
    fitsample: callable taking (sequences, inputdict) and returning an
        iterable of 2-tuples.
    RFAM: name used to build the input and model file names.
    inputdict: passed through to fitsample unchanged.
    debug: when true, prints 'start rep' at the start of each repetition.

    Returns (mean, std), each multiplied by 100.  The two lists are sorted
    independently, so the returned values are the element at index
    repeats // 2 of each sorted list and need not come from the same
    repetition.
    """
    means = []
    stds = []
    for i in range(repeats):
        if debug:
            print('start rep')
        # second return value of get_seq_tups is not needed here
        sequences, _ = utils.get_seq_tups(RFAM + '.fa', size, 1)
        sampled = fitsample(sequences, inputdict)
        sampled = [b for a, b in sampled]
        result = rna.infernal_checker(sampled, cmfile='../toolsdata/%s.cm' % RFAM,
                                      cmsearchbinarypath='../toolsdata/cmsearch')
        a = np.array(result)
        means.append(np.mean(a, axis=0))
        stds.append(np.std(a, axis=0))
    means.sort()
    stds.sort()
    # Floor division keeps the index an int even under true division
    # (Python 3 or `from __future__ import division`), where `/` would
    # produce a float and fail as a list index.
    middle = repeats // 2
    return means[middle] * 100, stds[middle] * 100
def evaluate(repeats, size, fitsample):
    print 'eval:',
    means = []
    stds = []
    for i in range(repeats):
        sequences = get_sequences_with_names(size=size, rand=10)
        zz = fitsample(sequences)
        # print zz[:3]
        # z=[b for a ,b in zz]
        result = rna.infernal_checker(
            zz,
            cmfile='../toolsdata/%s.cm' % RFAM,
            cmsearchbinarypath='../toolsdata/cmsearch')

        a = np.array(result)
        means.append(np.mean(a, axis=0))
        stds.append(np.std(a, axis=0))

    means.sort()
    stds.sort()
    print(size, means, stds)
    return [means[repeats / 2] * 100, stds[repeats / 2] * 100]
예제 #10
0
    sequences = itertools.islice(fasta_to_sequence("../example/RF00005.fa"),
                                 size)
    return [b for (a, b) in sequences]


def get_sequences_with_names(size=9999):
    """
    Return an iterator over at most `size` items of the example FASTA file.

    The items are whatever fasta_to_sequence yields for
    "../example/RF00005.fa"; the result is an itertools.islice object, so
    it can be consumed only once.
    """
    return itertools.islice(fasta_to_sequence("../example/RF00005.fa"), size)


'''
learning a grammar
'''
import graphlearn.abstract_graphs.learned_RNA as learned
import graphlearn.abstract_graphs.RNA as rna
from graphlearn import feasibility
# Feasibility checker configured with two checks: graphlearn's
# default_check and rna.is_rna.
feas = feasibility.FeasibilityChecker(
    checklist=[feasibility.default_check, rna.is_rna])
# Request at most 150 named sequences as training input.
graphs = get_sequences_with_names(150)
# Preprocessor taken from the learned_RNA module instead of rna.PreProcessor.
# NOTE(review): the meaning of kmeans_clusters / structure_mod is defined
# in learned.RnaPreProcessor and cannot be confirmed from here.
pp = learned.RnaPreProcessor(base_thickness_list=[2],
                             kmeans_clusters=3,
                             structure_mod=False)
# Sampler wired up with the feasibility checker and the learned
# preprocessor; no estimator or postprocessor is passed explicitly.
sampler = rna.AbstractSampler(radius_list=[0, 1],
                              thickness_list=[1],
                              min_cip_count=2,
                              min_interface_count=2,
                              feasibility_checker=feas,
                              preprocessor=pp)
# Learn the grammar from the training input, one job and batch size 1.
sampler.fit(graphs, grammar_n_jobs=1, grammar_batch_size=1)