Exemplo n.º 1
0
def get_sequences_with_names(size=9999, rand=0):
    """Return a lazy iterator over at most `size` records of the RFAM FASTA file.

    The records are read with fasta_to_sequence from
    "../toolsdata/<RFAM>.fa", where RFAM is a module-level name.

    Args:
        size: maximum number of records taken from the stream.
        rand: when greater than 0, the records are first handed to
            random_bipartition_iter (second argument .9, random state
            ``random.random() * rand``) and only the first returned
            partition is kept. Otherwise the records are used as read.

    Returns:
        An itertools.islice object over the selected records.
    """
    use_random_subset = rand > 0
    records = fasta_to_sequence("../toolsdata/%s.fa" % RFAM)
    if use_random_subset:
        # The second partition is not needed here.
        records, _remainder = random_bipartition_iter(
            records, .9, random_state=random.random() * rand)
    return itertools.islice(records, size)
Exemplo n.º 2
0
def batch_performance_evaluation(params,
                                 synthesizer=None,
                                 iter_train=None,
                                 iter_test=None,
                                 relative_size=None):
    """Run performance_evaluation repeatedly and collect its results.

    For every repetition the train and test iterables are duplicated with
    tee, each duplicate is passed through random_bipartition_iter, and the
    first partition of each is handed to performance_evaluation.

    Args:
        params: mapping; params['n_experiment_repetitions'] is the number of
            repetitions. The mapping is also forwarded unchanged to
            performance_evaluation.
        synthesizer: forwarded unchanged to performance_evaluation.
        iter_train: iterable of training instances.
        iter_test: iterable of test instances.
        relative_size: forwarded to random_bipartition_iter.

    Returns:
        A 5-tuple: four lists with one entry per repetition (the values this
        function names roc_t, apr_t, roc_s and apr_s, in that order),
        followed by the elapsed time measured with time.time().
    """
    n_runs = params['n_experiment_repetitions']
    started_at = time.time()

    roc_t_runs, apr_t_runs, roc_s_runs, apr_s_runs = [], [], [], []

    for run_index in range(n_runs):
        logger.info('-' * 80)
        logger.info('run %d/%d' % (run_index + 1, n_runs))

        # Keep one copy of each iterable for the following runs and consume
        # the other copy in this run.
        iter_train, train_copy = tee(iter_train)
        iter_test, test_copy = tee(iter_test)

        # Only the first partition of each split is used.
        train_part, _ = random_bipartition_iter(train_copy,
                                                relative_size=relative_size)
        test_part, _ = random_bipartition_iter(test_copy,
                                               relative_size=relative_size)

        roc_t, apr_t, roc_s, apr_s = performance_evaluation(
            params,
            synthesizer=synthesizer,
            iter_train=train_part,
            iter_test=test_part)

        roc_t_runs.append(roc_t)
        apr_t_runs.append(apr_t)
        roc_s_runs.append(roc_s)
        apr_s_runs.append(apr_s)

    elapsed = time.time() - started_at
    return roc_t_runs, apr_t_runs, roc_s_runs, apr_s_runs, elapsed
Exemplo n.º 3
0
def split_to_train_and_test(rfam_id=None, train_to_test_split_ratio=None, number_of_samples=None):
    """Read the sequences of an RFAM family and split them in two partitions.

    Args:
        rfam_id: passed to rfam_url to locate the FASTA data.
        train_to_test_split_ratio: forwarded to random_bipartition_iter as
            relative_size.
        number_of_samples: when truthy, only this many leading sequences
            are kept before splitting (and the number is logged).

    Returns:
        The two partitions returned by random_bipartition_iter, as a
        (train, test) tuple.
    """
    source = fasta_to_sequence(rfam_url(rfam_id))

    if not number_of_samples:
        selected = source
    else:
        selected = islice(source, number_of_samples)
        logger.info('Experiment cunducted with %d sample sequences' %
                    number_of_samples)

    train_part, test_part = random_bipartition_iter(
        selected, relative_size=train_to_test_split_ratio)
    return train_part, test_part
Exemplo n.º 4
0
def get_sequences_with_names(size=9999, rand=0):
    """Return a lazy iterator over at most `size` records of the RFAM FASTA file.

    The records come from fasta_to_sequence applied to
    "../toolsdata/<RFAM>.fa" (RFAM is a module-level name).

    Args:
        size: upper bound on the number of records yielded.
        rand: if greater than 0, the stream is first split with
            random_bipartition_iter (second argument .9, random state
            ``random.random() * rand``) and only the first partition is
            kept; otherwise the stream is used unchanged.

    Returns:
        An itertools.islice object limited to `size` records.
    """
    randomize = rand > 0
    stream = fasta_to_sequence("../toolsdata/%s.fa" % RFAM)
    if randomize:
        partitions = random_bipartition_iter(
            stream, .9, random_state=random.random() * rand)
        # Two partitions are expected; the second one is discarded.
        stream, _discarded = partitions
    return itertools.islice(stream, size)
Exemplo n.º 5
0
def oneclasstest_fraction(fraction=0.1, repeats=2):
    """Fit the sampler's estimator on a fraction of the data and print score statistics.

    For each repetition: up to 923 records are taken from
    get_sequences_with_names, a partition of them is selected with
    random_bipartition_iter, the sampler's preprocessor and estimator are
    fitted on that partition, and every fitted item is scored. Scores are
    grouped by whether the item's shape string equals "[[][][]]" ("good")
    or not ("bad"), and mean / standard deviation of each group and of
    both groups together are printed.

    Args:
        fraction: passed to random_bipartition_iter as the second argument.
        repeats: number of repetitions.

    Returns:
        None; results are only printed.
    """
    def report(label, scores):
        # One line with the same text the Python 2 statement
        # ``print label, mean, ' ', std`` produces.
        values = numpy.array(scores)
        fields = (label, numpy.mean(values, axis=0), ' ',
                  numpy.std(values, axis=0))
        print(' '.join(str(field) for field in fields))

    for repeat_index in range(repeats):
        bad, good = [], []

        records = get_sequences_with_names(size=923)
        seed = random.random() * repeat_index * 1000
        records, _unused = random_bipartition_iter(
            records, fraction, random_state=seed)

        estimator = Wrapper(nu=.27, cv=3, n_jobs=-1)
        preprocessor = rna.PreProcessor(base_thickness_list=[1],
                                        ignore_inserts=True)
        sampler = rna.AbstractSampler(radius_list=[0, 1],
                                      thickness_list=[2],
                                      min_cip_count=1,
                                      min_interface_count=2,
                                      preprocessor=preprocessor,
                                      postprocessor=rna.PostProcessor(),
                                      estimator=estimator)
        sampler.preprocessor.set_param(sampler.vectorizer)
        managers = sampler.preprocessor.fit_transform(records)
        sampler.estimatorobject.fit(managers,
                                    vectorizer=sampler.vectorizer,
                                    random_state=sampler.random_state)

        # Score the same items the estimator was fitted on.
        for manager in managers:
            shape = evaltools.dotbracket_to_shape(manager.structure,
                                                  shapesversion=SHAPEVERSION)
            score = sampler.estimatorobject.score(manager)
            bucket = good if shape == "[[][][]]" else bad
            bucket.append(score)

        print("afraction=%f , instances=%f, good=%d , bad=%d" % (
            fraction, fraction * 923, len(good), len(bad)))
        report('bad:mean/std ', bad)
        report('cgood:mean/std ', good)
        report('dbad+good:mean/std ', good + bad)
        print('')
Exemplo n.º 6
0
def batch_performance_evaluation(params, synthesizer=None, iter_train=None, iter_test=None, relative_size=None):
    """Repeat performance_evaluation and gather the per-run results.

    Each run works on fresh tee copies of the train and test iterables; the
    copies are passed through random_bipartition_iter and the first
    partition of each is given to performance_evaluation.

    Args:
        params: mapping holding 'n_experiment_repetitions' (number of runs);
            also forwarded unchanged to performance_evaluation.
        synthesizer: forwarded unchanged to performance_evaluation.
        iter_train: iterable of training instances.
        iter_test: iterable of test instances.
        relative_size: forwarded to random_bipartition_iter.

    Returns:
        (e_roc_t, e_apr_t, e_roc_s, e_apr_s, elapsed_time): four lists with
        one value per run, in the order performance_evaluation returns
        them, plus the elapsed time measured with time.time().
    """
    total_runs = params['n_experiment_repetitions']
    t_start = time.time()

    # One result list per value returned by performance_evaluation.
    collected = ([], [], [], [])

    for run_no in range(total_runs):
        logger.info('-' * 80)
        logger.info('run %d/%d' % (run_no + 1, total_runs))

        # One tee copy is kept for later runs, the other is used up here.
        iter_train, train_once = tee(iter_train)
        iter_test, test_once = tee(iter_test)

        train_once, _ = random_bipartition_iter(
            train_once, relative_size=relative_size)
        test_once, _ = random_bipartition_iter(
            test_once, relative_size=relative_size)

        roc_t, apr_t, roc_s, apr_s = performance_evaluation(
            params, synthesizer=synthesizer, iter_train=train_once, iter_test=test_once)

        for bucket, value in zip(collected, (roc_t, apr_t, roc_s, apr_s)):
            bucket.append(value)

    elapsed_time = time.time() - t_start
    e_roc_t, e_apr_t, e_roc_s, e_apr_s = collected
    return e_roc_t, e_apr_t, e_roc_s, e_apr_s, elapsed_time
Exemplo n.º 7
0
def oneclasstest_fraction(fraction=0.1, repeats=2):
    """Fit the sampler's estimator on a fraction of the data and print score statistics.

    Every repetition takes up to 923 records from get_sequences_with_names,
    keeps the first partition returned by random_bipartition_iter, fits the
    sampler's preprocessor and estimator on it and scores each fitted item.
    Items whose shape string equals "[[][][]]" count as "good", all others
    as "bad"; mean and standard deviation are printed for the bad scores,
    the good scores and both together.

    Args:
        fraction: passed to random_bipartition_iter as the second argument.
        repeats: number of repetitions.

    Returns:
        None; results are only printed.
    """
    for round_no in range(repeats):
        badscores, goodscores = [], []

        pool = get_sequences_with_names(size=923)
        chosen, _rest = random_bipartition_iter(pool, fraction, random_state=random.random() * round_no * 1000)

        estimator = Wrapper(nu=0.27, cv=3, n_jobs=-1)
        sampler_options = dict(
            radius_list=[0, 1],
            thickness_list=[2],
            min_cip_count=1,
            min_interface_count=2,
            preprocessor=rna.PreProcessor(base_thickness_list=[1], ignore_inserts=True),
            postprocessor=rna.PostProcessor(),
            estimator=estimator,
        )
        sampler = rna.AbstractSampler(**sampler_options)
        sampler.preprocessor.set_param(sampler.vectorizer)
        fitted_items = sampler.preprocessor.fit_transform(chosen)
        sampler.estimatorobject.fit(fitted_items, vectorizer=sampler.vectorizer, random_state=sampler.random_state)

        # Score the items the estimator has just been fitted on.
        for item in fitted_items:
            shape = evaltools.dotbracket_to_shape(item.structure, shapesversion=SHAPEVERSION)
            score = sampler.estimatorobject.score(item)
            if shape != "[[][][]]":
                badscores.append(score)
            else:
                goodscores.append(score)

        print("afraction=%f , instances=%f, good=%d , bad=%d" % (
            fraction,
            fraction * 923,
            len(goodscores),
            len(badscores),
        ))

        summaries = (
            ("bad:mean/std ", badscores),
            ("cgood:mean/std ", goodscores),
            ("dbad+good:mean/std ", goodscores + badscores),
        )
        for label, scores in summaries:
            a = numpy.array(scores)
            # Space-joined fields: the same text the Python 2 statement
            # ``print label, mean, " ", std`` writes.
            parts = [label, numpy.mean(a, axis=0), " ", numpy.std(a, axis=0)]
            print(" ".join(str(part) for part in parts))
        print("")
Exemplo n.º 8
0
def evaluate(pos_fname,
             neg_fname=None,
             size=None,
             percentages=None,
             n_repetitions=None,
             train_test_split=None):
    """Evaluate predictive performance with original and sampled graphs.

    The positive and negative graphs are split once into global train and
    test partitions. Then, for every training percentage and repetition, a
    subset of each training partition is selected, fit_sample is run
    independently on the positive and the negative subset, and
    fit_and_evaluate is called with the subsets, the samples and the
    held-out test partitions.

    Args:
        pos_fname: passed to get_graphs to load the positive graphs.
        neg_fname: passed to get_graphs to load the negative graphs. When
            None, negatives come from get_graphs_permuted(pos_fname).
        size: forwarded as size= to the graph loaders.
        percentages: iterable of values; each is passed to
            random_bipartition_iter to select the training subset.
        n_repetitions: number of repetitions per percentage.
        train_test_split: passed to random_bipartition_iter for the global
            train/test split.

    Returns:
        (original_repetitions, original_sample_repetitions,
        sample_repetitions): three lists with one inner list per percentage,
        each inner list holding one fit_and_evaluate result per repetition.

    Side effects:
        Reseeds the global `random` module at every repetition and logs
        progress through `logger`.
    """
    from eden.util import random_bipartition_iter

    # Load both classes.
    graphs_pos = get_graphs(pos_fname, size=size)
    if neg_fname is None:
        graphs_neg = get_graphs_permuted(pos_fname, size=size)
    else:
        graphs_neg = get_graphs(neg_fname, size=size)

    # Global train/test split.
    pos_train_global, pos_test_global = random_bipartition_iter(
        graphs_pos, train_test_split, random_state=random.random() * 1000)
    neg_train_global, neg_test_global = random_bipartition_iter(
        graphs_neg, train_test_split, random_state=random.random() * 1000)

    original_repetitions = []
    original_sample_repetitions = []
    sample_repetitions = []

    for percentage in percentages:
        originals = []
        originals_samples = []
        samples = []
        for repetition in range(n_repetitions):
            # Seed derived from percentage and repetition index.
            random_state = int(313379 * percentage + repetition)
            random.seed(random_state)

            # tee keeps the global partitions reusable across repetitions.
            pos_train_global, pos_train_global_ = tee(pos_train_global)
            neg_train_global, neg_train_global_ = tee(neg_train_global)
            pos_test_global, pos_test_global_ = tee(pos_test_global)
            neg_test_global, neg_test_global_ = tee(neg_test_global)

            # Training subset of each class; the remainder is not used.
            pos, _ = random_bipartition_iter(pos_train_global_, percentage)
            pos, pos_ = tee(pos)
            neg, _ = random_bipartition_iter(neg_train_global_, percentage)
            neg, neg_ = tee(neg)

            # Sample independently from the 2 classes.
            logger.info('Positive')
            sampled_pos = fit_sample(pos_, random_state=random_state)
            logger.info('Negative')
            sampled_neg = fit_sample(neg_, random_state=random_state)

            # Evaluate the predictive performance on the held-out test set.
            start = time()
            logger.info("=" * 80)
            logger.info('repetition: %d/%d' % (repetition + 1, n_repetitions))
            logger.info("training percentage:" + str(percentage))
            perf_orig, perf_samp, perf_orig_samp = fit_and_evaluate(
                pos, neg, sampled_pos, sampled_neg, pos_test_global_,
                neg_test_global_)
            logger.info('Time elapsed for full repetition: %.1f sec' %
                        ((time() - start)))
            originals.append(perf_orig)
            originals_samples.append(perf_orig_samp)
            samples.append(perf_samp)

        original_repetitions.append(originals)
        original_sample_repetitions.append(originals_samples)
        sample_repetitions.append(samples)

    return original_repetitions, original_sample_repetitions, sample_repetitions
Exemplo n.º 9
0
def train_dbox_model(fasta_fname=None, model_fname='eden_model_Dbox', window=4, neg_size_factor=5,
                     train_test_split=0.7, n_jobs=4, n_iter=40):
    """Train an EDeN binary classifier on D-box windows and print its test results.

    Positive instances are windows cut around the D-box position of each
    sequence in `fasta_fname`; negative instances are derived from the
    positives with shuffle_modifier. Both classes are split with
    random_bipartition_iter, the training parts are passed to
    model.optimize and the test parts to model.estimate.

    The last "_"-separated field of every FASTA header is parsed as the
    D-box position (presumably 1-based, given the ``dpos - 1`` slicing;
    confirm against the data).

    Args:
        fasta_fname: passed to fasta_to_sequence to read the sequences.
        model_fname: passed to model.optimize as model_name.
        window: number of flanking nucleotides kept on each side of the box.
        neg_size_factor: passed to seq_to_seq as times=.
        train_test_split: relative_size used for both train/test splits.
        n_jobs: passed to the model and offered as the estimator's n_jobs.
        n_iter: number of optimization iterations; also the number of
            random draws generated for the sampled hyperparameters.

    Returns:
        None. Partition sizes, model parameters, every item of the estimate
        result and the summary lines (with RSS lines appended) are printed.
    """
    from itertools import tee

    from numpy.random import randint
    from numpy.random import uniform
    from sklearn.linear_model import SGDClassifier

    from eden.converter.fasta import fasta_to_sequence
    from eden.graph import Vectorizer
    from eden.model import ActiveLearningBinaryClassificationModel
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    from eden.util import random_bipartition_iter

    def pre_process_graph(iterator):
        """Transform sequences into linear graphs."""
        # Import kept inside the function, as in the original code.
        # NOTE(review): presumably so the pre-processor stays self-contained
        # when the model is stored - confirm.
        from eden.converter.fasta import sequence_to_eden
        return sequence_to_eden(iterator)

    def extract_box(data, window=3):
        """Yield (header, box) for every sequence whose D-box window fits.

        The box is the left flank, 'x', the 4 nucleotides starting at the
        D-box position, 'y' and the right flank. Sequences too short for the
        full window are skipped.
        """
        # The former box_type='C' branch computed a box but never yielded
        # it; it was never requested here and has been removed.
        for seq in fasta_to_sequence(data):
            dpos = int(seq[0].split('_')[-1])
            nts = seq[1].replace('\n', '')
            box_start = dpos - 1
            box_end = dpos + 3
            if box_start - window >= 0 and len(nts) >= box_end + window:
                box = (nts[box_start - window:box_start] + 'x' +
                       nts[box_start:box_end] + 'y' +
                       nts[box_end:box_end + window])
                yield seq[0], box

    vectorizer = Vectorizer()
    # NOTE(review): class_weight='auto' and the 'n_iter' hyperparameter below
    # are only accepted by older scikit-learn releases - confirm the version.
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    # Positive samples; the tee copy feeds the negative generator.
    seqs_d_pos, seqs_d_pos_ = tee(extract_box(fasta_fname, window))
    seqs_d_neg = seq_to_seq(seqs_d_pos_, modifier=shuffle_modifier,
                            times=neg_size_factor, order=2)

    # Train/test split of both classes.
    pos_train, pos_test = random_bipartition_iter(
        seqs_d_pos, relative_size=train_test_split)
    neg_train, neg_test = random_bipartition_iter(
        seqs_d_neg, relative_size=train_test_split)

    iterable_pos_train = list(pos_train)
    iterable_pos_test = list(pos_test)
    iterable_neg_train = list(neg_train)
    iterable_neg_test = list(neg_test)

    # Single-argument print() gives the same output on Python 2 and 3.
    print("training pos  %d" % len(iterable_pos_train))
    print("training neg  %d" % len(iterable_neg_train))
    print("test pos  %d" % len(iterable_pos_test))
    print("test neg  %d" % len(iterable_neg_test))

    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)

    # Hyperparameter search space.
    vectorizer_parameters = {'complexity': [2, 3]}

    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['log'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, 0)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ["invscaling", "constant", "optimal"],
                            'n_jobs': [n_jobs]}

    # Optimize hyperparameters and fit the model.
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   max_total_time=60 * 30, n_iter=n_iter,
                   cv=10,
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # Estimate predictive performance.
    print(model.get_parameters())

    result, text = model.estimate(iterable_pos_test, iterable_neg_test)

    rss = 0.0
    n_scored = 0
    for prob in result:
        n_scored += 1
        print(prob)
        # prob[1] selects which entry of prob[0] is compared against 1
        # (presumably prob[0] holds class probabilities and prob[1] the
        # true label - confirm against model.estimate).
        expected = prob[0][1] if prob[1] == 1 else prob[0][0]
        rss += (1 - expected) ** 2

    # No scored instance means the average is undefined; report nan instead
    # of failing with ZeroDivisionError.
    avg_rss = rss / n_scored if n_scored else float('nan')
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)

    for t in text:
        print(t)
def evaluate(pos_fname, neg_fname=None, size=None, percentages=None, n_repetitions=None, train_test_split=None):
    """Evaluate predictive performance with original and sampled graphs.

    Positive and negative graphs are split once into global train and test
    partitions. For each training percentage and repetition a subset of
    every training partition is selected, fit_sample is run separately on
    the positive and negative subset, and fit_and_evaluate receives the
    subsets, the samples and the held-out test partitions.

    Args:
        pos_fname: passed to get_graphs to load the positive graphs.
        neg_fname: passed to get_graphs to load the negative graphs. When
            None, get_graphs_permuted(pos_fname) supplies the negatives.
        size: forwarded as size= to the graph loaders.
        percentages: iterable of values; each is passed to
            random_bipartition_iter to select the training subset.
        n_repetitions: number of repetitions per percentage.
        train_test_split: passed to random_bipartition_iter for the global
            train/test split.

    Returns:
        (original_repetitions, original_sample_repetitions,
        sample_repetitions): three lists holding one inner list per
        percentage with one fit_and_evaluate result per repetition.

    Side effects:
        Reseeds the global `random` module at every repetition and logs
        progress through `logger`.
    """
    from eden.util import random_bipartition_iter

    # Load both classes.
    graphs_pos = get_graphs(pos_fname, size=size)
    if neg_fname is None:
        graphs_neg = get_graphs_permuted(pos_fname, size=size)
    else:
        graphs_neg = get_graphs(neg_fname, size=size)

    # Global train/test split.
    pos_train_global, pos_test_global = random_bipartition_iter(
        graphs_pos, train_test_split, random_state=random.random() * 1000)
    neg_train_global, neg_test_global = random_bipartition_iter(
        graphs_neg, train_test_split, random_state=random.random() * 1000)

    original_repetitions = []
    original_sample_repetitions = []
    sample_repetitions = []

    for percentage in percentages:
        originals = []
        originals_samples = []
        samples = []
        for repetition in range(n_repetitions):
            # Seed derived from percentage and repetition index.
            random_state = int(313379 * percentage + repetition)
            random.seed(random_state)

            # tee keeps the global partitions reusable across repetitions.
            pos_train_global, pos_train_global_ = tee(pos_train_global)
            neg_train_global, neg_train_global_ = tee(neg_train_global)
            pos_test_global, pos_test_global_ = tee(pos_test_global)
            neg_test_global, neg_test_global_ = tee(neg_test_global)

            # Training subset of each class; the remainder is not used.
            pos, _ = random_bipartition_iter(pos_train_global_, percentage)
            pos, pos_ = tee(pos)
            neg, _ = random_bipartition_iter(neg_train_global_, percentage)
            neg, neg_ = tee(neg)

            # Sample independently from the 2 classes.
            logger.info('Positive')
            sampled_pos = fit_sample(pos_, random_state=random_state)
            logger.info('Negative')
            sampled_neg = fit_sample(neg_, random_state=random_state)

            # Evaluate the predictive performance on the held-out test set.
            start = time()
            logger.info("=" * 80)
            logger.info('repetition: %d/%d' % (repetition + 1, n_repetitions))
            logger.info("training percentage:" + str(percentage))
            perf_orig, perf_samp, perf_orig_samp = fit_and_evaluate(
                pos, neg, sampled_pos, sampled_neg,
                pos_test_global_, neg_test_global_)
            logger.info('Time elapsed for full repetition: %.1f sec' % ((time() - start)))
            originals.append(perf_orig)
            originals_samples.append(perf_orig_samp)
            samples.append(perf_samp)

        original_repetitions.append(originals)
        original_sample_repetitions.append(originals_samples)
        sample_repetitions.append(samples)

    return original_repetitions, original_sample_repetitions, sample_repetitions
Exemplo n.º 11
0
def train_stem_finder_model(fasta, model_stem_name, window_c, model_c_name, window_d, model_d_name,
                            flank_size_l, flank_size_r, train_test_split=0.7, neg_size_factor=4,
                            n_jobs=4, n_iter=40, fasta_test=None):
    """Train an EDeN binary classifier on stem regions and print its test results.

    Stem candidates are built in stages: C-box windows are cut from every
    sequence and scored with the model loaded from `model_c_name`; for
    candidates with a score >= 0, D-box windows are cut and scored with the
    model loaded from `model_d_name`; finally the region around the C box
    and the region around the D box are joined with "&". Negative instances
    are derived from the candidates with shuffle_modifier.

    FASTA headers are split on "_": the third-from-last field is parsed as
    the C-box position and the last field as the D-box position (presumably
    1-based, given the ``pos - 1`` slicing; confirm against the data).

    Args:
        fasta: passed to fasta_to_sequence to read the training sequences.
        model_stem_name: passed to model.optimize as model_name.
        window_c: flank length of the C-box windows.
        model_c_name: passed to model.load for the C-box model.
        window_d: flank length of the D-box windows.
        model_d_name: passed to model.load for the D-box model.
        flank_size_l: accepted for interface compatibility; not used, the
            stem flank is fixed at 10 nucleotides.
        flank_size_r: accepted for interface compatibility; not used.
        train_test_split: relative_size of the train/test split, used only
            when `fasta_test` is None.
        neg_size_factor: passed to seq_to_seq as times=.
        n_jobs: passed to the model and offered as the estimator's n_jobs.
        n_iter: number of optimization iterations; also the number of
            random draws generated for the sampled hyperparameters.
        fasta_test: optional FASTA source for a separate test set. When
            None, the training candidates are split into train and test.

    Returns:
        None. Sample counts, model parameters and the summary lines (with
        RSS lines appended) are printed.
    """
    from numpy.random import randint
    from numpy.random import uniform
    from sklearn.linear_model import SGDClassifier

    from eden.converter.fasta import fasta_to_sequence
    from eden.graph import Vectorizer
    from eden.model import ActiveLearningBinaryClassificationModel
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    from eden.util import random_bipartition_iter

    # Candidates are scored in chunks of this size to bound memory use.
    batch_size = 10000
    # Nucleotides kept before the C box and after the D box.
    stem_flank = 10

    def pre_process_graph(iterator, **options):
        """Fold the sequences into graphs with rnasubopt_to_eden."""
        # Import kept inside the function, as in the original code.
        # NOTE(review): presumably so the pre-processor stays self-contained
        # when the model is stored - confirm.
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        return rnasubopt_to_eden(iterator, **options)

    def get_Cbox(seqs, window_c):
        """Yield ([seq, cpos], box) for every sequence whose C-box window fits."""
        for seq in seqs:
            cpos = int(seq[0].split('_')[-3])
            nts = seq[1].replace('\n', '')
            box_start = cpos - 1
            box_end = cpos + 6
            if box_start - window_c >= 0 and len(nts) >= box_end + window_c:
                box = (nts[box_start - window_c:box_start] + 'x' +
                       nts[box_start:box_end] + 'y' +
                       nts[box_end:box_end + window_c])
                yield [seq, cpos], box

    def get_Dbox(seqs_c, window_d):
        """Yield ([info, cbox, pred, dpos], box) for every scored C-box candidate.

        Each input item is (([seq, cpos], cbox), pred) as produced by
        get_Cbox followed by predict_in_batches.
        """
        for (info, cbox), pred in seqs_c:
            header, raw_nts = info[0][0], info[0][1]
            dpos = int(header.split('_')[-1])
            nts = raw_nts.replace('\n', '')
            box_start = dpos - 1
            box_end = dpos + 3
            if box_start - window_d >= 0 and len(nts) >= box_end + window_d:
                box = (nts[box_start - window_d:box_start] + 'x' +
                       nts[box_start:box_end] + 'y' +
                       nts[box_end:box_end + window_d])
                yield [info, cbox, pred, dpos], box

    def predict_in_batches(model, candidates):
        """Return a list of (candidate, score) pairs from model.decision_function."""
        scored = []
        batch = []
        for candidate in candidates:
            batch.append(candidate)
            if len(batch) == batch_size:
                preds = model.decision_function(batch)
                scored.extend(zip(batch, preds))
                batch = []
        if batch:
            preds = model.decision_function(batch)
            scored.extend(zip(batch, preds))
        return scored

    def get_stem(seqs):
        """Return [[header, pos_c, pos_d], stem] for every accepted candidate."""
        # 1) C-box candidates, scored by the C-box model.
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)
        scored_c = predict_in_batches(model, get_Cbox(seqs, window_c))

        # Discard candidates with a negative score.
        accepted_c = [cand for cand in scored_c if cand[1] >= 0.0]

        # 2) D-box candidates, scored by the D-box model.
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)
        scored_d = predict_in_batches(model, get_Dbox(accepted_c, window_d))

        # 3) Cut the stem region out of each sequence. Item layout:
        # (([[(header, seq), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d)
        stem_cands = []
        for ([[(header, nts), pos_c], _cand_c, _pred_c, pos_d], _cand_d), _pred_d in scored_d:
            # The start is clamped at 0; a slice end beyond the sequence is
            # clamped by slicing itself.
            left = nts[max(pos_c - stem_flank, 0):pos_c + 6]
            right = nts[pos_d - 1:pos_d + 3 + stem_flank]
            stem_cands.append([[header, pos_c, pos_d], left + "&" + right])
        return stem_cands

    vectorizer = Vectorizer()
    # NOTE(review): class_weight='auto' and the 'n_iter' hyperparameter below
    # are only accepted by older scikit-learn releases - confirm the version.
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    # Positive data and shuffled negatives.
    train_pos = get_stem(fasta_to_sequence(fasta))
    train_neg = list(seq_to_seq(train_pos, modifier=shuffle_modifier,
                                times=neg_size_factor, order=2))

    # Single-argument print() gives the same output on Python 2 and 3.
    if fasta_test is None:
        print("Training and Test with the same dataset (different sequences)")
        pos_train, pos_test = random_bipartition_iter(
            train_pos, relative_size=train_test_split)
        neg_train, neg_test = random_bipartition_iter(
            train_neg, relative_size=train_test_split)

        iterable_pos_train = list(pos_train)
        iterable_neg_train = list(neg_train)
        iterable_pos_test = list(pos_test)
        iterable_neg_test = list(neg_test)
    else:
        print("test dataset =  %s \n" % (fasta_test,))
        test_pos = get_stem(fasta_to_sequence(fasta_test))
        test_neg = list(seq_to_seq(test_pos, modifier=shuffle_modifier,
                                   times=neg_size_factor, order=2))

        iterable_pos_train = train_pos
        iterable_neg_train = train_neg
        iterable_pos_test = test_pos
        iterable_neg_test = test_neg

    print("Positive training samples:  %d" % len(iterable_pos_train))
    print("Negative training samples:  %d" % len(iterable_neg_train))
    print("--------\nPositive test samples:  %d" % len(iterable_pos_test))
    print("Negative test samples:  %d" % len(iterable_neg_test))

    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)

    # Hyperparameter search space.
    pre_processor_parameters = {'energy_range': [3, 4, 5, 6, 7, 8, 9, 10],
                                'max_num_subopts': randint(100, 200, size=n_iter),
                                'max_num': [3, 4, 5, 6, 7, 8]}

    vectorizer_parameters = {'complexity': [2, 3]}

    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['log'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, 0)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ["invscaling", "constant", "optimal"],
                            'n_jobs': [n_jobs]}

    # Optimize hyperparameters and fit the model.
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_stem_name,
                   max_total_time=60 * 60 * 24, n_iter=n_iter,
                   n_active_learning_iterations=3,
                   cv=10,
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # Estimate predictive performance.
    print(model.get_parameters())

    result, text = model.estimate(iterable_pos_test, iterable_neg_test)

    rss = 0.0
    n_scored = 0
    for prob in result:
        n_scored += 1
        # prob[1] selects which entry of prob[0] is compared against 1
        # (presumably prob[0] holds class probabilities and prob[1] the
        # true label - confirm against model.estimate).
        expected = prob[0][1] if prob[1] == 1 else prob[0][0]
        rss += (1 - expected) ** 2

    # No scored instance means the average is undefined; report nan instead
    # of failing with ZeroDivisionError.
    avg_rss = rss / n_scored if n_scored else float('nan')
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)

    for t in text:
        print(t)