示例#1
0
def get_sequences_with_names(size=9999, rand=0):
    """Yield up to *size* (name, sequence) pairs for the RFAM fasta file.

    When ``rand`` is positive, one side of a random bipartition
    (relative size 0.9, seeded by ``random.random() * rand``) is taken
    first, so the slice is randomized; otherwise the first *size*
    records are returned in file order.
    """
    source = fasta_to_sequence("../toolsdata/%s.fa" % RFAM)
    if rand > 0:
        picked, _rest = random_bipartition_iter(
            source, .9, random_state=random.random() * rand)
        return itertools.islice(picked, size)
    return itertools.islice(source, size)
示例#2
0
def get_sequences_with_names(size=9999, rand=0):
    """Return an iterator over at most *size* RFAM fasta records.

    A positive ``rand`` first splits off a random partition
    (relative size 0.9) via ``random_bipartition_iter`` and slices
    that; otherwise the file is sliced directly.
    """
    fasta_path = "../toolsdata/%s.fa" % RFAM
    if rand > 0:
        kept, _dropped = random_bipartition_iter(
            fasta_to_sequence(fasta_path),
            .9,
            random_state=random.random() * rand)
        return itertools.islice(kept, size)
    else:
        return itertools.islice(fasta_to_sequence(fasta_path), size)
示例#3
0
    def test_sequence_to_eden_id_attribute(self):
        """Graph 'id' attribute must equal the fasta header (incl. annotation)."""
        graphs = sequence_to_eden(
            fasta_to_sequence(
                "test/test_fasta_to_sequence_with_center_annotation.fa"))
        first = graphs.next()
        assert first.graph["id"] == "ID0 center:25"
def get_sequences_with_names(size=9999, rand=True):
    """Return up to *size* (name, sequence) records from the RFAM fasta file.

    Parameters
    ----------
    size : int
        Maximum number of records returned.
    rand : bool
        If true, *size* records are drawn at random (without
        replacement, via a shuffled index list); otherwise the first
        *size* records are returned in file order.
    """
    records = list(fasta_to_sequence("../toolsdata/%s.fa" % RFAM))
    if rand:
        # list(range(...)) instead of bare range(...): random.shuffle
        # needs a mutable sequence (also works on Python 3).
        indices = list(range(len(records)))
        random.shuffle(indices)
        return selection_iterator(records, indices[:size])
    return itertools.islice(records, size)
示例#5
0
def split_to_train_and_test(rfam_id=None, train_to_test_split_ratio=None, number_of_samples=None):
    """Split the sequences of an Rfam family into train and test iterators.

    Parameters
    ----------
    rfam_id : str
        Rfam family identifier, resolved to a fetchable URL by ``rfam_url``.
    train_to_test_split_ratio : float
        Relative size of the training partition (passed to
        ``random_bipartition_iter``).
    number_of_samples : int or None
        Optional cap on how many sequences are used.

    Returns
    -------
    (train, test)
        Pair of iterators over (header, sequence) tuples.
    """
    iterable = fasta_to_sequence(rfam_url(rfam_id))
    if number_of_samples:
        iterable = islice(iterable, number_of_samples)
        # fix: message previously misspelled 'cunducted'; use lazy %-args
        # so formatting only happens if INFO is enabled.
        logger.info('Experiment conducted with %d sample sequences',
                    number_of_samples)

    train, test = random_bipartition_iter(
        iterable, relative_size=train_to_test_split_ratio)
    return train, test
示例#6
0
    def test_fasta_to_sequence_no_normalize(self):
        """With normalize=False the sequence is returned exactly as in the file."""
        seq_iter = fasta_to_sequence("test/test_fasta_to_sequence.fa",
                                     normalize=False)
        assert (is_iterable(seq_iter))
        header, sequence = seq_iter.next()
        # unmodified fasta string: mixed case is preserved, Ts untouched
        expected = "gtggcgtactcacggccaCCTTAGGACTCCGCGGACTTTATGCCCACCAAAAAAACGAGCCGTTTCTACGCGTCCTCCGTCGCCTgtgtcgataaagcaa"
        assert (sequence == expected)
示例#7
0
    def test_fasta_to_sequence_normalized(self):
        """With normalize=True the sequence is uppercased and T becomes U."""
        seq_iter = fasta_to_sequence("test/test_fasta_to_sequence.fa",
                                     normalize=True)
        assert (is_iterable(seq_iter))
        header, sequence = seq_iter.next()
        expected = "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
        assert (sequence == expected)
示例#8
0
    def test_fasta_to_sequence_default(self):
        """Default parameters: header stripped of '>', sequence normalized."""
        seq_iter = fasta_to_sequence("test/test_fasta_to_sequence.fa")
        assert (is_iterable(seq_iter))
        header, sequence = seq_iter.next()
        # fasta header without the leading '>'
        assert (header == "ID0")
        # normalization: uppercase, T replaced by U
        expected = "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
        assert (sequence == expected)
 def extract_box(data, window=3, box_type='C'):
     """Yield (header, marked_box) pairs for C-box motifs in a fasta file.

     The fasta header is assumed to carry '_'-separated fields whose
     third-from-last entry is the 1-based C-box position (TODO confirm
     against the producer of these headers). The yielded box string is
     the 6-nt box flanked by *window* nucleotides on each side, with
     'x'/'y' markers delimiting the box itself.

     Changes vs. original: removed unused cbox/dbox/dpos parsing and a
     commented-out line; raw string for the regex.
     """
     import re
     from eden.converter.fasta import fasta_to_sequence

     for header, raw_seq in fasta_to_sequence(data):
         fields = header.split('_')
         cpos = int(fields[-3])
         nts = re.sub(r'\n', '', raw_seq)
         if box_type == 'C':
             # skip records where the window would run off either end
             if cpos - 1 - window >= 0 and len(nts) >= cpos + 6 + window:
                 box = (nts[cpos - 1 - window:cpos - 1] + 'x' +
                        nts[cpos - 1:cpos + 6] + 'y' +
                        nts[cpos + 6:cpos + 6 + window])
                 yield header, box
示例#10
0
    def _fold_sequences(self):
        """Fold the sequences in self.fasta with RNAplfold and return graphs."""
        if self.verbose:
            print("Folding sequences using RNAplfold -W %i -L %i -c %f \
                --noLP..." % (self.window_size, self.max_bp_span,
                              self.avg_bp_prob_cutoff), end=' ')
            sys.stdout.flush()

        folded = rnaplfold_to_eden(fasta_to_sequence(self.fasta),
                                   window_size=self.window_size,
                                   max_bp_span=self.max_bp_span,
                                   avg_bp_prob_cutoff=self.avg_bp_prob_cutoff,
                                   max_num_edges=1)
        if self.verbose:
            print("Done.\n")
            sys.stdout.flush()
        return folded
示例#11
0
    def _fold_sequences(self):
        """Produce eden graphs by folding self.fasta with RNAplfold."""
        if self.verbose:
            print(
                "Folding sequences using RNAplfold -W %i -L %i -c %f \
                --noLP..." %
                (self.window_size, self.max_bp_span, self.avg_bp_prob_cutoff),
                end=' ')
            sys.stdout.flush()

        sequence_stream = fasta_to_sequence(self.fasta)
        result = rnaplfold_to_eden(
            sequence_stream,
            window_size=self.window_size,
            max_bp_span=self.max_bp_span,
            avg_bp_prob_cutoff=self.avg_bp_prob_cutoff,
            max_num_edges=1)
        if self.verbose:
            print("Done.\n")
            sys.stdout.flush()
        return result
def get_sequences_with_names(filename='RF00005.fa'):
    """Return the (name, sequence) iterator for a fasta file in ../toolsdata."""
    return fasta_to_sequence("../toolsdata/" + filename)
示例#13
0
def fasta_to_list(fname):
    """Materialize the fasta records of *fname* into a list.

    Returns a list of whatever ``fasta_to_sequence`` yields
    ((header, sequence) pairs).
    """
    # list() replaces the identity comprehension [e for e in ...]:
    # clearer and avoids a redundant Python-level loop.
    return list(fasta_to_sequence(fname))
def read_and_permute(samples='RF00005.fa'):
    """Yield (name, permuted_sequence) for every record in *samples*."""
    for header, sequence in fasta_to_sequence(samples):
        yield (header, permute_sequence(sequence))
def get_seq_tups(fname, size, sizeb):
    """Return two disjoint random samples of records from a toolsdata fasta.

    The file is read completely, shuffled in place, and split into a
    first chunk of *size* records and a following chunk of *sizeb*.
    """
    records = list(fasta_to_sequence("../toolsdata/" + fname))
    random.shuffle(records)
    return records[:size], records[size:size + sizeb]
def get_sequences_with_names(filename='RF00005.fa'):
    """Open ../toolsdata/<filename> and return its fasta record iterator."""
    path = "../toolsdata/" + filename
    return fasta_to_sequence(path)
示例#17
0
def test_fasta_to_sequence_graph():
    """Smoke test: fasta records convert to eden graphs without error."""
    records = fasta_to_sequence("test/test_fasta_to_sequence.fa")
    sequence_to_eden(records)
示例#18
0
def get_graphss(rfam_id='../toolsdata/RF00005'):
    """Return fasta records for *rfam_id*, resolved via rfam_uri."""
    uri = rfam_uri(rfam_id)
    return fasta_to_sequence(uri)
示例#19
0
def read_and_permute(samples='RF00005.fa'):
    """Generator of (name, seq) pairs with each sequence permuted."""
    records = fasta_to_sequence(samples)
    for name, seq in records:
        permuted = permute_sequence(seq)
        yield (name, permuted)
示例#20
0
        seqs = self._design(graphs)
        seqs = self._filter_seqs(seqs)
        return seqs

    def fit_sample(self, seqs):
        """Fit on one copy of *seqs*, then sample from an independent copy."""
        fit_stream, sample_stream = tee(seqs)
        return self.fit(fit_stream).sample(sample_stream)

    def predict(self, seqs):
        """Yield one estimator prediction per input sequence.

        Sequences are transformed to graphs (mfe=True) before prediction.
        """
        graphs = self.pre_processor.transform(seqs, mfe=True)
        for value in self.vectorizer.predict(graphs, self.estimator):
            yield value


if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)
    logger.info('Call to RNASynthesizer module.')

    rfam_id = 'RF01685'
    iterable_seq = fasta_to_sequence(
        'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'
        % (rfam_id, rfam_id))
    synthesizer = RNASynthesizerInitializer().synthesizer
    synth_seqs = synthesizer.fit_sample(iterable_seq)
    for header, seq in synth_seqs:
        print header
        print seq
示例#21
0
        graphs = self._filter_graphs(graphs)
        seqs = self._design(graphs)
        seqs = self._filter_seqs(seqs)
        return seqs

    def fit_sample(self, seqs):
        """Train on the sequences, then draw samples from them."""
        first_pass, second_pass = tee(seqs)
        model = self.fit(first_pass)
        return model.sample(second_pass)

    def predict(self, seqs):
        """Generator over estimator predictions for *seqs* (mfe graphs)."""
        transformed = self.pre_processor.transform(seqs, mfe=True)
        prediction_stream = self.vectorizer.predict(transformed, self.estimator)
        for prediction in prediction_stream:
            yield prediction


if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)
    logger.info('Call to RNASynthesizer module.')

    rfam_id = 'RF01685'
    iterable_seq = fasta_to_sequence(
        'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (rfam_id, rfam_id))
    synthesizer = RNASynthesizerInitializer().synthesizer
    synth_seqs = synthesizer.fit_sample(iterable_seq)
    for header, seq in synth_seqs:
        print header
        print seq
示例#22
0
def get_sequences(size=9999):
    """Return just the sequence strings of the first *size* fasta records."""
    records = itertools.islice(fasta_to_sequence("../example/RF00005.fa"), size)
    return [seq for (_name, seq) in records]
示例#23
0
def get_sequences_with_names(size=9999):
    """Iterator over the first *size* (name, sequence) records of RF00005."""
    return itertools.islice(fasta_to_sequence("../toolsdata/RF00005.fa"),
                            size)
示例#24
0
def get_sequences(size=9999):
    """Return a list of the first *size* sequence strings from RF00005.fa."""
    head = itertools.islice(fasta_to_sequence("../toolsdata/RF00005.fa"),
                            size)
    return [seq for (_hdr, seq) in head]
示例#25
0
 def load_data(self, args):
     """Return the fasta record iterator for args.input_file."""
     return fasta_to_sequence(args.input_file)
def get_graphss(rfam_id="../toolsdata/RF00005"):
    return fasta_to_sequence(rfam_uri(rfam_id))
示例#27
0
def get_graphs(rfam_id='../example/RF00005', size=9999):
    """Return up to *size* cleaned RNAfold eden graphs for an Rfam family."""
    seqs = fasta_to_sequence(rfam_uri(rfam_id))
    folded = rnafold_to_eden(seqs, shape_type=5, energy_range=30, max_num=3)
    return islice(clean(folded), size)
示例#28
0
def get_graphs(rfam_id='../example/RF00005', size=9999):
    """First *size* cleaned secondary-structure graphs of the family."""
    records = fasta_to_sequence(rfam_uri(rfam_id))
    graphs = clean(
        rnafold_to_eden(records, shape_type=5, energy_range=30, max_num=3))
    return islice(graphs, size)
import random
# Script parameters. NOTE(review): num_seqs is not used in this chunk —
# presumably the number of random sequences to generate elsewhere; confirm.
num_seqs = 400
minlen = 74
maxlen = 90
samples = 'RF00005.fa'
symbols = 'AUGC'

# analyse original
from eden.converter.fasta import fasta_to_sequence
# count all the symbols
# symboldict: per-symbol occurrence totals over all sequences in the
# sample file; used later as sampling weights.
symboldict = {symbol: 0 for symbol in symbols}
for name, seq in fasta_to_sequence(samples):
    for symbol in symboldict.keys():
        symboldict[symbol] += seq.count(symbol)

def choosesymbol(total, weights, symbols):
    """Pick one symbol at random, weighted by *weights*.

    Parameters
    ----------
    total : int
        Upper bound for the random draw; expected to be sum(weights).
    weights : sequence of int
        Per-symbol weights, aligned index-for-index with *symbols*.
    symbols : sequence
        Candidate symbols.

    Returns
    -------
    An element of *symbols*, or None (after printing an error) in the
    inconsistent case total > sum(weights).
    """
    i = random.randint(0, total)
    for idx, weight in enumerate(weights):
        i -= weight
        if i <= 0:
            return symbols[idx]
    # fix: message previously misspelled/unprofessional ('ERRER ...').
    # Parenthesized print works identically under Python 2 and 3.
    print('ERROR: choosesymbol fell through; total exceeds sum(weights)')


def make_random_sequence(minlen, maxlen, weights, symbols, total):
    """Build one random sequence with length uniform in [minlen, maxlen].

    Each position is drawn independently via choosesymbol with the
    given weights.
    """
    n = random.randint(minlen, maxlen)
    return ''.join(choosesymbol(total, weights, symbols) for _ in xrange(n))

示例#30
0
def get_sequences_with_names(size=9999):
    """First *size* (header, sequence) pairs from ../example/RF00005.fa."""
    src = fasta_to_sequence("../example/RF00005.fa")
    return itertools.islice(src, size)
def train_stem_finder_model(fasta,model_stem_name,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r, train_test_split=0.7,neg_size_factor = 4, n_jobs=4, n_iter=40,fasta_test=None):

    ########### Pre processor ####################
    def pre_process_graph(iterator, **options):    
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        graphs = rnasubopt_to_eden(iterator, **options)
        return graphs
    ########## Vectorizer ########################
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    ######### Estimator #########################
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle = True, average=True )

    
    def get_Cbox(seqs,window_c):
		import re
		for seq in seqs:
			header = seq[0].split('_')
			cpos = int(header[-3])
			nts = re.sub('\n','',seq[1])
			if (not((len(nts) < cpos+6+window_c) or (cpos-1-window_c < 0))):
				box = nts[cpos-1-window_c:cpos-1]+'x'+nts[cpos-1:cpos+6]+'y'+nts[cpos+6:cpos+6+window_c]
				yield [seq,cpos],box

    def get_Dbox(seqs_c,window_d):
	    import re 
	    for [seq,cbox],pred in seqs_c:
			header = seq[0][0].split('_')
			dpos = int(header[-1])
			nts = re.sub('\n','',seq[0][1])
			if (not((len(nts) < dpos+3+window_d) or (dpos-1-window_d < 0))):
				box = nts[dpos-1-window_d:dpos-1]+'x'+nts[dpos-1:dpos+3]+'y'+nts[dpos+3:dpos+3+window_d]
				yield [seq,cbox,pred,dpos],box
    
                    
    ######### Get stem #########################
    def get_stem(seqs,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r):
        from itertools import izip 
        import re
        from itertools import tee,islice
       
        #1)c_finder
        seqs_c = get_Cbox(seqs,window_c)
		

        #2)submit the Cbox candidates to the model
        from eden.model import ActiveLearningBinaryClassificationModel
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)
        
        seqs_c_pred = list()
        cands_c = list()
        max_count = 0        
        
        for seq_c in seqs_c:
            max_count +=1
            cands_c.append(seq_c)
            if (max_count == 10000): #in order to not generate memory leak I've restricted the number of samples to be submited to the model
				preds = model.decision_function(cands_c)
				seqs_c_pred = seqs_c_pred + zip(cands_c,preds)
				cands_c = list()
				max_count = 0
        if (max_count != 0):
			preds = model.decision_function(cands_c)
			seqs_c_pred = seqs_c_pred + zip(cands_c,preds)
        
        #discard sequences with pred < 0
        seqs_c = list()
        for cand in seqs_c_pred:
			if (cand[1] >= 0.0):
				seqs_c.append(cand)
        
        
        #D_finder
        seqs_cd = get_Dbox(seqs_c,window_d)
        #submit Dboxes candidate to its model
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)
        
        seqs_d_pred = list()
        cands_d = list()
        max_count = 0        
        
        for seq_d in seqs_cd:
            max_count +=1
            cands_d.append(seq_d)
            if (max_count == 10000): #in order to not generate memory leak I've restricted the number of samples to be submited to the model
				preds = model.decision_function(cands_d)
				seqs_d_pred = seqs_d_pred + zip(cands_d,preds)
				cands_d = list()
				max_count = 0
        if (max_count != 0):
			preds = model.decision_function(cands_d)
			seqs_d_pred = seqs_d_pred + zip(cands_d,preds)
		
	#Get the stem region from the sequences
        stem_cands=[]
        stem_info =[]
        #(([[(header, seq), pos_c], cand_c, pred_c, pos_d], 'UAAxCUGAyGAU'), 77.000434164559792)

        for ([[(header,nts),pos_c],cand_c,pred_c,pos_d],cand_d),pred_d in seqs_d_pred:
			#print header,'\t',seq,pos_c,'\t',cand_c,'\t',pred_c,'\t',cand_d,'\t',pred_d,"\n---\n" 
			if ( int(pos_c) - 10 < 0):
				if (int(pos_d)+10 > len(nts)):
					stem_cands.append([[header,pos_c,pos_d],nts[0:int(pos_c)+6]+"&"+nts[int(pos_d)-1:len(nts)]])
				else:
					stem_cands.append([[header,pos_c,pos_d],nts[0:int(pos_c)+6]+"&"+nts[int(pos_d)-1:int(pos_d)+3+10]])
					
			else:
				if (int(pos_d)+10 > len(nts)):
					stem_cands.append([[header,pos_c,pos_d],nts[int(pos_c)-10:int(pos_c)+6]+"&"+nts[int(pos_d)-1:len(nts)]])
					
				else:
					stem_cands.append([[header,pos_c,pos_d],nts[int(pos_c)-10:int(pos_c)+6]+"&"+nts[int(pos_d)-1:int(pos_d)+3+10]])
					
		
        return stem_cands
            
			
    
    #get positive data
    pos_cds=[]

    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(fasta)

    train_pos = get_stem(seqs,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r)
    train_pos = list(train_pos)

    

    #for h,seq in stems_cds:
	#	print h[0][0:10],'\t',seq

    #Generate Negative Dataset
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    train_neg = seq_to_seq(train_pos, modifier=shuffle_modifier, times=neg_size_factor, order=2 )
    train_neg = list(train_neg)
 
    
    #######Split the data into training and test
    if (fasta_test == None):
        print "Training and Test with the same dataset (different sequences)"
        #split train/test
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(train_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(train_neg, relative_size=train_test_split)
        
        
        iterable_pos_train = list(iterable_pos_train)
        iterable_neg_train = list(iterable_neg_train)
        
        iterable_pos_test = list(iterable_pos_test)
        iterable_neg_test = list(iterable_neg_test)
        
    
        
      
        

    else:        
        print "test dataset = ",fasta_test,"\n"
        pos_test_cds=[]
        neg_test_cds=[]

        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(fasta_test)

        test_pos = get_stem(seqs,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r)
        test_pos = list(test_pos)

        #Generate Negative test data
        test_neg = seq_to_seq(test_pos, modifier=shuffle_modifier, times=neg_size_factor, order=2 )
        test_neg = list(test_neg)
        
        iterable_pos_train = list(train_pos)
        iterable_neg_train = list(train_neg)
        iterable_pos_test  = list(test_pos)
        iterable_neg_test  = list(test_neg)
        
    print "Positive training samples: ",len(iterable_pos_train)
    print "Negative training samples: ",len(iterable_neg_train)
    print "--------\nPositive test samples: ",len(iterable_pos_test)
    print "Negative test samples: ",len(iterable_neg_test)



    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph, 
                                                    estimator=estimator, 
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    #optimize hyperparameters and fit model:
    from numpy.random import randint
    from numpy.random import uniform


    pre_processor_parameters={'energy_range':[3,4,5,6,7,8,9,10],
							   'max_num_subopts':randint(100,200,size=n_iter),
							   'max_num': [3,4,5,6,7,8]}


    vectorizer_parameters={'complexity':[2,3]}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['log'],#'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,0)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"],
                          'n_jobs':[n_jobs]}

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_stem_name,
                   max_total_time=60*60*24, n_iter=n_iter, 
                   n_active_learning_iterations=3,
                   cv=10,
                   score_func=lambda avg_score,std_score : avg_score - std_score * 2,
                   scoring='roc_auc', 
                   pre_processor_parameters = pre_processor_parameters,
		   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)

    #estimate predictive performance
    print model.get_parameters()

    result,text = model.estimate( iterable_pos_test, iterable_neg_test )

    rss=0
    i = 0

    for prob in result:
        i=i+1
        #print prob
        if (prob[1] == 1):
            rss = rss + ((1 - prob[0][1])**2)
        else:
            rss = rss + ((1 - prob[0][0])**2)

    avg_rss= rss/i;
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)

    for t in text:
        print t