print 'cython rc: %f sec to rc one dna seq of length %d'%(c_time,len(seq)) print '%.1f-fold speedup.'%(p_time/c_time) print '-----------------------------' # Test seq2sitelist site_length = 20 t = time.time() x = [seq[i:(i+site_length)] for i in range(len(seq)-site_length+1)] p_time = time.time()-t print 'python seq2sitelist: %f sec to chop a seq into %d sites'%\ (p_time,len(x)) t = time.time() x = fast.seq2sitelist(seq,site_length, safe=False) c_time = time.time()-t print 'cython seq2sitelist: %f sec to chop a seq into %d sites'%\ (c_time,len(x)) print '%.1f-fold speedup.'%(p_time/c_time) print '-----------------------------' # Test seq2sitelist rc feature site_length = 20 t = time.time() x = fast.seq2sitelist(seq,site_length, safe=False) y = [fast.reverse_complement(s, safe=False) for s in x] p_time = time.time()-t print 'python, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\
def main(model_df, contig_list, numsites=10, verbose=False):
    """Scan contigs with a model and return the top-scoring sites.

    Every window of the model's length in each contig is scored with the
    model. When the sequence type is 'dna', the reverse complement of every
    window is scored as well. After each contig the accumulated sites are
    re-sorted by score and cropped, so at most ``numsites`` rows are carried
    from one contig to the next.

    Args:
        model_df: Model dataframe. It is checked with ``qc.validate_model``
            and used to build a ``Models.LinearModel`` (model type 'MAT')
            or a ``Models.NeighborModel`` (model type 'NBR').
        contig_list: Collection of ``(contig_str, contig_name, pos_offset)``
            tuples. It is traversed twice, so it must be re-iterable.
        numsites: Maximum number of sites kept in the result.
        verbose: If true, write one progress dot to stdout per scanned
            contig, followed by a final newline.

    Returns:
        The dataframe produced by ``qc.validate_sitelist(..., fix=True)``
        from the collected sites. Before that call the columns are 'val',
        the sequence column for the seqtype, 'left', 'right', 'ori' and
        'contig', with rows ordered by descending 'val'.

    Raises:
        SortSeqError: If a contig contains a character outside the alphabet
            for the model's seqtype, if the model type is neither 'MAT'
            nor 'NBR', or if no contig is long enough to hold a site.
    """
    # Determine the sequence type and model type from the model itself.
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    # The returned dictionaries are not used here. The call is retained in
    # case it validates the seqtype/modeltype pair -- TODO confirm.
    utils.choose_dict(seqtype, modeltype=modeltype)

    # Check that all characters are from the correct alphabet. The pattern
    # is the same for every contig, so compile it once.
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    invalid_char = re.compile(r"[^%s]" % alphabet)
    for contig_str, contig_name, pos_offset in contig_list:
        if invalid_char.search(contig_str):
            raise SortSeqError(
                'Invalid character for seqtype %s found in %s.'
                % (seqtype, contig_name))

    # Create the model object used to evaluate sequences.
    if modeltype == 'MAT':
        model_obj = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        model_obj = Models.NeighborModel(model_df)
    else:
        # Without this branch model_obj would be unbound and the failure
        # would surface below as an unrelated NameError.
        raise SortSeqError('Unrecognized model type %s.' % modeltype)

    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    columns = ['val', seq_col, 'left', 'right', 'ori', 'contig']
    sitelist_df = pd.DataFrame(columns=columns)

    for contig_str, contig_name, pos_offset in contig_list:
        # A contig shorter than the model cannot contain a full site.
        if len(contig_str) < L:
            continue

        this_df = pd.DataFrame(columns=columns)
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)

        # 'left' and 'right' are the inclusive window bounds, shifted by
        # the contig's position offset.
        this_df['left'] = poss + pos_offset
        this_df['right'] = poss + pos_offset + L - 1

        # One length-L window per start position (Cython helper).
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)
        this_df['ori'] = '+'
        this_df['contig'] = contig_name
        this_df['val'] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan the reverse complement as well. this_df is
        # reused: positions and contig stay the same, only the sequence,
        # orientation and score columns are overwritten.
        if seqtype == 'dna':
            this_df[seq_col] = fast.seq2sitelist(contig_str, L, rc=True)
            this_df['ori'] = '-'
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df],
                                    ignore_index=True)

        # Sort by value and reindex.
        sitelist_df.sort_values(by='val', ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True, inplace=True)

        # Crop the list at numsites so it stays small between contigs.
        if sitelist_df.shape[0] > numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            # Space-separated progress dots, written directly to stdout so
            # the output does not depend on the Python 2 print statement.
            sys.stdout.write('. ')
            sys.stdout.flush()

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.flush()

    # If no sites were found, raise an error.
    if sitelist_df.shape[0] == 0:
        raise SortSeqError(
            'No full-length sites found within provided contigs.')

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    return sitelist_df