示例#1
0
# Report the timing of the preceding reverse-complement benchmark.
# NOTE(review): c_time, p_time and seq are set earlier in the script, outside
# this excerpt; p_time/c_time is presumably the Python-vs-Cython time ratio.
print 'cython rc: %f sec to rc one dna seq of length %d'%(c_time,len(seq))

print '%.1f-fold speedup.'%(p_time/c_time)

print '-----------------------------'
# Benchmark seq2sitelist: chop seq into every overlapping window of
# site_length characters, first with a plain list comprehension and then
# with fast.seq2sitelist, and report the ratio of the two wall-clock times.

site_length = 20
# Baseline: pure-Python slicing, one window per start position.
t = time.time()
x = [seq[i:(i+site_length)] for i in range(len(seq)-site_length+1)]
p_time = time.time()-t
print 'python seq2sitelist: %f sec to chop a seq into %d sites'%\
    (p_time,len(x))

# Same task via the compiled helper.
# NOTE(review): safe=False presumably skips input validation -- confirm
# against the fast module.
t = time.time()
x = fast.seq2sitelist(seq,site_length, safe=False)
c_time = time.time()-t
print 'cython seq2sitelist: %f sec to chop a seq into %d sites'%\
    (c_time,len(x))

print '%.1f-fold speedup.'%(p_time/c_time)

print '-----------------------------'
# Benchmark the reverse-complement feature of seq2sitelist.
# The baseline timed below (labelled 'python' in the report that follows)
# builds the site list with fast.seq2sitelist and then reverse-complements
# each site with fast.reverse_complement inside a Python list comprehension.

site_length = 20
t = time.time()
x = fast.seq2sitelist(seq,site_length, safe=False)
y = [fast.reverse_complement(s, safe=False) for s in x] 
p_time = time.time()-t
print 'python, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\
示例#2
0
def main(model_df, contig_list, numsites=10, verbose=False):

    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    seq_dict,inv_dict = utils.choose_dict(seqtype,modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    search_string = r"[^%s]"%alphabet
    for contig_str, contig_name, pos_offset in contig_list:
        if re.search(search_string,contig_str):
            raise SortSeqError(\
                'Invalid character for seqtype %s found in %s.'%\
                (seqtype,contig_name))

    # Create model object to evaluate on seqs
    if modeltype == 'MAT':
        model_obj = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        model_obj = Models.NeighborModel(model_df)
    
    # Create list of dataframes, one for each contig
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    sitelist_df = pd.DataFrame(\
            columns=['val',seq_col,'left','right','ori','contig'])
    for contig_str, contig_name, pos_offset in contig_list:
        if len(contig_str) < L:
            continue
        this_df = pd.DataFrame(\
            columns=['val',seq_col,'left','right','ori','contig'])
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int) 
        this_df['left'] = poss + pos_offset
        this_df['right']  = poss + pos_offset + L - 1 
        #this_df[seq_col] = [contig_str[i:(i+L)] for i in poss]
        this_df[seq_col] = fast.seq2sitelist(contig_str,L)  #Cython
        this_df['ori'] = '+'
        this_df['contig'] = contig_name
        this_df['val'] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df,this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well
        if seqtype=='dna':
            #this_df[seq_col] = [qc.rc(s) for s in this_df[seq_col]]
            this_df[seq_col] = fast.seq2sitelist(contig_str,L,rc=True)  #Cython
            this_df['ori'] = '-'
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df,this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by='val', ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True,inplace=True)

        # Crop list at numsites
        if sitelist_df.shape[0]>numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            print '.',
            sys.stdout.flush()

    if verbose:
        print ''
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0]==0:
        raise SortSeqError(\
            'No full-length sites found within provided contigs.')

    sitelist_df = qc.validate_sitelist(sitelist_df,fix=True)
    return sitelist_df