Пример #1
0
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character frequencies (0.0 to 1.0) at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe containing counts for each nucleotide/amino acid character at each position. 
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Compute counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Create columns for profile_freqs table
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c,'ct_')]
    freq_cols = ['freq_'+c.split('_')[1] for c in ct_cols]

    # Compute frequencies from counts
    freq_df = counts_df[ct_cols].div(counts_df['ct'], axis=0)
    freq_df.columns = freq_cols
    freq_df['pos'] = counts_df['pos']

    # Validate as counts dataframe
    freq_df = qc.validate_profile_freq(freq_df,fix=True)
    return freq_df
Пример #2
0
def main(filelist_df,tags_df=None,indir='./',seq_type=None):
    """ Merges datasets listed in the filelist_df dataframe
    """

    # Validate filelist
    qc.validate_filelist(filelist_df)

    # Read datasets into dictionary indexed by bin number
    dataset_df_dict = {}
    for item in filelist_df.iterrows():
        # Autodetect fasta, fastq, or text file based on file extension
        fn = indir+item[1]['file']
        b = item[1]['bin']
        if re.search(fasta_filename_patterns,fn):
            df = io.load_dataset(fn,file_type='fasta',seq_type=seq_type)
        elif re.search(fastq_filename_patterns,fn):
            df = io.load_dataset(fn,file_type='fastq',seq_type=seq_type)
        else:
            df = io.load_dataset(fn,file_type='text',seq_type=seq_type)
        dataset_df_dict[b] = df

    # Merge datasets into one
    out_df = merge_datasets(dataset_df_dict)

    # Add seqs if given tags_df
    if not tags_df is None:
        qc.validate_tagkey(tags_df)
        tag_col = 'tag'

        # Test to make sure all tags in dataset are a subset of tags
        data_tags = set(out_df[tag_col])
        all_tags = set(tags_df[tag_col])
        if not (data_tags <= all_tags):
            sys.stderr.write('Some tags probably could not be identified.')

        # Get name of seq column       
        seq_cols = qc.get_cols_from_df(tags_df, 'seqs')
        if not len(seq_cols)==1:
            raise SortSeqError('Multiple seq columns; exaclty 1 required.')
        seq_col = seq_cols[0]

        # Set tag to be index column of dataframe
        tags_df = tags_df.set_index(tag_col)

        # Add seqs corresponding to each tag
        tags = out_df[tag_col]
        seqs = tags_df[seq_col][tags].values
        if not all([type(x)==str for x in seqs]):
            raise SortSeqError('Some looked-up seqs are not strings.')
        out_df[seq_col] = tags_df[seq_col][tags].values

    qc.validate_dataset(out_df)
    return out_df
Пример #3
0
def main(dataset_df, bin=None, start=0, end=None, err=False):
    """
    Computes the mutation rate (0.0 to 1.0) at each position. Mutation rate is defined as 1.0 minus the maximum character frequency at a position. Errors are estimated using bionomial uncertainty

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe containing results. 
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Compute counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Create columns for profile_freqs table
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c, "ct_")]

    # Record positions in new dataframe
    mut_df = counts_df[["pos"]].copy()

    # Compute mutation rate across counts
    max_ct = counts_df[ct_cols].max(axis=1)
    sum_ct = counts_df[ct_cols].sum(axis=1)
    mut = 1.0 - (max_ct / sum_ct)
    mut_df["mut"] = mut

    # Computation of error rate is optional
    if err:
        mut_err = np.sqrt(mut * (1.0 - mut) / sum_ct)
        mut_df["mut_err"] = mut_err

    # Figure out which alphabet the cts dataframe specifies
    alphabet = "".join([c.split("_")[1] for c in ct_cols])
    seqtype = qc.alphabet_to_seqtype_dict[alphabet]
    wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

    # Compute WT base at each position
    mut_df[wt_col] = "X"
    for col in ct_cols:
        indices = (counts_df[col] == max_ct).values
        mut_df.loc[indices, wt_col] = col.split("_")[1]

    # Validate as counts dataframe
    mut_df = qc.validate_profile_mut(mut_df, fix=True)
    return mut_df
Пример #4
0
    def test_preprocess(self):
        """ Test the ability of mpathic.preprocess to collate data in multiple sequence files
        """

        print '\nIn test_preprocess...'
        file_names = glob.glob(self.input_dir + 'files_*.txt')

        # Make sure there are files to test
        self.assertTrue(len(file_names) > 0)

        for file_name in file_names:
            print '\t%s =' % file_name,
            description = file_name.split('_')[-1].split('.')[0]

            # If fasta or fastq, assume dna
            if ('fasta' in file_name) or ('fastq' in file_name):
                seq_type = 'dna'
            else:
                seq_type = None

            executable = lambda: preprocess.main(io.load_filelist(file_name),
                                                 indir=self.input_dir,
                                                 seq_type=seq_type)

            # If _good_, then preprocess.main should produce a valid df
            if ('_good' in file_name) or ('_fix' in file_name):
                try:
                    df = executable()
                    qc.validate_dataset(df)
                    out_file = self.output_dir + 'dataset_%s.txt' % description
                    io.write(df, out_file)  # Test write
                    io.load_dataset(out_file)  # Test loading
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If _bad, then preprocess.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError, executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Пример #5
0
def main(dataset_df,model_df,left=None,right=None):

    # Validate dataframes
    qc.validate_dataset(dataset_df)
    qc.validate_model(model_df)

    # Detect model type based on columns
    seqtype, modeltype = qc.get_model_type(model_df)
    seqcol = qc.seqtype_to_seqcolname_dict[seqtype]

    # Set start and end  based on left or right
    if not ((left is None) or (right is None)):
        raise SortSeqError('Cannot set both left and right at same time.')
    if not (left is None):
        start = left
        end = start + model_df.shape[0] + (1 if modeltype=='NBR' else 0)
    elif not (right is None):
        end = right 
        start = end - model_df.shape[0] - (1 if modeltype=='NBR' else 0)
    else:
        start = model_df['pos'].values[0]
        end = model_df['pos'].values[-1] + (2 if modeltype=='NBR' else 1)
    assert start < end 

    # Validate start and end positions
    seq_length = len(dataset_df[seqcol][0])
    if start < 0:
        raise SortSeqError('Invalid start=%d'%start)
    if end > seq_length:
        raise SortSeqError('Invalid end=%d for seq_length=%d'%(end,seq_length))

    #select target sequence region
    out_df = dataset_df.copy()
    out_df.loc[:,'seq'] = out_df.loc[:,'seq'].str.slice(start,end)

    #Create model object of correct type
    if modeltype == 'MAT':
        mymodel = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        mymodel = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s'%modeltype)
 
    # Compute values
    out_df['val'] = mymodel.evaluate(out_df)

    # Validate dataframe and return
    return qc.validate_dataset(out_df,fix=True)
Пример #6
0
def wrapper(args):

    try:
        npar = args.noiseparam.strip("[").strip("]").split(",")
    except:
        npar = []
    nbins = args.nbins
    # Run funciton
    if args.i:
        df = pd.io.parsers.read_csv(args.i, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    else:
        df = pd.io.parsers.read_csv(sys.stdin, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError("Library already sorted!")
    model_df = io.load_model(args.model)
    output_df = main(df, model_df, args.noisemodel, npar, nbins, start=args.start, end=args.end)

    if args.out:
        outloc = open(args.out, "w")
    else:
        outloc = sys.stdout
    pd.set_option("max_colwidth", int(1e8))

    # Validate dataframe for writting
    output_df = qc.validate_dataset(output_df, fix=True)
    io.write(output_df, outloc)
Пример #7
0
def wrapper(args):
    T_LibCounts = args.totallibcounts
    T_mRNACounts = args.totalmRNAcounts
    if T_LibCounts <=0 or T_mRNACounts <= 0:
        raise SortSeqError('Counts must be greater than zero')
    model_df = io.load_model(args.model)
    if args.i:
        df = pd.io.parsers.read_csv(args.i,delim_whitespace=True)
    else:
        df = pd.io.parsers.read_csv(sys.stdin,delim_whitespace=True)
    #make sure the library is not already sorted
    if len(utils.get_column_headers(df)) > 0:
         raise SortSeqError('Library already sorted!')
    header = df.columns
    libcounts,expcounts = main(df,model_df,T_LibCounts,T_mRNACounts,start=args.start,end=args.end)
    #add these counts to input dataframe
    lc = pd.Series(libcounts,name='ct_0')
    ec = pd.Series(expcounts,name='ct_1')
    df['ct_0'] = lc
    df['ct_1'] = ec
    df['ct'] = df[['ct_0','ct_1']].sum(axis=1)
    if args.out:
        outloc = open(args.out,'w')
    else:
        outloc = sys.stdout
    pd.set_option('max_colwidth',int(1e8))

    # Validate dataframe for writting
    df = qc.validate_dataset(df,fix=True)
    io.write(df,outloc)
Пример #8
0
    def test_preprocess(self):
        """ Test the ability of mpathic.preprocess to collate data in multiple sequence files
        """

        print '\nIn test_preprocess...'
        file_names = glob.glob(self.input_dir+'files_*.txt')

        # Make sure there are files to test
        self.assertTrue(len(file_names)>0)

        for file_name in file_names:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]

            # If fasta or fastq, assume dna
            if ('fasta' in file_name) or ('fastq' in file_name):
                seq_type = 'dna'
            else:
                seq_type = None

            executable = lambda: preprocess.main(io.load_filelist(file_name),indir=self.input_dir, seq_type=seq_type)

            # If _good_, then preprocess.main should produce a valid df
            if ('_good' in file_name) or ('_fix' in file_name):
                try:
                    df = executable()
                    qc.validate_dataset(df)
                    out_file = self.output_dir+'dataset_%s.txt'%description
                    io.write(df,out_file)       # Test write
                    io.load_dataset(out_file)   # Test loading
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If _bad, then preprocess.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Пример #9
0
def merge_datasets(dataset_df_dict):
    """
    Merges multiple datasets into one. Data from disparate files is merged via values in 'tag', seq', 'seq_rna', or 'seq_pro' columns (in order of preference, chosen according to availability). Each value in the 'ct' column of each dataset is recorded in the 'ct_[bin]' column of the final dataset. A total 'ct' column is then computed, and rows in the final dataset are sorted in descending order according to this. 

    Arguments:
        dataset_df_dict (dict): Keys are bin numbers, values are dataset dataframes

    Returns:
        out_df (pd.DataFrame): A validated dataset dataframe
    """
    # Make sure datasets were loaded
    if not len(dataset_df_dict)>=1:
        raise SortSeqError('No datasets were loaded')

    # Determine index column. Must be same for all files
    df = dataset_df_dict.values()[0]
    if 'tag' in df.columns:
        index_col = 'tag'
    elif 'seq' in df.columns:
        index_col = 'seq'
    elif 'seq_rna' in df.columns:
        index_col = 'seq_rna'
    elif 'seq_pro' in df.columns:
        index_col = 'seq_pro'

    # Concatenate dataset dataframes
    out_df = pd.DataFrame()
    for b in dataset_df_dict.keys():
        df = dataset_df_dict[b]

        # Verify that dataframe has correct column
        if not index_col in df.columns:
            raise SortSeqError('\
                Dataframe does not contain index_col="%s"'%index_col)
        if not 'ct' in df.columns:
            raise SortSeqError('\
                Dataframe does not contain a "ct" column')

        # Delete "ct_X" columns
        for col in df.columns:
            if qc.is_col_type(col,'ct_'):
                del df[col]

        # Add bin number to name of counts column. 
        df = df.rename(columns={'ct':'ct_%d'%b})

        # Index dataset by index_col 
        df = df.groupby(index_col).sum()

        # Concatenate 
        out_df = pd.concat([out_df,df],axis=1)

    # Rename index as tag
    out_df.reset_index(inplace=True)
    out_df.rename(columns = {'index':index_col},inplace=True) 

    # Fill undefined counts with zero
    out_df.fillna(value=0,inplace=True)

    # Add 'ct' column, with proper counts
    out_df['ct'] = 0
    for col in out_df.columns:
        if qc.is_col_type(col,'ct_'):
            out_df['ct'] += out_df[col]

    # Sort by 'ct' column
    out_df.sort('ct',ascending=False,inplace=True) 
    out_df.reset_index(drop=True,inplace=True)

    # Validate out_df as dataset and return it
    out_df = qc.validate_dataset(out_df,fix=True)
    return out_df
Пример #10
0
def main(dataset_df, err=False, method="naive", pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits), at each position, between the character and the bin number. 

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        method (str): Which method to use to estimate mutual information

    Returns:
        info_df (pd.DataFrame): A dataframe containing results.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get number of bins
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "ct_")]
    if not len(bin_cols) >= 2:
        raise SortSeqError("Information profile requires at least 2 bins.")
    bins = [int(c.split("_")[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get number of characters
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "seqs")]
    if not len(seq_cols) == 1:
        raise SortSeqError("Must be only one seq column.")
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ["ct_" + a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers
    num_pos = len(dataset_df[seq_col][0])
    if not (0 <= start < num_pos):
        raise SortSeqError("Invalid start==%d, num_pos==%d" % (start, num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError("Invalid end==%d, num_pos==%d" % (end, num_pos))
    elif end <= start:
        raise SortSeqError("Invalid: start==%d >= end==%d" % (start, end))

    # Record positions in new dataframe
    counts_df = profile_ct.main(dataset_df)
    info_df = counts_df.loc[start : (end - 1), ["pos"]].copy()  # rows from start:end
    info_df["info"] = 0.0
    if err:
        info_df["info_err"] = 0.0

    # Fill in 3D array of counts
    ct_3d_array = np.zeros([end - start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:, :, i] = counts_df.loc[start : (end - 1), ct_cols].astype(float)

    # Compute mutual information for each position
    for i in range(end - start):  # i only from start:end

        # Get 2D counts
        nxy = ct_3d_array[i, :, :]
        assert len(nxy.shape) == 2

        # Compute mutual informaiton
        if err:
            mi, mi_err = info.estimate_mutualinfo(nxy, err=True, method=method, pseudocount=pseudocount)
            info_df.loc[i + start, "info"] = mi
            info_df.loc[i + start, "info_err"] = mi_err
        else:
            mi = info.estimate_mutualinfo(nxy, err=False, method=method, pseudocount=pseudocount)
            info_df.loc[i + start, "info"] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df, fix=True)
    return info_df
Пример #11
0
def main(wtseq=None, mutrate=0.10, numseq=10000,dicttype='dna',probarr=None,
        tags=False,tag_length=10):
    
    #generate sequence dictionary
    seq_dict,inv_dict = utils.choose_dict(dicttype)    
                
    mutrate = float(mutrate)
    if (mutrate < 0.0) or (mutrate > 1.0):
        raise SortSeqError('Invalid mutrate==%f'%mutrate)

    numseq = int(numseq)
    if (numseq <= 0):
        raise SortSeqError('numseq must be positive. Is %d'%numseq)

    tag_length = int(tag_length)
    if (tag_length <= 0):
        raise SortSeqErorr('tag_length must be positive. Is %d'%tag_length)

    if isinstance(probarr,np.ndarray):
        L = probarr.shape[1]
        #Generate bases according to provided probability matrix
        letarr = np.zeros([numseq,L])
        for z in range(L):
            letarr[:,z] = np.random.choice(
                range(len(seq_dict)),numseq,p=probarr[:,z]) 
    else:
        parr = []
        wtseq = wtseq.upper()
        L = len(wtseq)
        letarr = np.zeros([numseq,L])
        #Check to make sure the wtseq uses the correct bases.
        lin_seq_dict,lin_inv_dict = utils.choose_dict(dicttype,modeltype='MAT')
        def check_sequences(s):
            return set(s).issubset(lin_seq_dict)
        if not check_sequences(wtseq):
            raise SortSeqError(
                'wtseq can only contain bases in ' + str(lin_seq_dict.keys()))        
        #find wtseq array 
        wtarr = seq2arr(wtseq,seq_dict)
        mrate = mutrate/(len(seq_dict)-1) #prob of non wildtype
        #Generate sequences by mutating away from wildtype
        '''probabilities away from wildtype (0 = stays the same, a 3 for 
            example means a C becomes an A, a 1 means C-> G)'''
        parr = np.array(
            [1-(len(seq_dict)-1)*mrate] 
            + [mrate for i in range(len(seq_dict)-1)])  
        #Generate random movements from wtseq
        letarr = np.random.choice(
            range(len(seq_dict)),[numseq,len(wtseq)],p=parr) 
        #Find sequences
        letarr = np.mod(letarr + wtarr,len(seq_dict))
    seqs= []
    #Convert Back to letters
    for i in range(numseq):
        seqs.append(arr2seq(letarr[i,:],inv_dict)) 

    seq_col = qc.seqtype_to_seqcolname_dict[dicttype]
    seqs_df = pd.DataFrame(seqs, columns=[seq_col])

    # If simulating tags, each generated seq gets a unique tag
    if tags:
        tag_seq_dict,tag_inv_dict = utils.choose_dict('dna')
        tag_alphabet_list = tag_seq_dict.keys()

        # Make sure tag_length is long enough for the number of tags needed
        if len(tag_alphabet_list)**tag_length < 2*numseq:
            raise SortSeqError(\
                'tag_length=%d is too short for num_tags_needed=%d'%\
                (tag_length,numseq))

        # Generate a unique tag for each unique sequence
        tag_set = set([])
        while len(tag_set) < numseq:
            num_tags_left = numseq - len(tag_set)
            new_tags = [''.join(choice(tag_alphabet_list,size=tag_length)) \
                for i in range(num_tags_left)]
            tag_set = tag_set.union(new_tags)

        df = seqs_df.copy()
        df.loc[:,'ct'] = 1
        df.loc[:,'tag'] = list(tag_set)

    # If not simulating tags, list only unique seqs w/ corresponding counts
    else:
        seqs_counts = seqs_df[seq_col].value_counts()
        df = seqs_counts.reset_index()
        df.columns = [seq_col,'ct']

    # Convert into valid dataset dataframe and return
    return qc.validate_dataset(df,fix=True)
Пример #12
0
def main(dataset_df,
         bin=None,
         start=0,
         end=None,
         bins_df=None,
         pseudocounts=1,
         return_profile=False):
    """
    Computes character frequencies (0.0 to 1.0) at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe containing counts for each 
        nucleotide/amino acid character at each position. 
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = utils.choose_dict(dicttype)
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    #for each bin we need to find character frequency profile, then sum over all
    #bins to get activity.

    #first make sure we have activities of each bin:
    if not bins_df:
        bins = utils.get_column_headers(dataset_df)
        #in this case no activity was specified so just assume the activity
        #equals bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    #initialize dataframe for total counts in all bins
    output_ct_df = pd.DataFrame()
    #initialize dataframe for running activity calculation
    output_activity_df = pd.DataFrame()

    for i, b in enumerate(bins):
        bin_num = int(b.split('_')[-1])
        # Compute counts
        counts_df = profile_ct.main(dataset_df,
                                    bin=bin_num,
                                    start=start,
                                    end=end)

        # Create columns for profile_freqs table
        ct_cols = utils.get_column_headers(counts_df)
        #add_pseudocounts
        counts_df[ct_cols] = counts_df[ct_cols] + pseudocounts

        #add to all previous bin counts
        #print output_activity_df
        if i == 0:
            output_ct_df = counts_df[ct_cols]
            output_activity_df = counts_df[ct_cols] * activity[i]
        else:
            output_ct_df = output_ct_df + counts_df[ct_cols]
            output_activity_df = output_activity_df + counts_df[
                ct_cols] * activity[i]

    #now normalize by each character at each position, this is the activity
    #profile

    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    mut_rate = profile_mut.main(dataset_df, bin=bin)
    freq = profile_freq.main(dataset_df, bin=bin)
    freq_cols = [x for x in freq.columns if 'freq_' in x]
    #now normalize by the wt activity
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)

    wt_activity = np.transpose(wtarr) * (np.array(output_activity_df[ct_cols]))

    #sum this to get total
    wt_activity2 = wt_activity.sum(axis=1)
    delta_activity = output_activity_df.subtract(pd.Series(wt_activity2),
                                                 axis=0)
    if return_profile:
        #first find mutation rate according to formula in SI text
        profile_delta_activity = mut_rate['mut']*np.sum(
            (1-np.transpose(wtarr))*np.array(\
            freq[freq_cols])*np.array(delta_activity),axis=1)
        #format into dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = range(start,
                                 start + len(profile_delta_activity.index))
        output_df['mut_activity'] = profile_delta_activity
        return output_df
    else:
        #just add pos column and rename counts columns to activity columns
        output_df = pd.DataFrame(delta_activity)
        output_df.insert(0, 'pos',
                         range(start, start + len(delta_activity.index)))
        #reorder columns

        activity_col_dict = {x:'activity_' + x.split('_')[-1] \
            for x in delta_activity.columns if 'ct_' in x}
        output_df = output_df.rename(columns=activity_col_dict)
        return output_df
Пример #13
0
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character counts at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        counts_df (pd.DataFrame): A dataframe containing counts for each nucleotide/amino acid character at each position. 
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Retrieve type of sequence
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c,'seqs')]
    if not len(seq_cols)==1:
        raise SortSeqError('Dataset dataframe must have only one seq colum.')
    colname = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[colname]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    num_chars = len(alphabet)

    # Retrieve sequence length
    if not dataset_df.shape[0] > 1:
        raise SortSeqError('Dataset dataframe must have at least one row.')
    total_seq_length = len(dataset_df[colname].iloc[0])

    # Validate start and end
    if start<0:
        raise SortSeqError('start=%d is negative.'%start)
    elif start>=total_seq_length:
        raise SortSeqError('start=%d >= total_seq_length=%d'%\
            (start,total_seq_length))

    if end is None:
        end=total_seq_length
    elif end<=start:
        raise SortSeqError('end=%d <= start=%d.'%(end,start))
    elif end>total_seq_length:
        raise SortSeqError('end=%d > total_seq_length=%d'%\
            (start,total_seq_length))

    # Set positions
    poss = pd.Series(range(start,end),name='pos')
    num_poss = len(poss)

    # Retrieve counts
    if bin is None:
        ct_col = 'ct'
    else:
        ct_col = 'ct_%d'%bin
    if not ct_col in dataset_df.columns:
        raise SortSeqError('Column "%s" is not in columns=%s'%\
            (ct_col,str(dataset_df.columns)))
    counts = dataset_df[ct_col]

    # Compute counts profile
    counts_array = np.zeros([num_poss,num_chars])
    counts_cols = ['ct_'+a for a in alphabet]
    for i,pos in enumerate(range(start,end)):
        char_list = dataset_df[colname].str.slice(pos,pos+1)
        counts_array[i,:] = [np.sum(counts[char_list==a]) for a in alphabet]
    temp_df = pd.DataFrame(counts_array,columns=counts_cols)
    counts_df = pd.concat([poss,temp_df],axis=1)

    # Validate as counts dataframe
    counts_df = qc.validate_profile_ct(counts_df,fix=True)
    return counts_df