Example No. 1
 def _update_subtree_sec(self, item, data):
     """ Add a new subtree to the current QTreeWidgetItem. """
     if not isinstance(data, dict):
         for s in self.similar_items:
             sitem = QTreeWidgetItem([None, s])
             sitem.setToolTip(1, s)
             item.addChild(sitem)
         if not isinstance(data, self.noPrintTypes):
             for c in range(item.childCount()):
                 item.child(c).setCheckState(0, Qt.Unchecked)
                 self.checkableItems.append(item.child(c))
     else:
         for n, k in enumerate(realsorted(data.keys(), key=_lowercase)):
             item.addChild(QTreeWidgetItem([None, k]))
             child = item.child(n)
             if isinstance(data[k], dict):
                 self._update_subtree(child, data[k])
             else:
                 for s in self.similar_items:
                     sitem = QTreeWidgetItem([None, s])
                     sitem.setToolTip(0, s)
                     child.addChild(sitem)
                 if not isinstance(data[k], self.noPrintTypes):
                     for c in range(child.childCount()):
                         child.child(c).setCheckState(0, Qt.Unchecked)
                         self.checkableItems.append(child.child(c))
Example No. 2
    def _make_groups(self, trajectory_categories, sort_category):
        r"""Groups the sample ids in `self._metadata_map` by the values in
        `trajectory_categories`

        Creates `self._groups`, a dictionary keyed by category and values are
        dictionaries in which the keys represent the group name within the
        category and values are ordered lists of sample ids

        If `sort_category` is not None, the sample ids are sorted based on the
        values under this category in the metadata map. Otherwise, they are
        sorted using the sample id.

        Parameters
        ----------
        trajectory_categories : list of str
            A list of metadata categories to use to create the groups.
            Default: None, compute all of them
        sort_category : str or None
            The category from self._metadata_map to use to sort groups
        """
        # If sort_category is provided, we use the value of that category to
        # sort. Otherwise, we use the sample id.
        if sort_category:
            def sort_val(sid):
                return self._metadata_map[sort_category][sid]
        else:
            def sort_val(sid):
                return sid

        self._groups = defaultdict(dict)
        for cat in trajectory_categories:
            # Group samples by category
            gb = self._metadata_map.groupby(cat)
            for g, df in gb:
                self._groups[cat][g] = realsorted(df.index, key=sort_val)
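        # Illustrative outcome (hypothetical metadata, not from the original
        # tests): for a 'Treatment' category containing samples S1, S2 and S10,
        # realsorted keeps the ids in natural order, e.g.
        # self._groups == {'Treatment': {'Control': ['S1', 'S2', 'S10'], ...}}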
Example No. 3
    def _make_groups(self, trajectory_categories, sort_category):
        r"""Groups the sample ids in `self._metadata_map` by the values in
        `trajectory_categories`

        Creates `self._groups`, a dictionary keyed by category and values are
        dictionaries in which the keys represent the group name within the
        category and values are ordered lists of sample ids

        If `sort_category` is not None, the sample ids are sorted based on the
        values under this category in the metadata map. Otherwise, they are
        sorted using the sample id.

        Parameters
        ----------
        trajectory_categories : list of str
            A list of metadata categories to use to create the groups.
            Default: None, compute all of them
        sort_category : str or None
            The category from self._metadata_map to use to sort groups
        """
        # If sort_category is provided, we use the value of that category to
        # sort. Otherwise, we use the sample id.
        if sort_category:
            def sort_val(sid):
                return self._metadata_map[sort_category][sid]
        else:
            def sort_val(sid):
                return sid

        self._groups = defaultdict(dict)
        for cat in trajectory_categories:
            # Group samples by category
            gb = self._metadata_map.groupby(cat)
            for g, df in gb:
                self._groups[cat][g] = realsorted(df.index, key=sort_val)
Example No. 4
 def _update_subtree(self, item, data):
     """ Add a new subtree to the current QTreeWidgetItem. """
     for n, k in enumerate(realsorted(data.keys(), key=_lowercase)):
         item.addChild(QTreeWidgetItem([None, k]))
         child = item.child(n)
         if isinstance(data[k], dict):
             self._update_subtree(child, data[k])
         elif not isinstance(data[k], self.noPrintTypes):
             child.setCheckState(0, Qt.Unchecked)
             self.checkableItems.append(child)
Example No. 5
def generate_database_lines(data):
    lines = []
    for dat in data:
        s = '%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (
            dat['cid'], dat['CAS'], dat['formula'], round(dat['MW'], 9),
            dat['smiles'], dat['inchi'], dat['inchikey'], dat['name'])
        s += '\t'.join(dat['synonyms'])
        s += '\n'
        lines.append(s)
    lines = realsorted(lines)
    return lines
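# A hedged note on the final sort: each line begins with the numeric cid field,
# so realsorted orders the database lines by cid value (a line starting with
# '2\t...' comes before one starting with '10\t...'), which a plain
# lexicographic sorted() would not guarantee.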
Example No. 6
def calc_summary_stats(output_info, cutoff):
    """Calculate average read depth, number of peaks, standard
    deviation and report each peak for each msi range in the bed file
    """
    sites={}
    #msi_info is all loci for this chromosome
    for name, info in output_info.items():
        #Set total average depth for this site
        if info['total_depth'] != 0 and info['total_sites'] != 0:
            #Use ceil to round up
            average_depth=ceil(float(info['total_depth'])/info['total_sites'])
            #Turn to int
            average_depth=int(average_depth)
        else:
            average_depth=0
        if average_depth != 0 and info['total_mutant_depth'] <= average_depth:
            wildtype_ave_depth=int(average_depth-info['total_mutant_depth'])
            wildtype_fraction=float(wildtype_ave_depth)/average_depth
        else:
            wildtype_fraction, wildtype_ave_depth=0,0
            sites[0]='0:0:0'
        if info['indels']:
            highest_frac = calc_highest_peak(info['indels'], wildtype_fraction, average_depth)
            sites=calc_wildtype(list(info['indels'].keys()), wildtype_ave_depth, wildtype_fraction, highest_frac)
            num_peaks, peaks=calc_number_peaks(info['indels'], sites, highest_frac, cutoff)
            stdev=calc_std_peaks(peaks.values())
            #Sort the peak list naturally (-3,-2,-1,0,1,2,3)
            peak_list=(" ").join(str(x) for x in natsort.realsorted(peaks.values()))
        elif average_depth != 0:
            #if there are no indels, but there are wild type reads
            wildtype_fraction=1
            sites[0]=(":").join(['0', str(float(wildtype_fraction)), str(wildtype_ave_depth)])
            num_peaks=1
            peak_list=sites[0]
            stdev=0
        else:
            #if there are no reads at this site
            wildtype_fraction=0
            sites[0]=(":").join(['0', str(float(wildtype_fraction)), str(wildtype_ave_depth)])
            num_peaks=0
            peak_list=sites[0]
            stdev=0
            
        output_info[name]={'Name':info['Name'],
                           'Average_Depth':average_depth,
                           'Standard_Deviation':stdev,
                           'Number_of_Peaks':num_peaks,
                           'IndelLength:AlleleFraction:SupportingCalls':peak_list}
    return output_info
Example No. 7
def rsort(strings_to_sort):
  strs = []
  for arg in strings_to_sort:
    if '*' in arg:
      res = glob.glob(arg)
    else:
      # TODO: check if file
      res = arg
    strs.append(res)
  strs = natsort.realsorted(flatten_list(strs))

  for string in strs:
    print(string)
  
  return strs
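# Hypothetical call (assumes flatten_list is defined elsewhere in this module):
# rsort(['run2.log', 'run10.log']) prints run2.log before run10.log, because
# realsorted compares the embedded numbers by value rather than character order.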
Example No. 8
 def _dlg_combine(self):
     """ Open a dialog to combine the dataset. """
     trace = self._get_obj_trace(self.datatree.current_item())
     data = self.get(trace)
     keys = realsorted(data, key=lambda x: x.lower())
     d0 = data.get(keys[0])
     npt = self.noPrintTypes + (dict,)
     # Search for occurrences of arrays with the same shape as the first one
     if isinstance(d0, npt):
         n_keys = [k for k in keys if isinstance(data.get(k), type(d0))]
         data_shape = ()
     else:
         n_keys = []
         for k in keys:
             if not isinstance(data.get(k), npt):
                 if data.get(k).shape == d0.shape:
                     n_keys.append(k)
         data_shape = data.get(n_keys[0]).shape
     # Show a dialog asking if the conversion should be done.
     if len(data_shape) > 1:
         txt = "Combine the first {} datasets of {} element(s) into one?"
         txt = txt.format(len(n_keys), data_shape)
     else:
         txt = "Combine {} elements into 1D vector?".format(len(n_keys))
     btns = (QMessageBox.Yes|QMessageBox.No)
     msg = QMessageBox(QMessageBox.Information, "Info", txt, buttons=btns)
     msg.setDefaultButton(QMessageBox.Yes)
     if msg.exec_() != QMessageBox.Yes:
         return
     # Add 'combined' if not all values are combined or it is a topLevelItem
     if len(n_keys) != len(keys) or len(trace) == 1:
         trace.append('combined')
     # Perform the combination
     try:
         self.set_data(trace, np.array([data.get(k) for k in n_keys]))
     except ValueError:
         # For h5py dictionaries
         self.set_data(trace, np.array([data.get(k)[()] for k in n_keys]))
     # Remove the combined data
     if len(n_keys) != len(keys) and len(data_shape) > 1:
         _ = [self.get(trace[:-1]).pop(key) for key in n_keys]
     # Put new dimension at the end and remove singleton dimensions.
     self.set_data(trace, np.moveaxis(self.get(trace), 0, -1).squeeze())
     self.datatree.update_tree()
Example No. 9
    def __sort_data_by_label_order(self):
        '''
        Uses the natsort package's realsorted to sort strings such that
        substrings that are numeric values are sorted in numeric order,
        and non-numeric substrings are sorted lexicographically.
        '''

        ls_natsorted_labels = realsorted(self.current_data['labels'])

        l_idx_labels_sorted = [self.current_data['labels'].index(s_label)
                               for s_label in ls_natsorted_labels]

        ls_sorted_labels = [
            self.current_data['labels'][idx] for idx in l_idx_labels_sorted
        ]
        ls_sorted_value_lists = [
            self.current_data['value_lists'][idx]
            for idx in l_idx_labels_sorted
        ]

        self.current_data['labels'] = ls_sorted_labels
        self.current_data['value_lists'] = ls_sorted_value_lists
        return
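        # A minimal sketch (hypothetical labels, not from the original data) of
        # the ordering described in the docstring: realsorted compares numeric
        # substrings by value, signs included, unlike a plain lexicographic sort.
        # realsorted(['run10', 'run2.5', 'run-3'])  ->  ['run-3', 'run2.5', 'run10']
        # sorted(['run10', 'run2.5', 'run-3'])      ->  ['run-3', 'run10', 'run2.5']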
Example No. 10
def get_articles_files():
    # recursive=True is needed for the "**" pattern to match nested directories
    return realsorted(glob.glob("{}/**/*.md".format(INPUT_DIRECTORY), recursive=True))
Example No. 11
"""
Spyder Editor

This is a temporary script file.
"""
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
import re
import matplotlib.ticker as ticker
import glob
import natsort

f1 = glob.glob('u*.txt')
f1 = natsort.realsorted(f1)
f2 = glob.glob('head&tail.txt')
f2 = natsort.realsorted(f2)


def readData(filename):
    d = pd.read_csv(filename, delim_whitespace=True, header=None)
    d = np.asarray(d)
    return d


dSlug = readData(f2[0])
lSlug = dSlug[:, 1] - dSlug[:, 0]
meanSlug = np.mean(lSlug)
stdSlug = np.std(lSlug)
t = np.arange(len(lSlug))
Example No. 12
def test_realsorted_is_identical_to_natsorted_with_real_alg(float_list):
    assert realsorted(float_list) == natsorted(float_list, alg=ns.REAL)
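# (`float_list` is presumably supplied by a hypothesis `@given` strategy in the
#  original test module; the decorator is not shown in this excerpt.)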
Example No. 13
from natsort import natsorted, ns

a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana']
natsorted(a)
natsorted(a, alg=ns.IGNORECASE)
natsorted(a, alg=ns.LOWERCASEFIRST)
natsorted(a, alg=ns.GROUPLETTERS)
natsorted(a, alg=ns.G | ns.LF)

a = ['a50', 'a51.', 'a+50.4', 'a5.034e1', 'a+50.300']
natsorted(a, alg=ns.FLOAT)
natsorted(a, alg=ns.FLOAT | ns.SIGNED)
natsorted(a, alg=ns.FLOAT | ns.SIGNED | ns.NOEXP)
natsorted(a, alg=ns.REAL)
from natsort import realsorted
realsorted(a)

from operator import attrgetter, itemgetter
a = [['a', 'num4'], ['b', 'num8'], ['c', 'num2']]
natsorted(a, key=itemgetter(1))


class Foo:
    def __init__(self, bar):
        self.bar = bar

    def __repr__(self):
        return "Foo('{0}')".format(self.bar)


b = [Foo('num3'), Foo('num5'), Foo('num2')]
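# The snippet defines `b` and imports attrgetter but stops short of using them;
# presumably (an assumption, mirroring the itemgetter call above) it continues
# by sorting the objects on their `bar` attribute:
natsorted(b, key=attrgetter('bar'))  # [Foo('num2'), Foo('num3'), Foo('num5')]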
Example No. 14
def df_read_filecols(df, filecols, *, order_sites=True):
    """Merges data frame with entries read from files.

    Designed to expand data frame listing CSV files with site or
    mutation-level selection information into a data frame listing
    the information in these CSV files.

    Args:
        `df` (pandas DataFrame)
            Each row gives files and associated information.
        `filecols` (list)
            List of columns in `df` that give filenames of CSV files
            to add to data frame. These CSV files cannot have column
            names already in `df`.
        `order_sites` (bool)
            Expect a `site` column, make it naturally sorted
            categorical variable, and add `isite` column
            that numbers sites 0, 1, ...

    Returns:
        A data frame where the entries in the files are now
        read as columns.

    >>> tf = tempfile.NamedTemporaryFile
    >>> with tf(mode='w') as sitediffsel1, tf(mode='w') as sitediffsel2, \\
    ...      tf(mode='w') as mutdiffsel1,  tf(mode='w') as mutdiffsel2:
    ...
    ...     # first sitediffsel file
    ...     _ = sitediffsel1.write('site,sitediffsel\\n'
    ...                            '1,3.2\\n'
    ...                            '-1,2.3\\n'
    ...                            '(HA2)1,0.1')
    ...     sitediffsel1.flush()
    ...
    ...     # first mutdiffsel file
    ...     _ = mutdiffsel1.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                            '-1,A,C,-0.7\\n'
    ...                            '-1,A,G,3.0\\n'
    ...                            '1,C,A,1.2\\n'
    ...                            '1,C,G,2.0\\n'
    ...                            '(HA2)1,C,A,0.0\\n'
    ...                            '(HA2)1,C,G,0.1')
    ...     mutdiffsel1.flush()
    ...
    ...     # second sitediffsel file
    ...     _ = sitediffsel2.write('site,sitediffsel\\n'
    ...                            '(HA2)1,9.1\\n'
    ...                            '1,1.2\\n'
    ...                            '-1,0.3\\n')
    ...     sitediffsel2.flush()
    ...
    ...     # second mutdiffsel file
    ...     _ = mutdiffsel2.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                            '-1,A,C,-0.2\\n'
    ...                            '-1,A,G,0.5\\n'
    ...                            '1,C,A,1.1\\n'
    ...                            '1,C,G,0.1\\n'
    ...                            '(HA2)1,C,A,9.0\\n'
    ...                            '(HA2)1,C,G,0.1')
    ...     mutdiffsel2.flush()
    ...
    ...     # data frame with files as columns
    ...     df = pandas.DataFrame({
    ...          'name':['sample_1', 'sample_2'],
    ...          'serum':['serum_1', 'serum_1'],
    ...          'sitediffsel_file':[sitediffsel1.name, sitediffsel2.name],
    ...          'mutdiffsel_file':[mutdiffsel1.name, mutdiffsel2.name]
    ...          })
    ...
    ...     # call df_read_filecols
    ...     (df_read_filecols(df, ['sitediffsel_file', 'mutdiffsel_file'])
    ...      .drop(columns=['sitediffsel_file', 'mutdiffsel_file']))
            name    serum    site  sitediffsel wildtype mutation  mutdiffsel  isite
    0   sample_1  serum_1       1          3.2        C        A         1.2      1
    1   sample_1  serum_1       1          3.2        C        G         2.0      1
    2   sample_1  serum_1      -1          2.3        A        C        -0.7      0
    3   sample_1  serum_1      -1          2.3        A        G         3.0      0
    4   sample_1  serum_1  (HA2)1          0.1        C        A         0.0      2
    5   sample_1  serum_1  (HA2)1          0.1        C        G         0.1      2
    6   sample_2  serum_1  (HA2)1          9.1        C        A         9.0      2
    7   sample_2  serum_1  (HA2)1          9.1        C        G         0.1      2
    8   sample_2  serum_1       1          1.2        C        A         1.1      1
    9   sample_2  serum_1       1          1.2        C        G         0.1      1
    10  sample_2  serum_1      -1          0.3        A        C        -0.2      0
    11  sample_2  serum_1      -1          0.3        A        G         0.5      0
    """
    if not len(df):
        raise ValueError('`df` has no rows')

    df_cols = set(df.columns)
    if 'dummy' in df_cols:
        raise ValueError('`df` has column named "dummy"')

    if not (set(filecols) <= df_cols):
        raise ValueError('`df` does not have all the `filecol` columns')

    df_filecols = []
    for row in df.iterrows():
        # get data frame of just row, with a dummy column for merging
        row_df = row[1].to_frame().transpose().assign(dummy=1)
        for col in filecols:
            filename = row_df.at[row[0], col]
            file_df = pandas.read_csv(filename).assign(dummy=1)
            if order_sites and 'site' not in file_df.columns:
                raise ValueError(f"no `site` column in {filename}")
            sharedcols = set(file_df.columns).intersection(df_cols)
            if sharedcols:
                raise ValueError(f"`df` and {filename} share columns "
                                 f"{sharedcols}")
            row_df = row_df.merge(file_df)
        df_filecols.append(row_df)

    df_filecols = (pandas.concat(df_filecols,
                                 ignore_index=True).drop('dummy',
                                                         axis='columns'))

    if order_sites:
        sites = natsort.realsorted(df_filecols['site'].unique())
        df_filecols = (df_filecols.assign(
            site=lambda x: pandas.Categorical(x['site'], sites, ordered=True),
            isite=lambda x: x['site'].cat.codes))

    return df_filecols
Example No. 15
    u1 = u1[s ^ 1::2]
    w1 = w1[s ^ 1::2]
    return u0, w0, u1, w1


def get_token(f, token):
    return f.split(token)[-1].split('_')[0]


def main(f):
    if '_F' in f:
        om, al = [get_token(f, token) for token in ['B', 'F']]
    else:
        om, al = [get_token(f, token) for token in ['_o', '_a']]
    u0, w0, u1, w1 = get_data(f)
    figure(1, figsize=(8, 8))
    clf()
    plot(u0, w0, 'o', mec='none', mfc='k', ms=1)
    plot(u1, w1, 'o', mec='none', mfc='r', ms=1)
    xlabel(r'$u$')
    ylabel(r'$w$')
    title(r'$(\omega,\alpha)=$' + f'({om:s},{al:s})', y=1.10)
    savefig(f'fig/space_time_monitor/o{om:s}_a{al:s}.png')
    print(f'plotted: {f:s}')
    return None


G = natsort.realsorted(glob.glob(sys.argv[1]))
for g in G:
    main(g)
Example No. 16
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from gesture.config import *
from natsort import natsorted,realsorted
from common_plot import barplot_annotate_brackets

top5_sid=[4,10,13,29,41]
depths=[1,2,3,4,5,6]
data_dir = '/Users/long/Documents/data/gesture/'# temp data dir
training_result_dir=data_dir+'training_result/dl_depth/'

accuracy_all=[]
for sid in top5_sid:
    sid_acc=[]
    tmp = realsorted([pth for pth in Path(training_result_dir+str(sid)).iterdir() if pth.suffix == '.npy' and 'changeDepth' in str(pth)])
    for depth in depths:
        result = np.load(str(tmp[depth-1]), allow_pickle=True).item()
        sid_acc.append(result['test_acc'])
    accuracy_all.append(sid_acc)
# perform best at depth=1

from matplotlib.patches import Patch
colors=['orangered','yellow', 'gold','orange','springgreen','aquamarine']#,'skyblue']
depth_label=[(str(i)+' layer') if i==1 else (str(i)+' layers') for i in depths]
cmap = dict(zip([str(i) for i in depth_label], colors))
patches = [Patch(color=v, label=k) for k, v in cmap.items()]
fig,ax=plt.subplots()
x=[1,2,3,4,5,6] # 6 depths
#BUG
accuracy_all_bug=np.asarray(accuracy_all)
Example No. 17
def test_realsorted_returns_results_identical_to_natsorted_with_REAL():
    a = ['a50', 'a51.', 'a50.31', 'a-50', 'a50.4', 'a5.034e1', 'a50.300']
    assert realsorted(a) == natsorted(a, alg=ns.REAL)
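# Note: realsorted(x) is a convenience wrapper equivalent to
# natsorted(x, alg=ns.REAL), where ns.REAL == ns.FLOAT | ns.SIGNED, which is
# exactly the equivalence this test asserts.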
Example No. 18
def test_realsorted_is_identical_to_natsorted_with_real_alg(float_list):
    assert realsorted(float_list) == natsorted(float_list, alg=ns.REAL)
Example No. 19
def test_realsorted_returns_results_identical_to_natsorted_with_REAL():
    a = ['a50', 'a51.', 'a50.31', 'a-50', 'a50.4', 'a5.034e1', 'a50.300']
    assert realsorted(a) == natsorted(a, alg=ns.REAL)
Example No. 20
def comparePrefs(prefs1,
                 prefs2,
                 sites=None,
                 distmetric='half_sum_abs_diff',
                 chars=dms_tools2.AAS):
    """Compute error-corrected distance between two sets of preferences.
  
    Designed for the situation in which you have made replicate
    measurements of the amino-acid preferences for two protein
    homologs, and want to estimate the difference in preferences
    at each site while correcting for experimental error as
    quantified by the replicate measurements.

    The *distance* between each pair of replicates at each
    site is computed using `prefDistance` with `distmetric`.
    We then compute the RMS distance between all pairs
    for the same homolog to get `RMSDwithin`, and all pairs
    of different homologs to get `RMSDbetween`. We calculate
    `RMSDcorrected` as `RMSDbetween - RMSDwithin`.

    We also compute the mean (across replicates) preference
    for homolog 1 minus the mean for homolog 2, scaled so
    that the total height in each direction equals `RMSDcorrected`.
    These values are an error-corrected estimate of the difference
    in preference for each amino acid between homologs.

    Args:
        `prefs1` (list)
            Files giving replicate measurements of preferences for
            homolog 1 in the CSV format returned by ``dms2_prefs``.
        `prefs2` (list)
            Files giving measurements for homolog 2.
        `sites` (list or `None`)
            If `None`, compare all sites shared between the two
            homolog preference sites. Otherwise should be a list
            of the sites to compare.
        `distmetric` (string)
            Distance metric to use. Can be any valid option for
            the argument of the same name to `prefDistance`.
        `chars` (list)
            List of characters for which we analyze the preferences.
            For instance, all 20 amino acids.
    
    Returns:
        A `pandas.DataFrame` giving the distances at each site,
        as well as the replicate mean difference between 
        preferences for homolog 1 minus homolog 2 for each amino
        acid at each site scaled to height of `RMSDcorrected`
        in each direction.

    Example calculation for two character sequences and two
    replicates for each homolog:

    >>> TF = functools.partial(tempfile.NamedTemporaryFile, mode='w')
    >>> with TF() as p1_1, TF() as p1_2, TF() as p2_1, TF() as p2_2:
    ...     n = p1_1.write('''site,    A,    C
    ...                          1,  0.8,  0.2
    ...                          2,  0.3,  0.7'''.replace(' ', ''))
    ...     p1_1.flush()
    ...     n = p1_2.write('''site,    A,    C
    ...                          1,  0.8,  0.2
    ...                          2,  0.4,  0.6'''.replace(' ', ''))
    ...     p1_2.flush()
    ...     n = p2_1.write('''site,    A,    C
    ...                          2,  0.4,  0.6
    ...                          1,  0.6,  0.4'''.replace(' ', ''))
    ...     p2_1.flush()
    ...     n = p2_2.write('''site,    A,    C
    ...                          1,  0.6,  0.4
    ...                         1a,  0.4,  0.6
    ...                          2,  0.5,  0.5'''.replace(' ', ''))
    ...     p2_2.flush()
    ...     diffs = comparePrefs([p1_1.name, p1_2.name],
    ...                          [p2_1.name, p2_2.name],
    ...                          chars=['A', 'C'])
    >>> print(diffs.to_string(float_format=lambda x: '{0:.2f}'.format(x)))
      site  RMSDcorrected  RMSDbetween  RMSDwithin     A     C
    0    1           0.20         0.20        0.00  0.20 -0.20
    1    2           0.02         0.12        0.10 -0.02  0.02
    """

    assert len(prefs1) > 1, "provide prefs for multiple replicates"
    assert len(prefs2) > 1, "provide prefs for multiple replicates"

    # read in all preferences
    prefs = []
    expectcols = ['site'] + chars
    for (homolog, homologprefs) in enumerate([prefs1, prefs2], 1):
        for (rep, repprefs) in enumerate(homologprefs, 1):
            iprefs = pandas.read_csv(repprefs)
            iprefs['site'] = iprefs['site'].astype('str')
            assert set(iprefs.columns) <= set(expectcols), \
                    "{0} missing expected columns".format(repprefs)
            prefs.append(iprefs[expectcols].assign(homolog=homolog,
                                                   replicate=rep))

    # get only desired sites
    if sites is None:
        # use sites shared among all preference sets
        sites = list(set.intersection(*[set(p['site']) for p in prefs]))
    assert isinstance(sites, list) and len(sites), "no `sites` to analyze"
    sites = natsort.realsorted(list(map(str, sites)))

    # merge preferences for desired sites
    assert all([set(p['site']) >= set(sites) for p in prefs]),\
            "not all prefs have all sites"
    prefs = [p[p['site'].isin(sites)] for p in prefs]
    prefs = pandas.concat(prefs)
    prefs['site'] = pandas.Categorical(prefs['site'], sites)
    prefs = prefs.sort_values('site').set_index('site')

    # compute RMSDs
    dists = {'within': [], 'between': []}
    for ((hi, repi), (hj, repj)) in itertools.combinations(
        [(h, rep) for h in [1, 2]
         for rep in prefs.query('homolog == @h')['replicate'].unique()], 2):
        prefsi = (prefs.query('homolog == @hi and replicate == @repi')[chars])
        prefsj = (prefs.query('homolog == @hj and replicate == @repj')[chars])
        assert prefsi.index.equals(prefsj.index)
        disttype = {True: 'within', False: 'between'}[hi == hj]
        dists[disttype].append([
            prefDistance(prefsi.loc[r], prefsj.loc[r], distmetric)
            for r in sites
        ])
    for (disttype, dist) in dists.items():
        distseries = (pandas.DataFrame(dist, columns=sites).transpose().apply(
            computeRMS, axis=1))
        prefs['RMSD' + disttype] = distseries
    prefs['RMSDcorrected'] = prefs['RMSDbetween'] - prefs['RMSDwithin']
    rmsds = ['RMSDcorrected', 'RMSDbetween', 'RMSDwithin']

    # compute RMSDcorrected-scaled diff between homologs for each pref
    prefmeans = {}
    for homolog in [1, 2]:
        prefmeans[homolog] = (prefs.reset_index().query(
            'homolog == @homolog').groupby('site')[chars].mean())
    prefs = prefs[~prefs.index.duplicated(keep='first')][rmsds]
    dprefs = prefmeans[1] - prefmeans[2]
    # normalize so sums to one in each direction
    dprefs = dprefs.div(dprefs.abs().sum(axis=1), axis=0).mul(2).fillna(0)
    dprefs = dprefs.mul(prefs['RMSDcorrected'], axis=0)
    prefs = prefs.join(dprefs)

    return prefs[rmsds + chars].reset_index()
Example No. 21
def graphInterval(*,
                  data: dict,
                  title: str = None,
                  xLabel: str = None,
                  yLabel: str = None,
                  gridLines: str = "",
                  groupNames: tuple = (),
                  colorIndex: int = None,
                  show: bool = False):

    _, ax = plt.subplots()

    if len(groupNames) > 1:
        groupNames = (":\n".join(reversed(groupNames))) + ":"
        plt.text(-.015,
                 -.02,
                 s=groupNames,
                 horizontalalignment="right",
                 verticalalignment="top",
                 transform=ax.transAxes)

    sortedData = natsort.realsorted(data.items(), key=lambda t: t[0])

    colorCatX = defaultdict(list)
    colorCatY = defaultdict(list)

    for key, value in sortedData:

        inverseKey = "\n".join(reversed(key.split("\n")))

        if colorIndex is not None:
            colorCatX[key.split("\n")[colorIndex]].append(inverseKey)
            colorCatY[key.split("\n")[colorIndex]].append(value[1])
        else:
            colorCatX[key.split("\n")[0]].append(inverseKey)
            colorCatY[key.split("\n")[0]].append(value[1])

        if key in data:

            if value[0] is not None:
                plt.scatter(inverseKey, value[0], marker="_", color="black")
            if value[2] is not None:
                plt.scatter(inverseKey, value[2], marker="_", color="black")
            if value[0] is not None and value[2] is not None:
                plt.plot([inverseKey] * 3, [value[0], value[1], value[2]],
                         linewidth=.85,
                         color="black")

    if colorIndex is not None:
        for key, _ in colorCatX.items():
            plt.scatter(colorCatX[key], colorCatY[key], label=key)
            plt.legend(loc="best").set_draggable(True)
    else:
        for key, _ in colorCatX.items():
            plt.scatter(colorCatX[key], colorCatY[key], color="blue")

    plt.title(title)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)

    if len(gridLines) == 1:
        plt.grid(which="major", axis=gridLines)
    elif gridLines == "xy":
        plt.grid(which="major", axis="both")

    plt.tight_layout()

    if show:
        plt.show()
    else:
        return plt.gcf()
Example No. 22
def get_paternNames(f):
    glob_str = pat_str = f
    G = glob.glob(pat_str)
    F = natsort.realsorted(G)
    return F
Example No. 23
def test_realsorted_returns_results_identical_to_natsorted_with_REAL():
    a = ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"]
    assert realsorted(a) == natsorted(a, alg=ns.REAL)
Example No. 24
def sort_lines(input_lines):
    return realsorted(input_lines)
Example No. 25
def readAsort(filename):
    f = glob.glob(filename)
    f = natsort.realsorted(f)
    return f
Example No. 26
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  8 16:35:35 2019

@author: c3216945
"""

import pyvista as pv
import numpy as np
import glob
import natsort

f = glob.glob('*.vtk')
f = natsort.realsorted(f)


def getSlugUz(filename, sampleSize):
    # read vtk as unstructured data
    data = pv.read(filename)
    aw = data.point_arrays['alpha.water']
    pts = data.points

    # get alpha.water on the middle line
    aw = aw[(np.abs(pts[:, 0]) <= 1e-4) & (np.abs(pts[:, 1]) <= 1e-4)]
    pts = pts[(np.abs(pts[:, 0]) <= 1e-4) & (np.abs(pts[:, 1]) <= 1e-4)]
    # keep index
    ind = np.arange(len(aw)).reshape(len(aw), 1)
    n1 = np.hstack((aw.reshape(len(aw), 1), pts))
    n1 = np.hstack((n1, ind))
    # sort based on z
    n1 = n1[n1[:, 3].argsort()]
Example No. 27
def test_realsorted_returns_results_identical_to_natsorted():
    a = ['a50', 'a51.', 'a50.31', 'a50.4', 'a5.034e1', 'a50.300']
    assert realsorted(a) == natsorted(a)
Example No. 28
def worker_user(params):
    """
    Worker for calculating metrics for one user.
    Uses only one core.
    """
    (user_idx, output_dir, test_batches, RQ_cap_adjust, for_epoch, verbose,
     only_rl, algo_feed, algo_frac, merge_sinks) = params

    save_dir = os.path.join(output_dir, save_dir_tmpl.format(user_idx))

    if verbose:
        print('Working on user_idx: {}'.format(user_idx))

    with open(os.path.join(save_dir, 'user_opt_dict.dill'), 'rb') as f:
        user_opt_dict = dill.load(f)

    one_user_data = user_data[user_idx]
    if merge_sinks:
        if verbose:
            print('Merged sinks!')
        one_user_data = RDU.merge_sinks(one_user_data)

    ret = {
        'user_idx': user_idx,
        'user_id': one_user_data['user_id'],
        'num_other_posts': one_user_data['num_other_posts'],
        'num_own_posts': one_user_data['num_user_events'],
        'num_followees': one_user_data['num_followees'],
        'duration': one_user_data['duration'],
        'num_followers': user_opt_dict['num_followers'],
        'N': user_opt_dict['N'],
        'reward_kind': user_opt_dict['trainer_opts_dict']['reward_kind'],
        'for_epoch': for_epoch,
        'num_batches': test_batches,
        'algo_feed': algo_feed,
    }

    window_start, eval_sim_opts = EB.make_real_data_batch_sim_opts(
        one_user_data=one_user_data,
        N=user_opt_dict['N'],
        seed=-1,
        is_test=True)
    ret['window_start'] = window_start
    ret['window_end'] = eval_sim_opts.end_time

    # The file names are of the form `*/tpprl.ckpt-<num>.meta`.
    # Hence, the number is interpreted as negative.
    # So to extract the last checkpoint, we do a sort by real-values and
    # pick the most negative value.
    # Also, we drop the `.meta` suffix.

    if for_epoch < 0:
        all_chpt_file = glob.glob(os.path.join(save_dir, '*.meta'))
        if len(all_chpt_file) == 0:
            if verbose:
                print('No chpt files found for {}.'.format(user_idx))
            return ret

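        # Illustration (hypothetical paths): realsorted(['tpprl.ckpt-9.meta',
        # 'tpprl.ckpt-100.meta']) puts the '-100' file first, since '-100'
        # parses as the more negative real number; [:-5] then strips '.meta'.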
        chosen_chpt_file = natsort.realsorted(all_chpt_file)[0][:-5]
    else:
        chosen_chpt_file = os.path.join(save_dir,
                                        'tpprl.ckpt-{}'.format(for_epoch))

    if verbose:
        print('chosen_chpt_file = ', chosen_chpt_file)

    ret['chpt_file'] = chosen_chpt_file
    if not os.path.exists(chosen_chpt_file + '.meta'):
        ret['error'] = 'File Not Found: {}.'.format(chosen_chpt_file)
        return ret

    rl_b_dict = EB.rl_b_dict_from_chpt(
        # '/NL/crowdjudged/work/rl-broadcast/r_2-sim-opt-fix/train-save-user_idx-218/tpprl.ckpt-898',
        chosen_chpt_file,
        one_user_data=one_user_data,
        window_start=window_start,
        user_opt_dict=user_opt_dict)

    sink_ids = one_user_data['sim_opts'].sink_ids
    if algo_feed:
        algo_c = user_opt_dict['algo_c']
        lifetimes = defaultdict(lambda:
                                (eval_sim_opts.end_time - window_start) / 10.)
        # algo_feed_args = ES.make_prefs(sink_ids, src_ids, seed=algo_feed_seed,
        #                                src_lifetime_dict=lifetimes)
        algo_feed_args = ES.make_freq_prefs(one_user_data=one_user_data,
                                            sink_ids=sink_ids,
                                            src_lifetime_dict=lifetimes)

        rl_b_dict['algo_feed'] = algo_feed
        rl_b_dict['algo_feed_args'] = algo_feed_args
        rl_b_dict['algo_c'] = algo_c
        rl_b_dict['t_min'] = window_start

    # This is the "K" in top-K
    K = 1

    if 'q' in user_opt_dict:
        q = user_opt_dict['q']
    else:
        warnings.warn('Setting q manually.')
        reward_kind = user_opt_dict['trainer_opts_dict']['reward_kind']
        if reward_kind == 'r_2_reward':
            q = 100.0
        elif reward_kind == 'top_k_reward':
            q = 1.0

    init_seed = 865
    rl_dfs = []
    rl_events = []
    rl_u_2 = []

    for idx in range(test_batches):
        mgr, exp_b = EB.get_real_data_mgr_chpt_np(rl_b_dict,
                                                  t_min=window_start,
                                                  batch_sim_opt=eval_sim_opts,
                                                  seed=init_seed + idx,
                                                  with_broadcaster=True)
        mgr.run_dynamic(max_events=MAX_EVENTS)
        rl_dfs.append(mgr.get_state().get_dataframe())
        rl_events.append(mgr.state.events)

        # Calculating the u^2 loss
        c_is = exp_b.get_all_c_is()
        time_deltas = exp_b.get_all_time_deltas()
        rl_u_2.append(exp_b.exp_sampler.calc_quad_loss(time_deltas, c_is))

    num_tweets = [
        RU.num_tweets_of(df, broadcaster_id=eval_sim_opts.src_id)
        for df in rl_dfs
    ]
    capacity_cap, capacity_std = np.mean(num_tweets), np.std(num_tweets)

    ret['capacity'] = capacity_cap
    ret['capacity_std'] = capacity_std

    ret['RL_u_2_mean'] = np.mean(rl_u_2)
    ret['RL_u_2_std'] = np.std(rl_u_2)

    if not only_rl:
        # Figure out what 'q' to use for RQ to get the same number of tweets.
        # Removing 'RQ_cap_adjust' because RQ systematically tweets more.
        q_RQ = RU.sweep_q(eval_sim_opts,
                          capacity_cap=capacity_cap - RQ_cap_adjust,
                          verbose=verbose,
                          q_init=q,
                          parallel=False,
                          max_events=MAX_EVENTS,
                          max_iters=MAX_ITERS,
                          only_tol=True,
                          tol=0.1)
        ret['q_RQ'] = q_RQ

        # Run RedQueen.
        RQ_dfs = []
        RQ_events = []
        for idx in range(test_batches):
            # Deliberately using eval_sim_opts.s, as it was used to calculate q_RQ.
            # It seems to be initialized to constant (equal significance).
            opt = OM.Opt(src_id=eval_sim_opts.src_id,
                         s=eval_sim_opts.s,
                         seed=init_seed + idx,
                         q=q_RQ)
            mgr = eval_sim_opts.update({
                'q': q_RQ
            }).create_manager_with_broadcaster(opt)
            # mgr = eval_sim_opts.update({}).create_manager_with_opt(seed=init_seed + idx)
            mgr.state.time = window_start
            mgr.run_dynamic(max_events=MAX_EVENTS)
            RQ_dfs.append(mgr.get_state().get_dataframe())
            RQ_events.append(opt.state.events)

        if algo_feed:
            # Figure out what 'q' to use for RQ to get the same number of tweets.
            # Removing 'RQ_cap_adjust' because RQ systematically tweets more.
            q_RQ_algo = ES.sweep_q_algo(
                sim_opts=eval_sim_opts,
                capacity_cap=capacity_cap - RQ_cap_adjust,
                algo_feed_args=algo_feed_args,
                algo_c=algo_c,
                verbose=verbose,
                q_init=1000.0,
                max_events=MAX_EVENTS,
                max_iters=MAX_ITERS,
                tol=0.1,
                only_tol=True,
                t_min=window_start,
            )
            ret['q_RQ_algo'] = q_RQ_algo

            # Run RedQueen heuristic.
            RQ_algo_dfs = []
            RQ_algo_events = []
            for idx in range(test_batches):
                # Deliberately not using eval_sim_opts.s, it seems to be initialized to
                # something strange.
                opt = ES.OptAlgo(src_id=eval_sim_opts.src_id,
                                 seed=init_seed + idx,
                                 q=q_RQ_algo,
                                 algo_feed_args=algo_feed_args,
                                 algo_c=algo_c)
                mgr = eval_sim_opts.update({
                    'q': q_RQ_algo
                }).create_manager_with_broadcaster(opt)
                # mgr = eval_sim_opts.update({}).create_manager_with_opt(seed=init_seed + idx)
                mgr.state.time = window_start
                mgr.run_dynamic(max_events=MAX_EVENTS)
                RQ_algo_dfs.append(mgr.get_state().get_dataframe())
                RQ_algo_events.append(opt.state.events)

        # Run Poisson.
        poisson_dfs = []
        poisson_events = []
        rate = capacity_cap / (eval_sim_opts.end_time - window_start)
        for idx in range(test_batches):
            poisson = OM.Poisson2(src_id=eval_sim_opts.src_id,
                                  seed=init_seed + idx,
                                  rate=rate)
            mgr = eval_sim_opts.create_manager_with_broadcaster(poisson)
            mgr.state.time = window_start
            mgr.run_dynamic(max_events=MAX_EVENTS)
            poisson_dfs.append(mgr.get_state().get_dataframe())
            poisson_events.append(mgr.get_state().events)

        # Running Karimi
        T = eval_sim_opts.end_time - window_start
        num_segments = 10
        seg_len = T / num_segments
        wall_mgr = eval_sim_opts.create_manager_for_wall()
        wall_mgr.run_dynamic(max_events=MAX_EVENTS)
        wall_df = wall_mgr.state.get_dataframe()

        ret['num_segments'] = num_segments
        ret['num_wall_tweets'] = wall_df.event_id.nunique()

        seg_idx = ((wall_df.t.values - window_start) / T *
                   num_segments).astype(int)
        intensity_df = (wall_df.groupby(
            ['sink_id', pd.Series(seg_idx, name='segment')]).size() /
                        (T / num_segments)).reset_index(name='intensity')
        wall_intensities_df = intensity_df.pivot_table(
            values='intensity', index='sink_id', columns='segment').fillna(0)
        for seg_idx in range(num_segments):
            if seg_idx not in wall_intensities_df.columns:
                wall_intensities_df[seg_idx] = 0.0
        wall_intensities = wall_intensities_df[list(
            range(num_segments))].values

        # This is the single-threaded version
        params = (init_seed, capacity_cap, num_segments, eval_sim_opts,
                  wall_intensities, None)
        op = OR.worker_kdd(params,
                           verbose=verbose,
                           Ks=[K],
                           window_start=window_start)

        karimi_dfs = []
        karimi_events = []
        for idx in range(test_batches):
            piecewise = OM.PiecewiseConst(
                src_id=eval_sim_opts.src_id,
                seed=init_seed * 2 + idx,
                change_times=window_start + np.arange(num_segments) * seg_len,
                rates=op['kdd_opt_{}'.format(K)] / seg_len)
            piecewise_const_mgr = eval_sim_opts.create_manager_with_broadcaster(
                piecewise)
            piecewise_const_mgr.state.time = window_start
            piecewise_const_mgr.run_dynamic(max_events=MAX_EVENTS)
            df = piecewise_const_mgr.state.get_dataframe()
            karimi_dfs.append(df)
            karimi_events.append(piecewise_const_mgr.get_state().events)

    # Calculating metrics
    if only_rl:
        all_settings = [('RL', rl_dfs)]
    else:
        all_settings = [('RL', rl_dfs), ('RQ', RQ_dfs),
                        ('poisson', poisson_dfs), ('karimi', karimi_dfs)]

        if algo_feed:
            all_settings += [('RQ_algo', RQ_algo_dfs)]

    metric_name = 'num_tweets'
    for type, dfs in all_settings:
        metric = [
            RU.num_tweets_of(df, broadcaster_id=eval_sim_opts.src_id)
            for df in dfs
        ]
        ret[type + '_' + metric_name +
            '_mean'], ret[type + '_' + metric_name +
                          '_std'] = (np.mean(metric), np.std(metric))

    metric_name = 'top_k'
    for type, dfs in all_settings:
        metric = [
            RU.time_in_top_k(df, K=K, sim_opts=eval_sim_opts) for df in dfs
        ]
        ret[type + '_' + metric_name +
            '_mean'], ret[type + '_' + metric_name +
                          '_std'] = (np.mean(metric), np.std(metric))

    metric_name = 'avg_rank'
    for type, dfs in all_settings:
        metric = [RU.average_rank(df, sim_opts=eval_sim_opts) for df in dfs]
        ret[type + '_' + metric_name +
            '_mean'], ret[type + '_' + metric_name +
                          '_std'] = (np.mean(metric), np.std(metric))

    metric_name = 'r_2'
    for type, dfs in all_settings:
        metric = [RU.int_r_2(df, sim_opts=eval_sim_opts) for df in dfs]
        ret[type + '_' + metric_name +
            '_mean'], ret[type + '_' + metric_name +
                          '_std'] = (np.mean(metric), np.std(metric))

    if algo_feed:
        if only_rl:
            all_settings = [('RL', rl_events)]
        else:
            all_settings = [('RL', rl_events), ('RQ', RQ_events),
                            ('poisson', poisson_events),
                            ('karimi', karimi_events)]
            if algo_feed:
                all_settings += [('RQ_algo', RQ_algo_events)]

        for type, all_events in all_settings:
            r_2_algo = []
            r_algo = []
            top_k_algo = []

            for events in all_events:
                # Calculate some metrics here itself.
                times, r_2 = ES.algo_true_rank(sink_ids=sink_ids,
                                               src_id=eval_sim_opts.src_id,
                                               events=events,
                                               start_time=window_start,
                                               end_time=eval_sim_opts.end_time,
                                               steps=REWARD_STEPS,
                                               all_prefs=algo_feed_args,
                                               square=True,
                                               c=algo_c)
                r_2_algo.append(np.sum(r_2) * (times[1] - times[0]))

                times, ranks = ES.algo_true_rank(
                    sink_ids=sink_ids,
                    src_id=eval_sim_opts.src_id,
                    events=events,
                    start_time=window_start,
                    end_time=eval_sim_opts.end_time,
                    steps=REWARD_STEPS,
                    all_prefs=algo_feed_args,
                    square=False,
                    c=algo_c)
                r_algo.append(np.sum(ranks) * (times[1] - times[0]))

                times, top_ks = ES.algo_top_k(sink_ids=sink_ids,
                                              src_id=eval_sim_opts.src_id,
                                              events=events,
                                              start_time=window_start,
                                              end_time=eval_sim_opts.end_time,
                                              K=K,
                                              steps=REWARD_STEPS,
                                              all_prefs=algo_feed_args,
                                              c=algo_c)
                top_k_algo.append(np.sum(top_ks) * (times[1] - times[0]))

            ret[type + '_r_2_algo_mean'] = np.mean(r_2_algo)
            ret[type + '_r_2_algo_std'] = np.std(r_2_algo)

            ret[type + '_avg_rank_algo_mean'] = np.mean(r_algo)
            ret[type + '_avg_rank_algo_std'] = np.std(r_algo)

            ret[type + '_top_k_algo_mean'] = np.mean(top_k_algo)
            ret[type + '_top_k_algo_std'] = np.std(top_k_algo)

    return ret
Example No. 29
save_dir = data_dir + 'training_result/compare_result/'

filename = data_dir + 'info/Info.npy'
info = np.load(filename, allow_pickle=True)
sids = info[:, 0]

savefile = data_dir + 'tfAnalysis/ERSD_activation.npy'
ersd_corr = np.load(savefile, allow_pickle=True).item(
)  # ersd_corr: dict with key=sid. ersd['sid'].shape=(2,channel number)

training_result_dir = data_dir + 'training_result/'
model_name = ['eegnet', 'shallowFBCSPnet', 'deepnet', 'deepnet_da', 'resnet']
decoding_accuracy = []
results_path = realsorted([
    str(pth) for pth in Path(training_result_dir + 'deepLearning/').iterdir()
    if 'DS_Store' not in str(pth) and 'pdf' not in str(pth)
])  # if pth.suffix == '.npy']

for i, modeli in enumerate(model_name):
    decoding_accuracy.append([])
    for path in results_path:
        path = str(path)
        result_file = path + '/training_result_' + modeli + '.npy'
        result = np.load(result_file, allow_pickle=True).item()

        train_losses = result['train_losses']
        train_accs = result['train_accs']
        val_accs = result['val_accs']
        test_acc = result['test_acc']
        # BUG
        if modeli == 'deepnet_da':