Exemplo n.º 1
0
    def __init__(self, parent, name, typ, n_rows=None):
        """Initialize per-column statistics collection state.

        Args:
            parent: Owning object; stored as-is, never inspected here.
            name: Column name. Substrings 'gvid', 'geoid' and 'year'
                trigger special handling (see flags below).
            typ: Either a type object or a type name string. The string
                'unknown' maps to binary_type; any other string is looked
                up among builtins and the datetime module's attributes.
            n_rows: Optional expected row count; stored as-is.
        """

        self.parent = parent
        self.n_rows = n_rows

        if isinstance(typ, string_types):
            import datetime
            # Resolve a type name string to the actual type object by
            # searching builtins plus the datetime module's namespace.
            # NOTE(review): __builtins__ is a dict in imported modules but a
            # module in __main__, where .items() would raise — confirm this
            # class is never defined in a __main__ context.
            m = dict(list(__builtins__.items()) + list(datetime.__dict__.items()))
            if typ == 'unknown':
                typ = binary_type
            else:
                typ = m[typ]

        # NOTE(review): this shadows the `datetime` module imported above
        # with the datetime class — intentional, but easy to misread.
        from datetime import date, time, datetime

        self.is_gvid = bool('gvid' in name)  # A special name in Ambry
        self.is_geoid= bool('geoid' in name)  # A special name in Ambry
        self.is_year = bool('year' in name)
        self.is_time = typ == time
        self.is_date = typ == date or typ == datetime

        # Tricky hack, indexing with a bool: False -> ' ', True -> letter.
        self.flags = " G"[self.is_gvid] + " Y"[self.is_year] + " T"[self.is_time] + " D"[self.is_date]

        # Pick the level of measurement (LOM) from the resolved type:
        # temporal -> ordinal, text/bytes -> nominal, numeric -> interval.
        if self.is_year or self.is_time or self.is_date:
            lom = StatSet.LOM.ORDINAL
        elif typ == binary_type or typ == text_type:
            lom = StatSet.LOM.NOMINAL
        elif typ == int or typ == float:
            lom = StatSet.LOM.INTERVAL
        else:
            lom = StatSet.LOM.NOMINAL

        self.column_name = name

        self.lom = lom
        self.n = 0  # number of values seen so far
        self.counts = Counter()  # per-value frequency counts
        self.size = None
        self.stats = livestats.LiveStats([0.25, 0.5, 0.75])  # runstats.Statistics()

        # Histogram state; bins are sized lazily once enough points arrive.
        self.bin_min = None
        self.bin_max = None
        self.bin_width = None
        self.bin_primer_count = 5000  # how many points to collect before creating hist bins

        self._hist_built = False

        self.num_bins = 16
        self.bins = [0] * self.num_bins
Exemplo n.º 2
0
    def _build_hist_bins(self):
        """Build the histogram bins from the values accumulated in self.counts.

        Idempotent: returns immediately once the bins have been built. If
        fewer than 1% of the values are unique, the column is reclassified
        as ordinal and no histogram is produced.
        """
        from math import sqrt

        if self._hist_built:
            return

        # If less than 1% are unique, assume that this number is actually an ordinal
        if self.nuniques < (self.n / 100):
            self.lom = self.LOM.ORDINAL
            self.stats = livestats.LiveStats()
        else:
            mean = self.stats.mean()
            stddev = sqrt(self.stats.variance())

            # Bin range covers +/- 2 standard deviations around the mean.
            self.bin_min = mean - stddev * 2
            self.bin_max = mean + stddev * 2
            self.bin_width = (self.bin_max - self.bin_min) / self.num_bins

            if self.bin_width == 0:
                # I guess we just aren't getting a histogram.
                # BUG FIX: was `self._hist_build = True` (typo), which left
                # _hist_built False so this method re-ran on every call.
                self._hist_built = True
                return

            # Puts the saved entries into the hist bins.
            def fill_bins():
                bins = [0] * self.num_bins
                for v, count in iteritems(self.counts):
                    float_v = _force_float(v)
                    # bin_width is guaranteed non-zero here (checked above).
                    if self.bin_min <= float_v <= self.bin_max:
                        bin_ = int((float_v - self.bin_min) / self.bin_width)
                        if bin_ < len(bins):
                            bins[bin_] += count
                return bins

            bins = fill_bins()

            # No, strip off all of the leftmost bins that have no value. This makes for prettier power
            # and exponential distributions, where the left skew of the mean leaves the left side of the
            # chart empty.
            first_non_zero = next((index for index, value in enumerate(bins) if value != 0), None)

            if first_non_zero:
                self.bin_min = self.bin_min + self.bin_width * first_non_zero
                self.bin_width = (self.bin_max - self.bin_min) / self.num_bins

            self.bins = fill_bins()

        # self.counts = Counter()
        # BUG FIX: was `self._hist_build = True` (typo) — see guard above.
        self._hist_built = True
 def load_models(self, features):
     """Load the most recent pickled model for each feature.

     For each feature, picks the newest (by creation time) file under
     <base_path>/Models whose name contains "<subject_name>_<feature>"
     and unpickles it. On any failure (no matching file, unreadable
     pickle, ...) a fresh livestats.LiveStats([0.10, 0.5, 0.90]) is used
     instead, so the caller always gets an entry per feature.

     Args:
         features: Iterable of feature name strings.

     Returns:
         Dict mapping feature name -> loaded model (or fresh LiveStats).
     """
     feature_stats = {}
     for feature in features:
         try:
             model_path = os.path.join(base_path, "Models")
             name_string = self.subject_name + "_" + feature
             # Newest matching model file, by filesystem creation time.
             path = max([
                 os.path.join(model_path, name)
                 for name in os.listdir(model_path) if name_string in name
             ],
                        key=os.path.getctime)
             # SECURITY NOTE: pickle.load executes arbitrary code if the
             # file is attacker-controlled; only load trusted model files.
             # `with` guarantees the handle is closed even if load fails
             # (the original leaked the handle on a pickle error).
             with open(path, "rb") as model_file:
                 feature_stats[feature] = pickle.load(model_file)
             logging.info(
                 "Processing - Loaded model for %s. Mean %.2f, Std %.2f" %
                 (feature, feature_stats[feature].mean(),
                  np.sqrt(feature_stats[feature].variance())))
         except Exception:
             # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
             # still propagate; any load failure falls back to a new model.
             logging.warning(
                 "Processing - Could not load model from file. Create new model for %s"
                 % feature)
             feature_stats[feature] = livestats.LiveStats([0.10, 0.5, 0.90])
     return feature_stats
Exemplo n.º 4
0
 def ffold(map_, p):
     """Fold the point *p* = (x, y) into *map_*.

     Ensures map_[x] holds a livestats.LiveStats(quantiles) tally,
     adds y to it, and returns the (mutated) map_.
     """
     key, value = p
     try:
         tally = map_[key]
     except KeyError:
         tally = map_[key] = livestats.LiveStats(quantiles)
     tally.add(value)
     return map_
Exemplo n.º 5
0
    def __init__(self,
                 env,
                 num_users,
                 name,
                 weighted_svcs,
                 min_think_time,
                 max_think_time,
                 quantiles=None,
                 svc_req_log=None):
        # type: (simpy.Environment, Union[int, Sequence[Tuple[float, int]]], str, Sequence[Tuple[SvcRequester, float]], float, float, Optional[Sequence[float]], Optional[MutableSequence[Tuple[str, SvcRequest]]]) -> None
        """Initializer.

        Args:
            env: The Simpy Environment.
            num_users: Either a positive int, or a sequence of
                (float, int) pairs with monotonically increasing floats,
                describing a step function of time: each pair's first
                component is the (inclusive) start of a step whose *y*
                value is the second component; the step ends at the next
                pair's start (exclusive).  The first pair must start at 0.
                An int ``n`` is normalized to ``[(0, n)]``.
            name: This user group's name.
            weighted_svcs: Pairs of SvcRequester instances and positive
                weights giving the service request types issued by users
                in this group and their relative execution frequencies
                (weights need not sum to 1; they are normalized here).
            min_think_time: Minimum think time between a user's service
                requests; actual think time is uniform on
                [min_think_time, max_think_time].
            max_think_time: Maximum think time between a user's service
                requests; actual think time is uniform on
                [min_think_time, max_think_time].
            quantiles: Quantiles to be tallied; defaults to
                [0.5, 0.95, 0.99] when not provided.
            svc_req_log: Optional sequence into which each generated
                service request is logged as a (group-name, svc_req) pair.
        """
        self.env = env

        # Normalize num_users to a list-of-pairs step function and
        # validate its shape.
        if isinstance(num_users, int):
            num_users = [(0, num_users)]
        if not isinstance(num_users, list):
            raise TypeError(
                "Argument num_users must be a number or a list of pairs.")
        if num_users[0][0] != 0:
            raise ValueError("Argument num_users first element must be a pair "
                             "with 0 as the first component.")
        self.num_users = num_users

        # Step boundaries: each step's end time is the next step's start;
        # the last step extends to INFINITY.
        step_starts = [start for start, _ in num_users]
        self._num_users_times = step_starts[1:] + [self.INFINITY]
        self._num_users_values = [count for _, count in num_users]
        self._max_users = max(self._num_users_values)

        self.name = name
        self.weighted_svcs = weighted_svcs
        self.svcs = [svc for svc, _ in weighted_svcs]
        self.min_think_time = min_think_time
        self.max_think_time = max_think_time
        self.quantiles = [0.5, 0.95, 0.99] if quantiles is None else quantiles
        self.svc_req_log = svc_req_log

        self._pick_svc = prob_chooser(*weighted_svcs)

        # Response-time tallies: one per svcRequest, plus an overall tally
        # stored under the key None.
        self._tally_dict = {
            svc: livestats.LiveStats(self.quantiles) for svc in self.svcs
        }
        self._overall_tally = livestats.LiveStats(self.quantiles)  # overall tally
        self._tally_dict[None] = self._overall_tally

        # Request counters, keyed the same way as the tallies.
        self._request_count_dict = {svc: 0 for svc in self.svcs}
        self._request_count_dict[None] = 0
Exemplo n.º 6
0
import os.path

DATA_DIR = "/home/sujit/Projects/med_data/cms_gov/outpatient_claims"
EPSILON = 0.0001


def compute_cutoff(level, cys):
    """Return the 1-based position of the rightmost cumulative value in
    *cys* that is strictly below *level*, or -1 if no value qualifies."""
    for idx in reversed(range(len(cys))):
        if cys[idx] < level:
            return idx + 1
    return -1


# Read one cluster value per line from clusters.txt, mapping the literal
# string "nan" to EPSILON, then plot a histogram plus cumulative curve and
# compute the x-cutoff covering 99% of the mass.
lns = 0
# NOTE(review): the file is opened in binary mode but its lines are compared
# against the str "nan" and passed to float(); this is Python-2 idiom — on
# Python 3 the bytes/str comparison would silently never match. Confirm the
# target interpreter.
fin = open(os.path.join(DATA_DIR, "clusters.txt"), 'rb')
# NOTE(review): `stats` is created but never updated or read below — dead?
stats = livestats.LiveStats([0.25, 0.5, 0.75])
xs = []  # parsed values, "nan" lines replaced by EPSILON
for line in fin:
    #    if lns > 1000: break
    line = line.strip()
    lns += 1
    x = EPSILON if line == "nan" else float(line)
    xs.append(x)
fin.close()

# Histogram the values and overlay the cumulative count curve.
counts, bins, ignored = plt.hist(xs, bins=100)
cumsums = np.cumsum(counts)
plt.plot(bins[:-1], cumsums, color='red')

# Fraction of the bin range needed to cover 99% of all points.
max_cy = len(xs)
strong_xcut = compute_cutoff(0.99 * max_cy, cumsums) / len(bins)
Exemplo n.º 7
0
    pr.enable()

    N, N1 = CM.shape

    if N != N1:
        raise ValueError("S must be a square array (shape=%s)" %
                         repr(CM.shape))

    l = N / NPROCS
    r = N - l * NPROCS
    if r != 0:
        l = l
        N = N - r
        print 'Truncating matrix to NxN to fit on %d procs' % NPROCS

    med = livestats.LiveStats()
    madd = np.vectorize(med.add)

# Broadcast the (possibly truncated) matrix size and per-process chunk
# length from rank 0 to all MPI processes.
N = comm.bcast(N, root=0)
l = comm.bcast(l, root=0)

# HDF5 dataspace of the matrix dataset and a local row buffer.
# NOTE(review): np.float is a deprecated alias for the builtin float
# (removed in NumPy >= 1.24) — presumably should be np.float64; verify.
CMs = CM.id.get_space()
tCM = np.empty((N, ), dtype=np.float)

ms = h5s.create_simple((N, ))  # memory dataspace for one row of length N

# Row range [tb, te) assigned to this rank; ranks are mapped in reverse
# (NPROCS - 1 - rank) — assumes `task` returns (begin, end); confirm.
tb, te = task(NPROCS - 1 - rank, l)

if rank == 0:
    # Rank 0 drops its last row — presumably to skip the diagonal/self
    # comparison; confirm against the rest of the script.
    te -= 1
Exemplo n.º 8
0
 def __init__(self):
     """Set up a running tally that tracks only the median (quantile 0.5)."""
     median_only = [0.5]  # we are only interested in the median
     self._ls = livestats.LiveStats(median_only)