Example #1
File: valid.py Project: julienmalard/Tikon
def reps_necesarias(matr, eje_parám, eje_estoc, frac_incert, confianza):
    n_parám = matr.shape[eje_parám]
    n_estoc = matr.shape[eje_estoc]

    otras_dims = [e for i, e in enumerate(matr.shape) if i != eje_estoc]

    n_iter = 100  # might be better at ~200
    matr_perc_estoc = np.zeros((*otras_dims, n_estoc - 1))
    for i in range(2, n_estoc + 1):
        rango = np.zeros((n_iter, *otras_dims))
        for j in range(n_iter):
            reps_e = np.random.choice(n_estoc, i, replace=False)
            matr_sel = np.take(matr, reps_e, axis=eje_estoc)
            prcntl = np.quantile(matr_sel, q=[(1 - frac_incert) / 2, 0.5 + frac_incert / 2], axis=eje_estoc)
            rango[j] = np.ptp(prcntl, axis=0)
        matr_perc_estoc[..., i - 2] = np.mean(rango, axis=0)

    x = 1 / np.arange(2, n_estoc + 1)
    a_0, b = _reg_lin(x, matr_perc_estoc, eje=-1)
    a = -1 / a_0
    req_n_estoc = np.ceil(np.nanmax(1 / (a * b * (1 - confianza))))

    if np.isnan(req_n_estoc):
        req_n_estoc = 1

    otras_dims = [e for i, e in enumerate(matr.shape) if i != eje_parám and i != eje_estoc]
    matr_perc_prm = np.zeros((*otras_dims, n_parám - 1))
    rango = np.zeros((n_iter, *otras_dims))
    for i in range(2, n_parám + 1):
        for j in range(n_iter):
            reps_e = np.random.choice(n_parám, i, replace=False)
            matr_sel = np.take(matr, reps_e, axis=eje_parám)
            prcntl = np.quantile(matr_sel, q=[(1 - frac_incert) / 2, 0.5 + frac_incert / 2],
                                 axis=(eje_parám, eje_estoc))
            rango[j] = np.ptp(prcntl, axis=0)
        matr_perc_prm[..., i - 2] = np.mean(rango, axis=0)

    x = 1 / np.arange(2, n_parám + 1)
    a_0, b = _reg_lin(x, matr_perc_prm, eje=-1)
    req_n_prm = np.ceil(np.nanmax(-a_0 / (b * (1 - confianza))))

    if np.isnan(req_n_prm):
        req_n_prm = 1
    return {'estoc': req_n_estoc, 'parám': req_n_prm}
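_reg_lin is internal to the Tikon project and is not shown above. The sketch below is only a guess at its contract, assuming it fits y = a*x + b by least squares along the axis given in eje and returns (slope, intercept); the implementation is an assumption, not taken from the project.

import numpy as np

def _reg_lin(x, y, eje=-1):
    # Hypothetical re-implementation: ordinary least-squares fit of
    # y = a * x + b along the axis `eje`, broadcast over the other axes.
    y = np.moveaxis(y, eje, -1)              # put the regression axis last
    x_mean = x.mean()
    y_mean = y.mean(axis=-1, keepdims=True)
    a = ((x - x_mean) * (y - y_mean)).sum(axis=-1) / ((x - x_mean) ** 2).sum()
    b = y_mean[..., 0] - a * x_mean
    return a, b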
Example #2
File: stats.py Project: RamyaGuru/matminer
def quantile(data_lst, weights=None, q=0.5):
    """
    Return a specific quantile.
    Args:
        data_lst (list or np.ndarray): 1D data list to be used for computing
            quantiles
        weights: Optional weights; accepted for API compatibility but not
            used by this implementation.
        q (float): The quantile, as a fraction between 0 and 1.
    Returns:
        (float) The computed quantile of the data_lst.
    """
    q = float(q)
    return np.quantile(data_lst, q=q)
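A short usage sketch for the wrapper above; the data values are made up for illustration.

import numpy as np

data = [1.0, 2.0, 3.0, 4.0, 5.0]
print(quantile(data, q=0.25))   # 2.0 with NumPy's default linear interpolation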
Example #3
File: imgproc.py Project: davcrom/ixtract
def bihist_eq(frame, params):
    """Bi-histogram equalization based on Kim (1997), returns an equalized
    version of the input frame"""

    # global image histogram
    bins = np.arange(257)
    vals, bins = np.histogram(frame.ravel(), bins=bins)

    # probability density function
    pdf = vals / np.prod(frame.shape)

    sp = np.quantile(frame, params['eq_sp']) # separation point
    rp = 255*params['eq_rp'] # range point

    # upper histogram
    upper = vals[bins[1:] > sp]
    u_bins = np.arange(sp+1, 257)
    u_pdf = upper / (np.sum(upper) + np.finfo(float).eps)
    u_cdf = np.cumsum(u_pdf)
    upper_eq = rp + (255 - rp) * (u_cdf - 0.5 * u_pdf)

    # lower histogram
    lower = vals[bins[1:] <= sp]
    l_bins = np.arange(sp) + 1
    l_pdf = lower / (np.sum(lower) + np.finfo(float).eps)
    l_cdf = np.cumsum(l_pdf)
    lower_eq = rp  * (l_cdf - 0.5 * l_pdf)

    # intensity value look-up table
    eq_lut = np.concatenate((lower_eq, upper_eq))
    # equalized values
    eq_vals = [eq_lut[i] for i in frame.ravel()]
    # equalized frame
    eq_frame = np.reshape(eq_vals, frame.shape)

    return np.round(eq_frame).astype('uint8')
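A hedged usage sketch with a synthetic 8-bit frame; the parameter values eq_sp=0.5 and eq_rp=0.5 are arbitrary choices, not taken from the original project.

import numpy as np

frame = np.random.randint(0, 256, size=(64, 64), dtype=np.uint8)  # synthetic image
params = {'eq_sp': 0.5, 'eq_rp': 0.5}  # hypothetical separation/range settings
eq_frame = bihist_eq(frame, params)
print(eq_frame.dtype, eq_frame.shape)  # uint8 (64, 64)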
Example #4
# model.compile(loss=HuberLoss(), optimizer =


# %%

# trying mc DROPOUT

 # force training mode = dropout on
with keras.backend.learning_phase_scope(1):
    y_probas = np.stack([model.predict(X_test)
        for sample in range(100)])
    y_proba = y_probas.mean(axis=0)
X_test.shape
y_test
y_proba.T
y_proba.shape

y_proba.mean()
y_proba.std()


ci = 0.95
lower_lim = np.quantile(y_proba, 0.5-ci/2, axis=1)
upper_lim = np.quantile(y_proba, 0.5+ci/2, axis=1)

lower_lim
upper_lim

lower_lim==upper_lim
Example #5
def q_99(arr: Union[pd.Series, np.ndarray]) -> float:
    arr = np.asarray(arr)
    return np.quantile(arr, 0.99)
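The one-liner above assumes these imports are already in scope; a usage sketch with random data:

from typing import Union

import numpy as np
import pandas as pd

print(q_99(pd.Series(np.random.randn(10_000))))  # roughly 2.33 for a standard normal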
Example #6
    def train(self, target_model, X_mal_train, X_mal_test, X_good_train,
              X_good_test, mal_label=1, good_label=0, earlystop=False,
              zmin=0, zmax=1, epochs=500, batch_size=32, combined_d_batch=False,
              d_train_mal=False, d_train_adv=True, good_batch_factor=1,
              d_times=1, gan_times=1, n_progress=1, minTPR_threshold=0,
              max_changes=np.inf, gan_dir=GAN_DIR, smooth_alpha=1.0,
              sample_train=True):
        """
        Performs GAN training.
        :param target_model: The target model of the evasion attack
        :param X_mal_train: The malware training set
        :param X_mal_test: The malware test set
        :param X_good_train: The goodware training set
        :param X_good_test: The goodware test set
        :param mal_label: The label for the malware class (original label)
        :param good_label: The label for the goodware class (target label)
        :param zmin: The lower bound of the random noise
        :param zmax: The upper bound of the random noise
        :param epochs: The number of training epochs
        :param batch_size: The size of a training batch
        :param d_train_mal: Whether to train the discriminator on malware.
        :param d_train_adv: Whether to train the discriminator on adversarial
                  examples.
        :param combined_d_batch: Whether to train the discriminator on one batch
                  that combines all classes or to train on each class separately
        :param good_batch_factor: The size ratio of a goodware batch compared
                  to that of a malware batch.
        :param d_times: The number of times to train the discriminator in each
                  iteration.
        :param gan_times: The number of times to train the GAN in each iteration
        :param n_progress: The number of epochs with no improvement/output after
                  which to print output to check for progress.
        :param minTPR_threshold: The threshold to which we wish to minimise
                  the True Positive Rate (TPR).
        :param max_changes: A constraint on the maximum number of changes in
                  generated adversarial examples (AEs)
        :return: tuple (
                    TPR_train: The list of TPR scores on the training set at
                                each epoch,
                    TPR_test: The list of TPR scores on the test set at each
                              epoch,
                    avg_diff_train: The list of avg changes in AEs generated
                              from training set at each epoch,
                    avg_diff_test: The list of avg changes in AEs generated
                              from the test set at each epoch,
                    d_metrics: The list of the discriminator metrics
                              [loss, accuracy] at each epoch,
                    gan_metrics: The list of the GAN metrics
                              [loss, accuracy] at each epoch,
                    best_G_path: The path to the best performing G model
                  )
        """

        g_batch_size = good_batch_factor * batch_size

        # Metrics accumulators
        d_metrics = []
        gan_metrics = []

        # Initial TPR on the training & test sets
        TPR_train = [target_model.score(X_mal_train,
                                        mal_label * ones(X_mal_train.shape[0]))]
        TPR_test = [target_model.score(X_mal_test,
                                       mal_label * ones(X_mal_test.shape[0]))]
        minTPR = 1.0
        minTPR_avg_changes = -1
        minTPR_max_changes = -1
        min_epoch = output_epoch = 0
        best_G_path = None

        print(f"Initial TPR on the training set: {TPR_train}")
        print(f"Initial TPR on the test set: {TPR_test}\n")

        # Average changes (perturbations) in adversarial examples
        avg_diff_train = []
        avg_diff_test = []

        # IDs for plots
        plot_id = 1
        gan_id = 1
        tpr_id = 1

        t1 = time.perf_counter()

        for epoch in range(epochs):
            # Generate batches of size (gan_times * batch_size)
            X_mal_batches = batch(X_mal_train, gan_times * batch_size,
                                  seed=epoch)
            # Epoch metrics accumulators
            d_metrics_epoch = np.empty((0, 2))
            gan_metrics_epoch = np.empty((0, 2))

            for X_mal_batch in X_mal_batches:
                ################################################################
                # Train the discriminator for d_times iterations
                ################################################################
                # Generate minibatches of size batch_size
                minibatches = batch(X_mal_batch, batch_size, seed=epoch)
                d_metrics_batch = np.empty((0, 2))
                # Train for d_times
                for i in range(d_times):
                    # __could reseed with (epoch + i) for reproducibility__
                    X_mal = next(minibatches, None)  # Use these batches first
                    if X_mal is None:  # Then generate randomly
                        X_mal = rand_batch(X_mal_train, batch_size)

                    Y_mal = smooth_alpha * mal_label * ones(
                        X_mal.shape[0])  # Smooth

                    noise = np.random.uniform(zmin, zmax,
                                              size=[batch_size, self.z_dim])

                    # Generate adversarial examples
                    X_adv = self.generator.predict([X_mal, noise])
                    X_adv = binarise(X_adv, self.bin_threshold)
                    Y_adv = target_model.predict(X_adv)
                    Y_adv[
                        Y_adv == mal_label] = smooth_alpha * mal_label  # Smooth

                    X_good = rand_batch(X_good_train, g_batch_size)
                    Y_good = good_label * ones(X_good.shape[0])     # Good_Label

                    # Train the discriminator
                    self.discriminator.trainable = True

                    if combined_d_batch:
                        # *** Train once on a combined batch ****
                        X = X_good
                        Y = Y_good
                        if d_train_mal:
                            X = np.concatenate((X, X_mal))
                            Y = np.concatenate((Y, Y_mal))
                        if d_train_adv:
                            X = np.concatenate((X, X_adv))
                            Y = np.concatenate((Y, Y_adv))
                        metrics = self.discriminator.train_on_batch(X, Y)
                    else:
                        # ** Train on separate batches & combine metrics **
                        metrics_good = self.discriminator.train_on_batch(X_good,
                                                                         Y_good)
                        metrics_mal = self.discriminator.train_on_batch(X_mal,
                                                                        Y_mal) \
                            if d_train_mal else [np.nan, np.nan]
                        metrics_adv = self.discriminator.train_on_batch(X_adv,
                                                                        Y_adv) \
                            if d_train_adv else [np.nan, np.nan]
                        # Avg metrics
                        metrics = np.nanmean(np.array([metrics_mal,
                                                       metrics_good,
                                                       metrics_adv]), axis=0)

                    # Accumulate metrics for d_times iterations
                    d_metrics_batch = np.vstack((d_metrics_batch, metrics))

                # Average the metrics of all d_times iterations
                d_metrics_batch = np.mean(d_metrics_batch, axis=0)
                # Add to discriminator metrics for this epoch
                d_metrics_epoch = np.vstack((d_metrics_epoch, d_metrics_batch))

                ################################################################
                # Train the Generator
                ################################################################
                # Generate minibatches of size batch_size
                minibatches = batch(X_mal_batch, batch_size, seed=epoch)
                gan_metrics_batch = np.empty((0, 2))
                # Train for gan_times
                for i in range(gan_times):
                    # Number of minibatches should be exactly gan_times
                    X_mal = next(minibatches, None)
                    if X_mal is None:  # Just in case, generate randomly
                        X_mal = rand_batch(X_mal_train, batch_size)

                    noise = np.random.uniform(zmin, zmax, size=[batch_size,
                                                                self.z_dim])
                    self.discriminator.trainable = False

                    # Train with target label = GOOD_LABEL
                    metrics = self.GAN.train_on_batch([X_mal, noise],  # <<<<
                                                      good_label * ones(
                                                          X_mal.shape[0]))
                    # discriminator.trainable = True

                    # Accumulate metrics for gan_times iterations
                    gan_metrics_batch = np.vstack((gan_metrics_batch, metrics))

                # Average the metrics of all gan_times iterations
                gan_metrics_batch = np.mean(gan_metrics_batch, axis=0)
                # Add to the generator metrics for this epoch
                gan_metrics_epoch = np.vstack((gan_metrics_epoch,
                                               gan_metrics_batch))

            # Average metrics of each epoch
            d_metrics.append(np.mean(d_metrics_epoch, axis=0).tolist())
            gan_metrics.append(np.mean(gan_metrics_epoch, axis=0).tolist())
            gan_loss = gan_metrics[-1][0]

            # TPR on adversarial training set
            noise = np.random.uniform(zmin, zmax, (X_mal_train.shape[0],
                                                   self.z_dim))
            X_adv_train = binarise(self.generator.predict([X_mal_train, noise]),
                                   self.bin_threshold)
            # Score with target label = MAL_LABEL
            Y_adv_train = mal_label * ones(X_adv_train.shape[0])  # MAL_LABEL
            TPR = target_model.score(X_adv_train, Y_adv_train)
            TPR_train.append(TPR)

            # Changes (L1 norms) in the adversarial training set
            diff_train = norm((X_adv_train - X_mal_train), ord=1, axis=1)
            avg_diff_train_current = np.mean(diff_train)
            max_diff_train_current = np.max(diff_train)
            avg_diff_train.append(avg_diff_train_current)

            # TPR on adversarial test set
            noise = np.random.uniform(zmin, zmax, (X_mal_test.shape[0],
                                                   self.z_dim))

            X_adv_test = binarise(self.generator.predict([X_mal_test, noise]),
                                  self.bin_threshold)
            Y_adv_test = mal_label * ones(X_adv_test.shape[0])  # MAL_LABEL
            TPR = target_model.score(X_adv_test, Y_adv_test)
            TPR_test.append(TPR)

            # Changes (L1 norms) in the adversarial test set
            diff_test = norm((X_adv_test - X_mal_test), ord=1, axis=1)
            avg_diff_test_current = np.mean(diff_test)
            max_diff_test_current = np.max(diff_test)
            avg_diff_test.append(avg_diff_test_current)

            # Output progress if TPR has decreased (improved evasion)
            # ... or if TPR is the same but avg changes have decreased
            if (TPR < minTPR) or \
                (TPR == minTPR and avg_diff_test_current < minTPR_avg_changes):  # check avg or max
                print("\n>>>> New Best Results: "
                      f"Previous minTPR: [{minTPR:.8f}] ==> "
                      f"New minTPR: [{TPR:0.8f}] "
                      f"GAN Loss: [{gan_loss:.8f}]  <<<<")
                output_progress(epoch, TPR_train, TPR_test,
                                diff_train, diff_test)
                minTPR = TPR
                min_epoch = output_epoch = epoch
                minTPR_avg_changes = avg_diff_test_current
                minTPR_max_changes = max_diff_test_current
                minTPR_std = np.std(diff_test)
                minTPR_quantiles = np.quantile(diff_test, [0.25, 0.5, 0.75])

                # Save weights
                minTPR_weights_path = \
                    (gan_dir + self.save_dir + 'weights/' +
                     f'GAN_minTPR_weights_epoch_{epoch}_'
                     f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                     f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_'
                     + time.strftime("%m-%d_%H-%M-%S") + '.h5')
                self.GAN.save_weights(minTPR_weights_path)

                # Generate and plot a sample of AEs
                sample_sz = 10
                sample_noise = np.random.uniform(zmin, zmax, size=[sample_sz,
                                                                   self.z_dim])

                if sample_train:  # Sample from training
                    sample_mal = rand_batch(X_mal_batch, sample_sz)
                else:  # Sample from test set
                    sample_mal = np.asarray(rand_batch(X_mal_test, sample_sz))

                plot_sample(sample_mal, sample_noise, self.generator,
                            target_model, epoch, TPR_train=TPR_train,
                            TPR_test=TPR_test, params=self.log_params,
                            avg_changes=avg_diff_test_current,
                            m_label=mal_label, g_label=good_label,
                            annotate=False, out_dir=ADV_DIR, plot_id=plot_id)
                plot_id = plot_id + 1

                if minTPR <= minTPR_threshold:
                    print(
                        "\n" + "#" * 150 + "\n"
                        f"# Target Evasion Rate {100 * (1 - TPR):.2f}% "
                        f"achieved at epoch [{epoch}], "
                        f"with avg {avg_diff_test_current:.1f} "
                        f"& max {max_diff_test_current:.1f} changes per sample "
                        f"(on the test set) ... "
                        f"GAN Loss: [{gan_loss:.8f}]"
                        "\n" + "#" * 150 + "\n"
                    )

                    if minTPR_avg_changes <= max_changes:
                        print("Training CONVERGED. "
                            "Target Evasion Rate achieved within max changes..."
                            "TRAINING ENDS HERE #")
                        # Save generator
                        best_G_path = \
                            (gan_dir + self.save_dir + 'models/' +
                            f'G_Target_TPR_epoch_{epoch}_'
                            f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                            f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_'
                            + time.strftime("%m-%d_%H-%M-%S") + '.h5')
                        self.generator.save(best_G_path)

                        if earlystop:
                            break

            # If no better than minTPR, but still achieved target evasion, ...
            elif TPR <= minTPR_threshold:
                # output_epoch = epoch
                print(
                    "\n" + "#" * 150 + "\n"
                    f"# Target Evasion Rate {100 * (1 - TPR):.2f}% "
                    f"achieved at epoch [{epoch}] "
                    f"with avg {avg_diff_test_current:.1f} "
                    f"and max {max_diff_test_current:.1f} changes per sample "
                    f"(on the test set) ... "
                    f"GAN Loss: [{gan_loss:.8f}]"
                    "\n" + "#" * 150 + "\n"
                )

                # Save weights
                weights_path = \
                    (gan_dir + self.save_dir + 'weights/' +
                     f'GAN_minTPR_weights_epoch_{epoch}_'
                     f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                     f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_'
                     + time.strftime("%m-%d_%H-%M-%S") + '.h5')
                # self.GAN.save_weights(file_path)

                # If within max changes
                if avg_diff_test_current <= max_changes:  # check avg or max?
                    print("Target Evasion Rate achieved within max changes...")
                    # Save model
                    model_path = \
                        (gan_dir + self.save_dir + 'models/' +
                        f'GAN_Target_TPR_epoch_{epoch}_'
                        f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                        f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_'
                        + time.strftime("%m-%d_%H-%M-%S") + '.h5')
                    # self.GAN.save(model_path)
                    if earlystop:
                        break
                else:
                    print()
                    # Maybe adjust weights
                    # print("Should we adjust regulizers?")
                    # generator.layers[-2].rate *= 0.1
                    # generator.layers[-3].activity_regularizer.l1 *= 0.1
                    # generator.layers[-3].activity_regularizer.l2 *= 0.1
                    # weights = generator.get_weights()
                    # generator = keras.models.clone_model(generator)
                    # generator.set_weights(weights)
                    # Adapt regularisation weights
                    # K.set_value(l1_factor, 0.1*l1_factor)
                    # K.set_value(l2_factor, 0.1*l2_factor)

            if (epoch + 1 - output_epoch) > n_progress:
                # If no new improvement for a while, output progress
                output_epoch = epoch
                print(f"\n*** Checking progress *** "
                      f"GAN Loss: [{gan_loss:.8f}] ***")
                output_progress(epoch, TPR_train, TPR_test,
                                diff_train, diff_test)

                # Generate and plot a sample of AEs
                sample_sz = 10
                sample_noise = np.random.uniform(zmin, zmax, size=[sample_sz,
                                                                   self.z_dim])

                sample_mal = rand_batch(X_mal_batch, sample_sz)

                plot_sample(sample_mal, sample_noise, self.generator,
                            target_model, epoch, TPR_train=TPR_train,
                            TPR_test=TPR_test, params=self.log_params,
                            avg_changes=avg_diff_test_current,
                            m_label=mal_label, g_label=good_label,
                            annotate=False, out_dir=ADV_DIR, plot_id=plot_id)
                plot_id = plot_id + 1

        t2 = time.perf_counter()
        print("\n\n" + "#" * 165 + "\n"
            f"# Finished {epoch + 1} epochs in {(t2 - t1) / 60:.2f} minutes\n"
            f"# Best Evastion Rate = {100 * (1 - minTPR):.4f}% "
            f"(lowest TPR = {100 * minTPR:.4f}%) "
            f"achieved after {min_epoch + 1} epochs, with avg "
            f"{minTPR_avg_changes:.1f} \u00b1 SD({minTPR_std:.1f}) | "
            f" Q1-3  {minTPR_quantiles.astype(int).tolist()} | "
            f" and max {minTPR_max_changes:.1f} "
            f"changes per sample.\n"
            + "#" * 165 + "\n\n")

        return TPR_train, TPR_test, \
               avg_diff_train, avg_diff_test, \
               d_metrics, gan_metrics, \
               best_G_path
Example #7
data = data.drop(columns=['User_ID', 'Product_ID'])

# Input features and target names definition
in_features = data.columns.drop(['Purchase'])
target = 'Purchase'

# Training and testing split (random split)
random.seed(0)

train_id = random.sample(range(0, data.shape[0]), 440054)
test_id = list(set(np.arange(0, data.shape[0])) - set(train_id))

train_data = data.iloc[train_id, :]
test_data = data.iloc[test_id, :]

train_data["Purchase_level"] = train_data[target] > np.quantile(
    train_data[target], 0.75)
test_data["Purchase_level"] = test_data[target] > np.quantile(
    test_data[target], 0.75)

train_data["Purchase_level"] = train_data["Purchase_level"].apply(
    lambda x: int(x == True))
test_data["Purchase_level"] = test_data["Purchase_level"].apply(
    lambda x: int(x == True))

in_features = train_data.columns.drop(['Purchase_level', 'Purchase'])
target = 'Purchase_level'
# =============================================================================

time_list.append(time.time())
# =============================================================================
# =============================================================================
Example #8
File: select_data.py Project: ivvv/xmmpy
def select_data(dataframe, ccd, rawy_range=(1, 200), filter_select=None):
    """Make a selection of data from the input dataframe and return the selected dataframe
    Parameters
    ----------
        dataframe : dataframe, mandatory 
            the pandas dataframe with the Cu Kalpha fit results (from Michael Smith monitoring run). Produced by `ff_monitoring_work2.ipynb`
        ccd : int, mandatory 
            the EPIC-pn CCD number (from 1 to 12)
        rawy_range: list, optional
            the RAWY range selection, default the full CCD range (1,200)
        filter_select : str
            if not None, then a selection on filter wheel is requested, can be one of 'CalClosed', 'CalMedium', 'CalThick', 'CalThin1', 'Closed',
           'Medium', 'Thick', 'Thin1', 'Thin2'. If None, then all are selected.
    Output
    ------
        df_out, pandas dataframe
            A new dataframe with selected records, sorted on `delta_time`

    Method
    ------
        First, a selection based on CCD and RAWY range is done.
        Then further filtering is applied based on:
            * best-fit Gaussian line sigma, within the (16, 84)% quantiles
            * exposure time (>= 10 ks)
            * number of discarded lines (<= 300), only applied for FF mode.
            * best-fit line energy mean error (<= 20 eV) and neither the upper nor the lower error bar is zero.
            * if filter_select is used, then also select on filter.
        The filtering is just to discard bad or poor fit results.
        And we discard duplicates (if any) based on the `delta_time` (time in years since 2000-01-01) and finally
        sort on `delta_time`.
         
    """
    df_ccd = dataframe[(dataframe.ccd == ccd)
                       & (dataframe.rawy0 == rawy_range[0]) &
                       (dataframe.rawy1 == rawy_range[1])]
    ntot, _ = df_ccd.shape
    df_ccd.xmode = dataframe.xmode
    #
    # get the quantile distribution (16,50,84) of best fit Gaussian line sigma
    # will use the lower and upper quantiles to filter out bad fit results
    #
    qq = np.quantile(df_ccd.sigma, (0.16, 0.84))
    fwhm = np.rint(qq * SIG2FWHM).astype(int)
    qq = np.rint(qq).astype(int)
    #
    if (df_ccd.xmode == 0):
        xmode = 'FF'
        if (filter_select is not None):
            df_out = df_ccd[
                (df_ccd.ccd == ccd) & (df_ccd.expo_time >= 10000.0) &
                (df_ccd.rawy0 == rawy_range[0]) &
                (df_ccd.rawy1 == rawy_range[1]) & ((df_ccd.sigma >= qq[0]) &
                                                   (df_ccd.sigma <= qq[1])) &
                #(df_ccd.chi2r <= 3.0) & (df_ccd.dof >= 10) &
                ((df_ccd.energy_err1 + df_ccd.energy_err2) / 2.0 <= 20.0) &
                (df_ccd.ndl <= 300.0) & (df_ccd['filter'] == filter_select) &
                (df_ccd.energy_err1 * df_ccd.energy_err2 >
                 0.0)].drop_duplicates('delta_time')
        else:
            df_out = df_ccd[(df_ccd.ccd == ccd) & (df_ccd.expo_time >= 10000.0)
                            & (df_ccd.rawy0 == rawy_range[0]) &
                            (df_ccd.rawy1 == rawy_range[1]) &
                            ((df_ccd.sigma >= qq[0]) &
                             (df_ccd.sigma <= qq[1])) &
                            #(df_ccd.chi2r <= 3.0) & (df_ccd.dof >= 10) &
                            ((df_ccd.energy_err1 + df_ccd.energy_err2) / 2.0 <=
                             20.0) & (df_ccd.ndl <= 300.0) &
                            (df_ccd.energy_err1 * df_ccd.energy_err2 >
                             0.0)].drop_duplicates('delta_time')
    elif (df_ccd.xmode == 1):
        xmode = 'EFF'
        if (filter_select is not None):
            df_out = df_ccd[(df_ccd.ccd == ccd) & (df_ccd.expo_time >= 10000.0)
                            & (df_ccd.rawy0 == rawy_range[0]) &
                            (df_ccd.rawy1 == rawy_range[1]) &
                            ((df_ccd.sigma >= qq[0]) &
                             (df_ccd.sigma <= qq[1])) &
                            #(df_ccd.chi2r <= 3.0) & (df_ccd.dof >= 10) &
                            ((df_ccd.energy_err1 + df_ccd.energy_err2) / 2.0 <=
                             20.0) & (df_ccd['filter'] == filter_select) &
                            (df_ccd.energy_err1 * df_ccd.energy_err2 >
                             0.0)].drop_duplicates('delta_time')
        else:
            df_out = df_ccd[(df_ccd.ccd == ccd) & (df_ccd.expo_time >= 10000.0)
                            & (df_ccd.rawy0 == rawy_range[0]) &
                            (df_ccd.rawy1 == rawy_range[1]) &
                            ((df_ccd.sigma >= qq[0]) &
                             (df_ccd.sigma <= qq[1])) &
                            #(df_ccd.chi2r <= 3.0) & (df_ccd.dof >= 10) &
                            ((df_ccd.energy_err1 + df_ccd.energy_err2) / 2.0 <=
                             20.0) & (df_ccd.energy_err1 * df_ccd.energy_err2 >
                                      0.0)].drop_duplicates('delta_time')
        #
    else:
        print(
            f'Cannot process mode={df_ccd.xmode}, only mode=0 (FF) or mode=1 (EFF).'
        )
        return None
    #
    _ = df_out.sort_values(by='delta_time', inplace=True)
    df_out.xmode = dataframe.xmode
    #
    return df_out
Example #9
async def run(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    filterwarnings("ignore",
                   message=".*NVLink.*rmm_pool_size.*",
                   category=UserWarning)

    async with Cluster(*cluster_args, **cluster_kwargs,
                       asynchronous=True) as cluster:
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        # Use the scheduler address with an SSHCluster rather than the cluster
        # object, otherwise we can't shut it down.
        async with Client(scheduler_addr if args.multi_node else cluster,
                          asynchronous=True) as client:
            scheduler_workers = await client.run_on_scheduler(
                get_scheduler_workers)

            await client.run(
                setup_memory_pool,
                disable_pool=args.disable_rmm_pool,
                log_directory=args.rmm_log_directory,
            )
            # Create an RMM pool on the scheduler due to occasional deserialization
            # of CUDA objects. May cause issues with InfiniBand otherwise.
            await client.run_on_scheduler(
                setup_memory_pool,
                pool_size=1e9,
                disable_pool=args.disable_rmm_pool,
                log_directory=args.rmm_log_directory,
            )

            took_list = []
            for i in range(args.runs):
                took_list.append(await _run(client, args))

            # Collect, aggregate, and print peer-to-peer bandwidths
            incoming_logs = await client.run(
                lambda dask_worker: dask_worker.incoming_transfer_log)
            bandwidths = defaultdict(list)
            total_nbytes = defaultdict(list)
            for k, L in incoming_logs.items():
                for d in L:
                    if d["total"] >= args.ignore_size:
                        bandwidths[k, d["who"]].append(d["bandwidth"])
                        total_nbytes[k, d["who"]].append(d["total"])

            bandwidths = {(
                scheduler_workers[w1].name,
                scheduler_workers[w2].name,
            ): [
                "%s/s" % format_bytes(x)
                for x in np.quantile(v, [0.25, 0.50, 0.75])
            ]
                          for (w1, w2), v in bandwidths.items()}
            total_nbytes = {(
                scheduler_workers[w1].name,
                scheduler_workers[w2].name,
            ): format_bytes(sum(nb))
                            for (w1, w2), nb in total_nbytes.items()}

            print("Roundtrip benchmark")
            print("--------------------------")
            print(f"Size         | {args.size}*{args.size}")
            print(f"Chunk-size   | {args.chunk_size}")
            print(f"Ignore-size  | {format_bytes(args.ignore_size)}")
            print(f"Protocol     | {args.protocol}")
            print(f"Device(s)    | {args.devs}")
            if args.device_memory_limit:
                print(
                    f"memory-limit | {format_bytes(args.device_memory_limit)}")
            print("==========================")
            print("Wall-clock   | npartitions")
            print("--------------------------")
            for (took, npartitions) in took_list:
                t = format_time(took)
                t += " " * (12 - len(t))
                print(f"{t} | {npartitions}")
            print("==========================")
            print("(w1,w2)      | 25% 50% 75% (total nbytes)")
            print("--------------------------")
            for (d1, d2), bw in sorted(bandwidths.items()):
                fmt = ("(%s,%s)      | %s %s %s (%s)" if args.multi_node or
                       args.sched_addr else "(%02d,%02d)      | %s %s %s (%s)")
                print(fmt %
                      (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))

            if args.benchmark_json:
                bandwidths_json = {
                    "bandwidth_({d1},{d2})_{i}" if args.multi_node
                    or args.sched_addr else "(%02d,%02d)_%s" % (d1, d2, i):
                    parse_bytes(v.rstrip("/s"))
                    for (d1, d2), bw in sorted(bandwidths.items())
                    for i, v in zip(
                        ["25%", "50%", "75%", "total_nbytes"],
                        [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
                    )
                }

                with open(args.benchmark_json, "a") as fp:
                    for took, npartitions in took_list:
                        fp.write(
                            dumps(
                                dict(
                                    {
                                        "size": args.size * args.size,
                                        "chunk_size": args.chunk_size,
                                        "ignore_size": args.ignore_size,
                                        "protocol": args.protocol,
                                        "devs": args.devs,
                                        "device_memory_limit":
                                        args.device_memory_limit,
                                        "worker_threads":
                                        args.threads_per_worker,
                                        "rmm_pool": not args.disable_rmm_pool,
                                        "tcp": args.enable_tcp_over_ucx,
                                        "ib": args.enable_infiniband,
                                        "nvlink": args.enable_nvlink,
                                        "wall_clock": took,
                                        "npartitions": npartitions,
                                    },
                                    **bandwidths_json,
                                )) + "\n")

            # An SSHCluster will not automatically shut down, we have to
            # ensure it does.
            if args.multi_node:
                await client.shutdown()
        "subject_nickname",
        color='k',
        linewidth=0)  # this will be empty, hack
# now plot the datapoints, no errorbars
sns.lineplot(data=behav.loc[behav.task == 'traini', :],
             x='signed_contrast',
             y='choice2',
             marker='o',
             err_style='bars',
             color='k',
             linewidth=0,
             ci=95,
             ax=fig.ax)  # overlay the simulated
# confidence intervals from the model - shaded regions
fig.ax.fill_between(sorted(behav.signed_contrast.unique()),
                    np.quantile(np.array(simulation_basic), q=0.025, axis=0),
                    np.quantile(np.array(simulation_basic), q=0.975, axis=0),
                    alpha=0.5,
                    facecolor='k')
fig.set_axis_labels(' ', 'Rightward choices (%)')
fig.despine(trim=True)
fig.savefig(os.path.join(figpath, "figure5b_basic_psychfunc.pdf"))

# FULL TASK
plt.close('all')
fig = sns.FacetGrid(behav.loc[behav.task == 'biased', :],
                    hue="probabilityLeft",
                    palette=cmap,
                    sharex=True,
                    sharey=True,
                    height=FIGURE_HEIGHT,
Example #11
def quantile_normalize(im, low=.01, high=.99):
    im = im.astype('float32')
    tlow, thigh = np.quantile(im, low), np.quantile(im, high)
    im -= tlow
    im /= (thigh - tlow)  # scale so the high quantile maps to ~1 before clipping
    return np.clip(im, 0., 1.)
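Usage sketch on a random image; shape and scale are arbitrary choices for illustration.

import numpy as np

img = 255.0 * np.random.rand(128, 128)
img_norm = quantile_normalize(img)        # rescaled by the 1st/99th percentiles
print(img_norm.min(), img_norm.max())     # values are clipped into [0.0, 1.0]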
Example #12
def get_radius(dist: torch.Tensor, nu: float):
    """Optimally solve for radius R via the (1-nu)-quantile of distances."""
    return np.quantile(np.sqrt(dist.clone().data.cpu().numpy()), 1 - nu)
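Usage sketch; the imports are assumed by the one-liner above, and the random tensor is only a stand-in for squared distances produced by a Deep SVDD-style model.

import numpy as np
import torch

dist = torch.rand(1000)        # stand-in for squared distances to the hypersphere center
R = get_radius(dist, nu=0.1)   # radius such that roughly a nu-fraction of points fall outside
print(R)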
Example #13
     eprint("OS error:", err)
 except:
     eprint("unexpected error:", sys.exc_info()[0])
     raise
 else:
     eprint('\ndataset has', len(dataset), 'entries\n')
 # format dataset
 matrix = preprocess(dataset)
 # normalize data
 dataset_normalized = normalize(matrix)
 users_to_recommend = list(dataset_normalized.user.values)
 # split data into training and testing
 training_data, testing_data = split_data(dataset_normalized)
 num_items = len(list(set(training_data.to_dataframe().item.values)))
 # show raw data stats
 raw_variable_quantiles = np.quantile(dataset.variable.values,
                                      [0, .25, .5, .75, 1])
 eprint('\nquantiles:', raw_variable_quantiles, '\n')
 with mlflow.start_run():
     try:
         # train and store model
         model = create_model(training_data)
         # create recommendations
         recom = model.recommend(users=users_to_recommend, k=recom_n)
     except:
         eprint('run failed')
         raise
     else:
         eprint('\nsaving recommendations...\n')
         save_recom(recom)
         # calculate metrics
         eprint('\n*** calculating metrics ***\n')
Example #14
mg = load_metagraph("G", version="2020-04-01")
mg = preprocess(
    mg,
    threshold=0,
    sym_threshold=False,
    remove_pdiff=True,
    binarize=False,
    weight="weight",
)
meta = mg.meta

# plot where we are cutting out nodes based on degree
degrees = mg.calculate_degrees()
fig, ax = plt.subplots(1, 1, figsize=(5, 2.5))
sns.distplot(np.log10(degrees["Total edgesum"]), ax=ax)
q = np.quantile(degrees["Total edgesum"], 0.05)
ax.axvline(np.log10(q), linestyle="--", color="r")
ax.set_xlabel("log10(total synapses)")

# remove low degree neurons
idx = meta[degrees["Total edgesum"] > q].index
mg = mg.reindex(idx, use_ids=True)

# remove center neurons # FIXME
idx = mg.meta[mg.meta["hemisphere"].isin(["L", "R"])].index
mg = mg.reindex(idx, use_ids=True)

mg = mg.make_lcc()
mg.calculate_degrees(inplace=True)
meta = mg.meta
meta["inds"] = range(len(meta))
Example #15
    def test_large_epsilon(self):
        a = np.random.random(1000)
        res = np.quantile(a, 0.5)
        res_dp = quantile(a, 0.5, epsilon=5, bounds=(0, 1))

        self.assertAlmostEqual(float(res), float(res_dp), delta=0.01)
Example #16
#!/usr/bin/env python

import numpy as np

data = range(1000)
q = [0.01, 0.99]
res = np.quantile(data, q)
print('res = {}'.format(res))
Example #17
            if epoch >= 200: sample.append(X)

        predicts.append(numpy.mean(sample, axis=0) / N)

    return numpy.array(predicts)


for N in nlist:
    for alpha in [1.0, 0.1, 0.01]:
        predicts = gibbs_sampling(N, alpha)
        start = predicts.min()
        end = predicts.max()
        bins = 40
        step = (end - start) / bins

        plt.hist(predicts, bins=numpy.arange(start, end, step), density=True)
        plt.title("N = %d, alpha = %.2f" % (N, alpha))
        plt.legend(legend)
        plt.tight_layout()
        plt.savefig("rr-gibbs-%d-%.2f.png" % (N, alpha))
        plt.close()

        print("N=%d, alpha=%.2f, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median" %
              (N, alpha))
        print(
            numpy.vstack(([
                true_prob,
                numpy.mean(predicts, axis=0),
                numpy.std(predicts, axis=0)
            ], numpy.quantile(predicts, [0.025, 0.975, 0.5], axis=0))))
Example #18
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 18 16:48:42 2019

@author: scott
"""

import re
import numpy as np
import sys

x = []
with open(sys.argv[1], 'r') as tr:
    for line in tr:
        t = re.findall(r'\d?\.?\d+\.?\d?(?=:)|(?<=\):)\d+\.?\d+', line.strip())
        x.append(sum(map(float, t)))
        if sum(map(float, t)) > 10:
            print(line)
print(np.mean(x))
print(np.quantile(x, 0.025))
print(np.quantile(x, 0.975))
Example #19
    def fit(self, X, y, implement_fixed_controls=False, patholog_dirn=None):
        #* Requires direction of disease progression as input
        if patholog_dirn is None:
            patholog_dirn = disease_direction(X, y)

        # ####### Diagnostic
        # if patholog_dirn < 0:
        #     print('kde.py DIAGNOSTIC: fit(), Disease progresses with decreasing biomarker values - ')
        # elif patholog_dirn > 0:
        #     print('kde.py DIAGNOSTIC: fit(), Disease progresses with increasing biomarker values + ')
        # else:
        #     print('kde.py DIAGNOSTIC. fit(), ERROR: Disease direction in fit(...,patholog_dirn) must be either positive or negative. \n patholog_dirn = {0]}'.format(patholog_dirn))
        # #######

        sorted_idx = X.argsort(axis=0).flatten()
        kde_values = X.copy()[sorted_idx].reshape(-1, 1)
        kde_labels0 = y.copy()[sorted_idx]
        kde_labels = kde_labels0

        #print('Original labels')
        #print(kde_labels.astype(int))

        bin_counts = np.bincount(y).astype(float)
        mixture0 = sum(kde_labels == 0) / len(
            kde_labels)  # Prior of being a control
        mixture = mixture0
        old_ratios = np.zeros(kde_labels.shape)
        iter_count = 0
        if (self.bandwidth is None):
            #* 1. Rule of thumb
            self.bandwidth = hscott(X)
            # #* 2. Estimate full density to inform variable bandwidth: wide in tails, narrow in peaks
            # all_kde = neighbors.KernelDensity(kernel=self.kernel,
            #                                   bandwidth=self.bandwidth)
            # all_kde.fit(kde_values)
            # f = np.exp(all_kde.score_samples(kde_values))
            # #* 3. Local, a.k.a. variable, bandwidth given by eq. 3 of https://ieeexplore.ieee.org/abstract/document/7761150
            # g = stats.mstats.gmean(f)
            # alpha = 0.5 # sensitivity parameter: 0...1
            # lamb = np.power(f/g,-alpha)
        for i in range(self.n_iters):

            # print('Iteration {0}. kde_labels = {1}'.format(i,[int(k) for k in kde_labels]))

            #* Automatic variable/local bandwidth for each component: awkde package from github
            controls_kde = GaussianKDE(glob_bw="scott",
                                       alpha=self.beta,
                                       diag_cov=False)
            patholog_kde = GaussianKDE(glob_bw="scott",
                                       alpha=self.alpha,
                                       diag_cov=False)
            # controls_kde = GaussianKDE(glob_bw="scott", alpha=0.1, diag_cov=False)
            # patholog_kde = GaussianKDE(glob_bw="scott", alpha=0.1, diag_cov=False)
            controls_kde.fit(kde_values[kde_labels == 0])
            patholog_kde.fit(kde_values[kde_labels == 1])

            controls_score = controls_kde.predict(kde_values)
            patholog_score = patholog_kde.predict(kde_values)

            controls_score = controls_score * mixture
            patholog_score = patholog_score * (1 - mixture)

            ratio = controls_score / (controls_score + patholog_score)

            # print('Iteration {0}. ratio (percent) = {1}'.format(i,[int(r*100) for r in ratio]))

            #* Empirical cumulative distribution: used to swap labels for patients with super-normal values (greater/less than CDF=0.5)
            cdf_controls = np.cumsum(controls_score) / max(
                np.cumsum(controls_score))
            cdf_patholog = np.cumsum(patholog_score) / max(
                np.cumsum(patholog_score))
            cdf_diff = (cdf_patholog - cdf_controls) / (cdf_patholog +
                                                        cdf_controls)
            disease_dirn = -np.sign(np.nansum(
                cdf_diff))  # disease_dirn = -np.sign(np.mean(cdf_diff))
            if disease_dirn > 0:
                cdf_direction = 1 + cdf_diff
            else:
                cdf_direction = -cdf_diff

            #* Identify "normal" biomarkers as being on the healthy side of the controls median => flip patient labels
            if patholog_dirn < 0:
                #* More normal (greater) than half the controls: CDF_controls > 0.5
                labels_forced_normal = cdf_controls > 0.5
                labels_forced_normal_alt = kde_values > np.median(
                    kde_values[kde_labels0 == 0])
            elif patholog_dirn > 0:
                #* More normal (less)    than half the controls: CDF_controls < 0.5
                labels_forced_normal = cdf_controls < 0.5
                labels_forced_normal_alt = kde_values < np.median(
                    kde_values[kde_labels0 == 0])

            #* FIXME: Make this a prior and change the mixture modelling to be Bayesian
            #* First iteration only: implement "prior" that flips healthy-looking patients (before median for controls) to pre-event label
            #* Refit the KDEs at this point
            if i == 0:
                #* Disease direction: force pre-event/healthy-looking patients to flip
                kde_labels[np.where(labels_forced_normal_alt)[0]] = 0
                bin_counts = np.bincount(kde_labels).astype(float)
                mixture = bin_counts[0] / bin_counts.sum()
                #* Refit the KDE components. FIXME: this is copy-and-paste from above. Reimplement in a smarter way.
                controls_kde.fit(kde_values[kde_labels == 0])
                patholog_kde.fit(kde_values[kde_labels == 1])
                controls_score = controls_kde.predict(kde_values)
                patholog_score = patholog_kde.predict(kde_values)
                controls_score = controls_score * mixture
                patholog_score = patholog_score * (1 - mixture)
                ratio = controls_score / (controls_score + patholog_score)
                #* Empirical cumulative distribution: used to swap labels for patients with super-normal values (greater/less than CDF=0.5)
                cdf_controls = np.cumsum(controls_score) / max(
                    np.cumsum(controls_score))
                cdf_patholog = np.cumsum(patholog_score) / max(
                    np.cumsum(patholog_score))
                cdf_diff = (cdf_patholog - cdf_controls) / (cdf_patholog +
                                                            cdf_controls)
                disease_dirn = -np.sign(np.nansum(
                    cdf_diff))  # disease_dirn = -np.sign(np.mean(cdf_diff))
                if disease_dirn > 0:
                    cdf_direction = 1 + cdf_diff
                    # print('Disease direction is estimated to be POSITIVE')
                else:
                    cdf_direction = -cdf_diff
                    # print('Disease direction is estimated to be NEGATIVE')
                #* Identify "normal" biomarkers as being on the healthy side of the controls median => flip patient labels
                if patholog_dirn < 0:
                    #* More normal (greater) than half the controls: CDF_controls > 0.5
                    labels_forced_normal = cdf_controls > 0.5
                    labels_forced_normal_alt = kde_values > np.median(
                        kde_values[kde_labels0 == 0])
                elif patholog_dirn > 0:
                    #* More normal (less)    than half the controls: CDF_controls < 0.5
                    labels_forced_normal = cdf_controls < 0.5
                    labels_forced_normal_alt = kde_values < np.median(
                        kde_values[kde_labels0 == 0])

            if (np.all(ratio == old_ratios)):
                # print('MM finished in {0} iterations'.format(iter_count))
                break
            iter_count += 1
            old_ratios = ratio
            kde_labels = ratio < 0.5

            #* Labels to swap:
            diff_y = np.hstack(
                ([0], np.diff(kde_labels)))  # !=0 where adjacent labels differ

            if ((np.sum(diff_y != 0) >= 2) &
                (np.unique(kde_labels).shape[0] == 2)):
                split_y = int(
                    np.all(np.diff(np.where(kde_labels == 0)) == 1)
                )  # kde_label upon which to split: 1 if all 0s are adjacent, 0 otherwise
                sizes = [
                    x.shape[0] for x in np.split(diff_y,
                                                 np.where(diff_y != 0)[0])
                ]  # lengths of each contiguous set of labels

                #* Identify which labels to swap using direction of abnormality: avg(controls) vs avg(patients)
                #* Note that this is now like k-medians clustering, rather than k-means
                split_prior_smaller = (np.median(
                    kde_values[kde_labels == split_y]) < np.median(
                        kde_values[kde_labels == (split_y + 1) % 2]))
                if split_prior_smaller:
                    replace_idxs = np.arange(kde_values.shape[0])[
                        -sizes[2]:]  # greater values are swapped
                else:
                    replace_idxs = np.arange(
                        kde_values.shape[0]
                    )[:sizes[0]]  # lesser values are swapped
                kde_labels[replace_idxs] = (split_y + 1) % 2  # swaps labels

            #* Disease direction: force pre-event/healthy-looking patients to flip
            kde_labels[np.where(labels_forced_normal_alt)[0]] = 0

            #*** Prevent label swapping for "strong controls"
            fixed_controls_criteria_0 = (kde_labels0 == 0)  # Controls
            # #*** CDF criteria - do not delete: potentially also used for disease direction
            # en = 10
            # cdf_threshold = (en-1)/(en+1) # cdf(p) = en*(1-cdf(c)), i.e., en-times more patients than remaining controls
            # controls_tail = cdf_direction > (cdf_threshold * max(cdf_direction))
            # #fixed_controls_criteria_0 = fixed_controls_criteria_0 & (~controls_tail)
            # #*** PDF ratio criteria
            # ratio_threshold_strong_controls = 0.33 # P(control) / [P(control) + P(patient)]
            # fixed_controls_criteria = fixed_controls_criteria & (ratio > ratio_threshold_strong_controls) # "Strong controls"
            #*** Outlier criteria for weak (e.g., low-performing on test; or potentially prodromal in sporadic disease) controls: quantiles
            q = 0.90  # x-tiles
            if disease_dirn > 0:
                q = q  # upper
                f = np.greater
                g = np.less
                # print('Disease direction: positive')
            else:
                q = 1 - q  # lower
                f = np.less
                g = np.greater
                # print('Disease direction: negative')
            extreme_cases = f(kde_values,
                              np.quantile(kde_values,
                                          q)).reshape(-1,
                                                      1)  #& (kde_labels0==0)
            fixed_controls_criteria = fixed_controls_criteria_0.reshape(
                -1, 1) & ~(extreme_cases)
            if implement_fixed_controls:
                kde_labels[np.where(fixed_controls_criteria)[0]] = 0
                #kde_labels[np.where(controls_outliers)[0]] = 1 # Flip outlier controls

            bin_counts = np.bincount(kde_labels).astype(float)
            mixture = bin_counts[0] / bin_counts.sum()
            if (mixture < 0.10 or mixture >
                    0.90):  # if(mixture < (0.90*mixture0) or mixture > 0.90):
                # print('MM finished (mixture weight too low/high) in {0} iterations'.format(iter_count))
                break
        self.controls_kde = controls_kde
        self.patholog_kde = patholog_kde
        self.mixture = mixture
        self.iter_ = iter_count
        return self
Example #20
MZ = []
peak_no = 0
peak_clusters_no = 0
add_clusters = 0
for pc in ppm_dist_clusters(sp.peaks(), 100.0):
    if len(pc) > len(sp)*quorum and pc.peaks_from_different_spectra():
        for p in pc:
            MZ.append(p.mz)
        peak_no += len(pc)
        peak_clusters_no += 1
        add_clusters += max(pc.which_spectra().values()) - 1
MZ = np.array(MZ)
dMZ = np.diff(MZ)

last_peak_diff = (peak_no-1-peak_clusters_no)/(peak_no-1)
quantile_distance = np.quantile(2*dMZ/(MZ[1:]+MZ[:-1])*1e6,
								last_peak_diff)

P = np.linspace( 0,1,10000)
Q = np.quantile(2*dMZ/(MZ[1:]+MZ[:-1])*1e6, P)

plt.plot(Q, P)
plt.scatter(quantile_distance, last_peak_diff)
plt.show()

# what if we modified the k?
MZ = A([p.mz for p in sp.peaks()])
N = A([p.spec_no for p in sp.peaks()])
dMZ = np.diff(MZ)
peak_no = len(MZ)

last_peak_diff = (peak_no-1-peak_clusters_no)
Example #21
print("Using all the historical data, without the year 2020, for the month =", m, \
        ", during the week between days", fd, "and", ed, "the mean flow is =", flow_mean1)
print("")
print("Using the last 10 years data, without the year 2020, for the month =", m, \
        ", during the week between days", fd, "and", ed, "the mean flow is =", flow_mean2)
print("")
print("In the year", y, "for the month =", m, ", during the week between days", \
        fd, "and", ed, "the mean flow is =", flow_mean3)
print("")

# %%
# Quantiles
# historical quantiles, without the year 2020.
flow_quants1 = np.quantile(flow_data[(flow_data[:,0] != 2020) & \
        (flow_data[:,1] == m) & (flow_data[:,2] >= fd) & (flow_data[:,2] <= ed),3],\
                 q=[0,0.33,0.5,0.66,1.0])

# Quantiles since the year 2009 (the last 10 years), without the year 2020.
flow_quants2 = np.quantile(flow_data[(flow_data[:,0] != 2020) & (flow_data[:,0] >= 2009) &\
         (flow_data[:,1] == m) & (flow_data[:,2] >= fd) & (flow_data[:,2] <= ed),3],\
                  q=[0,0.33,0.5,0.66,1.0])

# Quantiles for a specific year.
flow_quants3 = np.quantile(flow_data[(flow_data[:,0] == y) & (flow_data[:,1] == m) &\
         (flow_data[:,2] >= fd) & (flow_data[:,2] <= ed),3], q=[0,0.33,0.5,0.66,1.0])

print("All years, month =", m, "week between days:", fd, "and", ed)
print("min, 33%, median, 66%, max")
print(flow_quants1)
print("")
def quartile(data):
    q1 = np.quantile(data, .25)
    q2 = np.quantile(data, .50)
    q3 = np.quantile(data, .75)
    return q1, q2, q3
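Usage sketch with a random sample:

import numpy as np

q1, q2, q3 = quartile(np.random.randn(1000))
print(q1, q2, q3)  # roughly -0.67, 0.0, 0.67 for a standard normal sample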
]

# Show
sent_topics_sorteddf_mallet.head(10)

# In[ ]:

doc_lens = [len(d) for d in df_dominant_topic.Text]

# Plot
plt.figure(figsize=(16, 7), dpi=160)
plt.hist(doc_lens, bins=1000, color='navy')
plt.text(750, 100, "Mean   : " + str(round(np.mean(doc_lens))))
plt.text(750, 90, "Median : " + str(round(np.median(doc_lens))))
plt.text(750, 80, "Stdev   : " + str(round(np.std(doc_lens))))
plt.text(750, 70, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))))
plt.text(750, 60, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))))

plt.gca().set(xlim=(0, 1000),
              ylabel='Number of Documents',
              xlabel='Document Word Count')
plt.tick_params(size=16)
plt.xticks(np.linspace(0, 1000, 9))
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()
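The four summary annotations above can be collected with a single vectorized np.quantile call instead of one call per statistic; a small sketch where doc_lens is a stand-in array of word counts:

import numpy as np

doc_lens = np.random.default_rng(3).integers(10, 1500, size=5000)  # stand-in word counts
p01, median, p99 = np.quantile(doc_lens, q=[0.01, 0.50, 0.99])
print(f'1%ile={p01:.0f}  median={median:.0f}  99%ile={p99:.0f}  mean={doc_lens.mean():.1f}')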

# In[ ]:

import seaborn as sns
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
def plot_trajectories(exper_dir, events, embeddings, word, word_step,
                      font_size):
    embs_dir = os.path.join(exper_dir, 'embs')
    tsne_output = os.path.join(exper_dir, 'visualization')
    vocabulary = os.path.join(embs_dir, 'wordIDHash.csv')
    wordlist = []
    fid = open(vocabulary, 'r')
    for line in fid:
        word_id, _word = line.strip().split(',')
        wordlist.append(_word)
    fid.close()

    word2Id = {}
    for k in range(len(wordlist)):
        word2Id[wordlist[k]] = k

    times = get_points(
        embs_dir)  # total number of time points (20/range(27) for ngram/nyt)

    emb_all = sio.loadmat(embeddings)

    emb = emb_all[f'U_{times[-1]}']
    nn = emb.shape[1]

    X = []
    list_of_words = []
    isword = []
    words_by_period = {}
    for year in times:
        emb = emb_all[f'U_{year}']
        embnrm = np.reshape(np.sqrt(np.sum(emb**2, 1)), (emb.shape[0], 1))
        emb_normalized = np.divide(emb, np.tile(embnrm, (1, emb.shape[1])))
        print(emb_normalized.shape)
        v = emb_normalized[word2Id[word], :]

        d = np.dot(emb_normalized, v)

        idx = np.argsort(d)[::-1]
        newwords = [(wordlist[k], year) for k in list(idx[:nn])]
        print(newwords)
        list_of_words.extend(newwords)
        words_by_period[year] = list(map(lambda word: word[0], newwords))
        for k in range(nn):
            isword.append(k == 0)
        X.append(emb[idx[:nn], :])

    X = np.vstack(X)
    print(X.shape)

    import matplotlib.pyplot as plt
    import pickle

    import umap
    model = umap.UMAP(n_neighbors=10,
                      min_dist=0.75,
                      metric='cosine',
                      random_state=1)
    Z = model.fit_transform(X)

    traj_fig, traj_ax = plt.subplots(1, 1)
    traj = []
    target_indexes = []
    not_target_indexes = []
    sum_of_coor = dict()
    for k in range(len(list_of_words)):
        k_word = list_of_words[k][0]  # e.g.: guayaquil
        period = list_of_words[k][
            1]  # e.g.: 0 if first week, 1 if second week, etc.
        if isword[k]:
            target_indexes.append(k)
            marker = 's'
            color = 'red' if period in events else 'dodgerblue'
            traj.append(Z[k, :])
            traj_ax.plot(Z[k, 0], Z[k, 1], marker, color=color, markersize=7)

            # plot only a few labels for clarity
            if period % word_step == 0 or period == times[-1]:
                traj_ax.text(Z[k, 0],
                             Z[k, 1],
                             f'{k_word}::{period}',
                             fontsize=font_size)
            else:
                traj_ax.text(Z[k, 0], Z[k, 1], f'{period}', fontsize=font_size)
        else:
            not_target_indexes.append(k)
            sum_of_coor[k_word] = sum_of_coor.get(k_word, np.zeros(2))
            sum_of_coor[k_word] += Z[k]

    distances = []
    for i in target_indexes:
        differences = Z[not_target_indexes] - Z[i]
        distances.extend(np.linalg.norm(differences, axis=1))
    dist_threshold = np.quantile(distances, 0.95)
    print('==', dist_threshold)

    def plot_word(word_index, k_word, list_of_words):
        period = list_of_words[word_index][
            1]  # e.g.: 0 if first week, 1 if second week, etc.
        traj_ax.plot(Z[word_index, 0],
                     Z[word_index, 1],
                     'o',
                     color='mediumseagreen')
        traj_ax.text(Z[word_index, 0],
                     Z[word_index, 1],
                     f'{k_word}::{period}',
                     fontsize=font_size)

    plot_indexes = set()
    plot_words = dict()
    for i in target_indexes:
        differences = Z[not_target_indexes] - Z[i]
        distances = np.linalg.norm(differences, axis=1)
        closest = sorted(zip(distances, not_target_indexes))
        top_threshold = 20
        for distance, word_index in closest[:top_threshold]:
            if distance < dist_threshold and not word_index in plot_indexes:
                k_word = list_of_words[word_index][0]  # e.g.: guayaquil
                if plot_words.get(k_word) is None:
                    plot_word(word_index, k_word, list_of_words)
                    plot_indexes.add(word_index)
                    plot_words[k_word] = np.array([Z[word_index]])
                else:
                    differences = plot_words[k_word] - Z[word_index]
                    distances = np.linalg.norm(differences, axis=1)
                    if distances[distances < 1].shape[0] == 0:
                        plot_word(word_index, k_word, list_of_words)
                        plot_indexes.add(word_index)
                        plot_words[k_word] = np.append(plot_words[k_word],
                                                       [Z[word_index]],
                                                       axis=0)

    traj = np.vstack(traj)
    traj_ax.plot(traj[:, 0], traj[:, 1], linewidth=2)
    plt.show()

    def get_semantic_change(vectors, metric):
        distances = []
        for i in range(1, vectors.shape[0]):
            if metric == 'euclidean':
                distance = np.linalg.norm(vectors[i] - vectors[i - 1])
            elif metric == 'cosine':
                distance = scipy.spatial.distance.cosine(
                    vectors[i], vectors[i - 1])
            distances.append(distance)
        return distances

    def plot_semantic_change(data):
        fig, ax = plt.subplots(1, 1, figsize=(15, 5))
        ax.plot(data)
        # ax.plot(acum_distances)
        ax.set_ylabel('Distance')
        ax.set_xlabel('Week')
        ax.legend(
            ['Distance between weeks', 'Cumulative distance between weeks'])
        plt.show()

    change_2d = get_semantic_change(traj, 'euclidean')
    change_50d = get_semantic_change(X[target_indexes], 'euclidean')
    change_50d_cosine = get_semantic_change(X[target_indexes], 'cosine')

    plot_semantic_change(change_2d)
    plot_semantic_change(change_50d)
    plot_semantic_change(change_50d_cosine)

    target_word_dir = os.path.join(tsne_output, word)
    if not os.path.isdir(target_word_dir):
        os.makedirs(target_word_dir)
    sio.savemat(os.path.join(target_word_dir, 'embs.mat'), {'emb': Z})
    pickle.dump({
        'words': list_of_words,
        'isword': isword
    }, open(os.path.join(target_word_dir, 'wordlist.pkl'), 'wb'))
    for period, context_words in words_by_period.items():
        lines = []
        for context_word in context_words:
            lines.append(f'{word2Id[context_word]},{context_word}\n')
        with open(
                os.path.join(target_word_dir,
                             f'closer2{word}_week_{period}.csv'), 'w') as file:
            file.writelines(lines)

    allwords = ['art', 'damn', 'gay', 'hell', 'maid', 'muslim']

    import matplotlib.pyplot as plt
    import pickle
    Z = sio.loadmat(os.path.join(target_word_dir, 'embs.mat'))['emb']
    data = pickle.load(
        open(os.path.join(target_word_dir, 'wordlist.pkl'), 'rb'))
    list_of_words, isword = data['words'], data['isword']
    plt.clf()
    traj = []

    Zp = Z * 1.
    Zp[:, 0] = Zp[:, 0] * 2.
    all_dist = np.zeros((Z.shape[0], Z.shape[0]))
    for k in range(Z.shape[0]):
        all_dist[:, k] = np.sum((Zp - np.tile(Zp[k, :], (Z.shape[0], 1)))**2.,
                                axis=1)

    dist_to_centerpoints = all_dist[:, isword]
    dist_to_centerpoints = np.min(dist_to_centerpoints, axis=1)

    dist_to_other = all_dist + np.eye(Z.shape[0]) * 1000.
    idx_dist_to_other = np.argsort(dist_to_other, axis=1)
    dist_to_other = np.sort(dist_to_other, axis=1)

    plt.clf()
    for k in range(len(list_of_words) - 1, -1, -1):

        if isword[k]:
            #if list_of_words[k][1] % 3 != 0 and list_of_words[k][1] < 199 : continue
            marker = 'bo'
            traj.append(Z[k, :])
            plt.plot(Z[k, 0], Z[k, 1], marker)
        else:
            if dist_to_centerpoints[k] > 200: continue
            skip = False
            for i in range(Z.shape[0]):
                if dist_to_other[k, i] < 150 and idx_dist_to_other[k, i] > k:
                    skip = True
                    break
                if dist_to_other[k, i] >= 150: break

            if skip: continue
            if Z[k, 0] > 8: continue
            plt.plot(Z[k, 0], Z[k, 1])

        plt.text(Z[k, 0] - 2, Z[k, 1] + np.random.randn() * 2,
                 ' %s-%d' % (list_of_words[k][0], list_of_words[k][1] * 10))

    plt.axis('off')
    traj = np.vstack(traj)
    plt.plot(traj[:, 0], traj[:, 1])
    plt.show()
Example #25
0
File: utils.py Project: potatoxia/bilby
 def upper_absolute_credible_interval(self):
     """ Absolute upper value of the credible interval """
     return np.quantile(self.samples, self._upper_level, axis=0)
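For comparison, a self-contained hedged sketch of the same idea outside any class: a symmetric credible interval computed from posterior samples with np.quantile. The names below are illustrative and not bilby's API.

import numpy as np

samples = np.random.default_rng(4).normal(loc=1.0, scale=0.5, size=(10000, 3))  # toy posterior, 3 parameters
confidence = 0.90
lower, upper = np.quantile(samples, [(1 - confidence) / 2, 1 - (1 - confidence) / 2], axis=0)
for lo, hi in zip(lower, upper):
    print(f'90% credible interval: [{lo:.3f}, {hi:.3f}]')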
Example #26
0
def _fit_biases(X, dilations, num_features_per_dilation, quantiles, seed):

    if seed is not None:
        np.random.seed(seed)

    n_instances, n_timepoints = X.shape

    # equivalent to:
    # >>> from itertools import combinations
    # >>> indices = np.array([_ for _ in combinations(np.arange(9), 3)])
    indices = np.array(
        (
            0, 1, 2,  0, 1, 3,  0, 1, 4,  0, 1, 5,  0, 1, 6,  0, 1, 7,  0, 1, 8,
            0, 2, 3,  0, 2, 4,  0, 2, 5,  0, 2, 6,  0, 2, 7,  0, 2, 8,
            0, 3, 4,  0, 3, 5,  0, 3, 6,  0, 3, 7,  0, 3, 8,
            0, 4, 5,  0, 4, 6,  0, 4, 7,  0, 4, 8,
            0, 5, 6,  0, 5, 7,  0, 5, 8,
            0, 6, 7,  0, 6, 8,
            0, 7, 8,
            1, 2, 3,  1, 2, 4,  1, 2, 5,  1, 2, 6,  1, 2, 7,  1, 2, 8,
            1, 3, 4,  1, 3, 5,  1, 3, 6,  1, 3, 7,  1, 3, 8,
            1, 4, 5,  1, 4, 6,  1, 4, 7,  1, 4, 8,
            1, 5, 6,  1, 5, 7,  1, 5, 8,
            1, 6, 7,  1, 6, 8,
            1, 7, 8,
            2, 3, 4,  2, 3, 5,  2, 3, 6,  2, 3, 7,  2, 3, 8,
            2, 4, 5,  2, 4, 6,  2, 4, 7,  2, 4, 8,
            2, 5, 6,  2, 5, 7,  2, 5, 8,
            2, 6, 7,  2, 6, 8,
            2, 7, 8,
            3, 4, 5,  3, 4, 6,  3, 4, 7,  3, 4, 8,
            3, 5, 6,  3, 5, 7,  3, 5, 8,
            3, 6, 7,  3, 6, 8,
            3, 7, 8,
            4, 5, 6,  4, 5, 7,  4, 5, 8,
            4, 6, 7,  4, 6, 8,
            4, 7, 8,
            5, 6, 7,  5, 6, 8,
            5, 7, 8,
            6, 7, 8,
        ),
        dtype=np.int32,
    ).reshape(84, 3)

    num_kernels = len(indices)
    num_dilations = len(dilations)

    num_features = num_kernels * np.sum(num_features_per_dilation)

    biases = np.zeros(num_features, dtype=np.float32)

    feature_index_start = 0

    for dilation_index in range(num_dilations):

        dilation = dilations[dilation_index]
        padding = ((9 - 1) * dilation) // 2

        num_features_this_dilation = num_features_per_dilation[dilation_index]

        for kernel_index in range(num_kernels):

            feature_index_end = feature_index_start + num_features_this_dilation

            _X = X[np.random.randint(n_instances)]

            A = -_X  # A = alpha * X = -X
            G = _X + _X + _X  # G = gamma * X = 3X

            C_alpha = np.zeros(n_timepoints, dtype=np.float32)
            C_alpha[:] = A

            C_gamma = np.zeros((9, n_timepoints), dtype=np.float32)
            C_gamma[9 // 2] = G

            start = dilation
            end = n_timepoints - padding

            for gamma_index in range(9 // 2):

                C_alpha[-end:] = C_alpha[-end:] + A[:end]
                C_gamma[gamma_index, -end:] = G[:end]

                end += dilation

            for gamma_index in range(9 // 2 + 1, 9):

                C_alpha[:-start] = C_alpha[:-start] + A[start:]
                C_gamma[gamma_index, :-start] = G[start:]

                start += dilation

            index_0, index_1, index_2 = indices[kernel_index]

            C = C_alpha + C_gamma[index_0] + C_gamma[index_1] + C_gamma[index_2]

            biases[feature_index_start:feature_index_end] = np.quantile(
                C, quantiles[feature_index_start:feature_index_end])

            feature_index_start = feature_index_end

    return biases
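To make the bias extraction above concrete: for each kernel and dilation, np.quantile is applied to the convolution output C using that kernel's slice of the quantiles vector, giving one bias per requested quantile. A tiny hedged sketch with synthetic values:

import numpy as np

rng = np.random.default_rng(5)
C = rng.normal(size=200).astype(np.float32)                      # stand-in convolution output for one kernel
quantiles_slice = np.array([0.25, 0.5, 0.75], dtype=np.float32)  # this kernel's share of the quantiles vector
biases_slice = np.quantile(C, quantiles_slice)                   # one bias value per requested quantile
print(biases_slice)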
# Measures of central tendency

salario_jogadores = [
    40000, 18000, 12000, 250000, 30000, 140000, 300000, 40000, 800000
]

np.mean(salario_jogadores)  # mean

np.std(salario_jogadores, ddof=1)  # standard deviation

np.var(salario_jogadores)  # variance

np.median(salario_jogadores)  # median

np.quantile(salario_jogadores, [0, .25, .50, .75, 1])  # quartiles
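Worth noting alongside the quartiles above: np.quantile interpolates between order statistics by default, which matters on a small, skewed sample like these nine salaries. A hedged sketch comparing interpolation rules; on NumPy >= 1.22 the keyword is method, older versions call it interpolation.

import numpy as np

salaries = [40000, 18000, 12000, 250000, 30000, 140000, 300000, 40000, 800000]

print(np.quantile(salaries, 0.25))                    # default: linear interpolation between order statistics
print(np.quantile(salaries, 0.25, method='lower'))    # nearest observation below (NumPy >= 1.22 keyword)
print(np.quantile(salaries, 0.25, method='nearest'))  # closest actual observation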

# Creating two sampling functions
''' Creating a function that takes a data frame and returns random cases
    according to the desired number for the new n. If N / n < 1, the function
    returns the first n cases of the data frame.
'''


def sortearESeguir(df, amostra):
    k = int(len(df) / amostra)
    random_n = np.random.randint(low=1, high=k + 1, size=1)
    acumulador = random_n[0]
    sorteados = []

    for i in range(amostra):
Example #28
0
File: fig1.py Project: mpaquette/axDiamFig
    bayes_mean_stat_max = np.array([s[0].minmax[1] for s in bcv])

    bayes_std_stat = np.array([s[2].statistic for s in bcv])
    bayes_std_stat_min = np.array([s[2].minmax[0] for s in bcv])
    bayes_std_stat_max = np.array([s[2].minmax[1] for s in bcv])

    ## estimating data median to get a left and right side X% interval
    interval = 0.8
    peak_diams_mean = np.zeros(fit_data.shape[0])
    lower_diams_mean = np.zeros(fit_data.shape[0])
    upper_diams_mean = np.zeros(fit_data.shape[0])

    for i in range(fit_data.shape[0]):

        peak_diams_mean[i] = bayes_mean_stat[i]
        lower_diams_mean[i] = np.quantile(
            fit_data[i][fit_data[i] <= bayes_mean_stat[i]], 1 - interval)
        upper_diams_mean[i] = np.quantile(
            fit_data[i][fit_data[i] >= bayes_mean_stat[i]], interval)

    dpi = 100
    pl.figure(figsize=(10, 10), dpi=dpi)

    jitter_intensity = 0.5
    step = (diams[1:] - diams[:-1]).mean()
    jitter = (0.5 - np.random.rand(
        Ntrial * diams.shape[0])) * step * jitter_intensity
    pl.scatter((np.repeat(diams, Ntrial) + jitter) * 1e6,
               fit_data.ravel() * 1e6,
               color='red',
               alpha=0.01,
               edgecolors="none")
def evaluate(prediction_path, country, drawing_area, covid_stats):
    pred_image = Image.open(prediction_path)
    country_image = Image.open(f"{constants.IMAGES_PATH}/{country}.jpg")
    if not pred_image.size == country_image.size:
        logging.error(
            "The size of the submitted image is not equal to the original size."
        )
        return "The size of the submitted image is not equal to the original size. Please try again."

    size = pred_image.size[1], pred_image.size[0]
    pred_data = np.sum(np.array(pred_image.getdata()), axis=1).reshape(size)
    country_data = np.sum(np.array(country_image.getdata()),
                          axis=1).reshape(size)

    x0, y0, x1, y1, x_factor, y_factor = drawing_area
    diff = np.abs(pred_data - country_data).T[int(x0):int(x1), int(y0):int(y1)]
    x_offset = x0 % 1
    y_offset = y0 % 1

    line_pixels = []
    for row in diff:
        if np.max(row) < 150:
            line_pixels.append(np.array([]))
        else:
            line_pixels.append(
                np.argwhere(row >= np.max(row) * LINE_THRESHOLD))

    for i in line_pixels:
        if len(i):
            break
    else:
        logging.error("No line was found.")
        return "No line was found. Please try again."

    thicknesses = []
    for column in line_pixels:
        if len(column) > 1:
            thicknesses.append(max(column) - min(column))
    line_thickness = np.quantile(thicknesses, 0.2)

    line = []
    for row in line_pixels:
        if not len(row):
            line.append(float("nan"))
        else:
            line.append(np.min(row) + line_thickness / 2)

    data = covid_stats.get("date", "new_cases_smoothed", location=country)

    for i in range(1, 4):
        try:
            last_date, last_value = date2num(data[-i][0]), float(data[-i][1])
            break
        except ValueError:
            logging.error(f"No data for {country} available (attempt {i}).")
    else:
        raise ValueError(f"There is no readable data for {country}.")

    raw_predictions = dict()
    raw_predictions[date2num(datetime.date.today())] = last_value
    last = None
    for i, point in enumerate(line):
        if not np.isnan(point):
            cases = (y1 - y0 - y_offset - point) * y_factor
            if cases < 0: cases = 0
            last = raw_predictions[date2num(datetime.date.today()) +
                                   (x_offset + i) * x_factor] = cases
    if not line or last is None:
        return "No line was found. Please try again."

    raw_predictions[date2num(datetime.date.today() + datetime.timedelta(
        days=charts.N_PREDICTED_DAYS))] = last
    return raw_predictions
Example #30
0
    with open(path_members) as file_member:
        for line in file_member:
            fields = line.rstrip('\n').split('\t')
            members.append(fields[1])

    # Open alignments and calculate statistics
    means = []
    stds = []
    iqrs = []
    for member in members:
        with gzip.open(dir_msa + member + '.raw_alg.faa.gz', 'rt') as file:
            MSA = AlignIO.read(file, 'fasta')
            fractions = fraction_ungapped(MSA)
            means.append(stats.tmean(fractions))
            stds.append(stats.tstd(fractions))
            iqrs.append(quantile(fractions, 0.75) - quantile(fractions, 0.25))

    # Save statistics to folder
    root, _ = os.path.splitext(os.path.basename(
        path_members))  # Get name of member file without extension
    if not os.path.exists('out/' + root):
        os.makedirs('out/' + root)  # Recursive folder creation

    with open('out/' + root + '/means.json', 'w') as file:
        json.dump(means, file)
    with open('out/' + root + '/stds.json', 'w') as file:
        json.dump(stds, file)
    with open('out/' + root + '/iqrs.json', 'w') as file:
        json.dump(iqrs, file)
"""
DEPENDENCIES
Example #31
0
    for i in range(len(region_proposals)):
        start, end, score = int(region_proposals[i][0]), int(
            region_proposals[i][1]), region_proposals[i][2]
        if np.isnan(score):
            print("i=", i, "score = ", score)
        anomaly_time_scores[start:end] += score / np.power(end - start, -0.2)
        anomaly_time_weights[start:end] += 1 / np.power(end - start, -0.2)

    anomaly_time_scores_aver = anomaly_time_scores / anomaly_time_weights
    print("np.corrcoeff time score:",
          np.corrcoef(anomaly_time_scores_aver, anomaly_level))
    np.save(args.output_score + ".npy", anomaly_time_scores_aver)

    max_f1 = 0
    for pred_th in np.linspace(np.quantile(anomaly_time_scores_aver, 0.8),
                               np.quantile(anomaly_time_scores_aver, 0.99),
                               200):
        res = eval_measure(anomaly_level,
                           anomaly_time_scores_aver,
                           test_th=0.5,
                           pred_th=pred_th)

        if res[2] > max_f1:
            max_f1 = res[2]
        print("for pred_th = ", pred_th, "res = ", res)
    print("max f1: ", max_f1)

    plt.figure(figsize=(20, 10))

    range2 = np.arange(0, data_attack.shape[0])
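A minimal hedged sketch of the threshold sweep above, with synthetic scores and labels; eval_measure is not shown in the snippet, so a plain F1 computation stands in for it here.

import numpy as np

rng = np.random.default_rng(6)
scores = rng.random(10000)                         # stand-in averaged anomaly scores
labels = (scores + 0.1 * rng.random(10000)) > 0.9  # toy ground truth, loosely tied to the scores

best_f1 = 0.0
for pred_th in np.linspace(np.quantile(scores, 0.8), np.quantile(scores, 0.99), 200):
    pred = scores >= pred_th
    tp = np.sum(pred & labels)
    precision = tp / max(pred.sum(), 1)
    recall = tp / max(labels.sum(), 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-12)
    best_f1 = max(best_f1, f1)
print('max f1:', best_f1)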
        def run(self):

            print('_____>>>', self.filepath)
            if str(self.filepath).endswith('.h5'):
                print('loading from .h5')
                file = h5py.File(self.filepath, 'r')
                print('UUUU', file.keys())
                data_esr_norm = file['esr_map']
                self.frequencies = file['frequency']
                # print('loading freq from data_subscripts')

                #
                # sub_fs = glob.glob(os.path.join(os.path.dirname(self.filepath), 'data_subscripts/*'))
                # print('sssss', sub_fs)
                #
                # print('ASAAAA', sub_fs[0])
                #
                #
                # f = glob.glob(os.path.join(os.path.dirname(self.filepath), 'data_subscripts/*'))[0]
                # data = Script.load_data(f)
                # self.frequencies = data['frequency']

            else:
                print('loading from data_subscripts')
                data_esr = []
                for f in sorted(glob.glob(os.path.join(self.filepath, './data_subscripts/*'))):
                    data = Script.load_data(f)
                    data_esr.append(data['data'])
                self.frequencies = data['frequency']

                # normalize
                norm = 'quantile'
                norm_parameter = 0.75
                if norm == 'mean':
                    norm_value = [np.mean(d) for d in data_esr]
                elif norm == 'border':
                    if norm_parameter > 0:
                        norm_value = [np.mean(d[0:norm_parameter]) for d in data_esr]
                    elif norm_parameter < 0:
                        norm_value = [np.mean(d[norm_parameter:]) for d in data_esr]
                elif norm == 'quantile':
                    norm_value = [np.quantile(d, norm_parameter) for d in data_esr]

                data_esr_norm = np.array([d / n for d, n in zip(data_esr, norm_value)])  # normalize and convert to numpy array


            # data_esr_norm = []
            # for d in data_esr:
            #     data_esr_norm.append(d / np.mean(d))

            angle = np.arange(len(data_esr_norm))
            print('<<<<<<<', self.frequencies.shape, angle.shape, data_esr_norm.shape)


            self.x_range = list(range(0, len(data_esr_norm)))

            self.status.emit('executing manual fitting')
            index = 0
            # for data in data_array:
            while index < self.NUM_ESR_LINES:
                #this must be after the draw command, otherwise plot doesn't display for some reason
                self.status.emit('executing manual fitting NV #' + str(index))
                self.plotwidget.axes.clear()
                self.plotwidget.axes.pcolor(self.frequencies, angle, data_esr_norm)
                # self.plotwidget.axes.imshow(data_esr_norm, aspect = 'auto', origin = 'lower')
                if self.interps:
                    for f in self.interps:
                        self.plotwidget.axes.plot(f(self.x_range), self.x_range)

                self.plotwidget.draw()

                while True:
                    if self.queue.empty():
                        time.sleep(.5)
                    else:
                        value = self.queue.get()
                        if value == 'next':
                            while not self.peak_vals == []:
                                self.peak_vals.pop(-1)
                            # if len(self.single_fit) == 1:
                            #     self.fits[index] = self.single_fit
                            # else:
                            #     self.fits[index] = [y for x in self.single_fit for y in x]
                            index += 1
                            self.interps.append(f)
                            break
                        elif value == 'clear':
                            self.plotwidget.axes.clear()
                            self.plotwidget.axes.imshow(data_esr_norm, aspect='auto', origin = 'lower')
                            if self.interps:
                                for f in self.interps:
                                    self.plotwidget.axes.plot(f(self.x_range), self.x_range)
                            self.plotwidget.draw()
                        elif value == 'fit':
                            # peak_vals = sorted(self.peak_vals, key=lambda tup: tup[1])
                            peak_vals = np.array(self.peak_vals)
                            print('ggggg', peak_vals.shape)
                            y, x = peak_vals[:,0], peak_vals[:, 1]


                            # y,x = list(zip(*peak_vals))
                            #
                            # print('sdasda', x)
                            #
                            # # sort the list such that points are in creasing (in case we accidently clicked below a point)
                            # y = [elem for _, elem in sorted(zip(x, y))]

                            y = y[x.argsort()]
                            x = sorted(x)

                            f = UnivariateSpline(x, y)
                            x_range = list(range(0,len(data_esr_norm)))
                            self.plotwidget.axes.plot(f(x_range), x_range)
                            self.plotwidget.draw()
                        elif value == 'prev':
                            index -= 1
                            break
                        elif value == 'skip':
                            index += 1
                            break
                        elif type(value) is int:
                            index = int(value)
                            break

            self.finished.emit()
            self.status.emit('saving')
            self.plotwidget.axes.clear()

            angle = np.arange(len(data_esr_norm))
            # print('asdadf', self.frequencies)
            self.plotwidget.axes.pcolor(self.frequencies, angle, data_esr_norm)

            # self.plotwidget.axes.imshow(data_esr_norm, aspect='auto', origin = 'lower')
            if self.interps:
                for f in self.interps:
                    self.plotwidget.axes.plot(f(self.x_range), self.x_range)
            self.save()
            self.status.emit('saving finished')
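The quantile-based normalisation branch above (norm == 'quantile') can be shown in isolation: each spectrum is divided by its 0.75 quantile so that traces with very different overall signal levels become comparable. A short hedged sketch with toy data:

import numpy as np

rng = np.random.default_rng(7)
data_esr = [rng.normal(loc=s, scale=0.1 * s, size=100) for s in (1.0, 5.0, 20.0)]  # toy spectra

norm_parameter = 0.75
norm_value = [np.quantile(d, norm_parameter) for d in data_esr]
data_esr_norm = np.array([d / n for d, n in zip(data_esr, norm_value)])
print(np.quantile(data_esr_norm, norm_parameter, axis=1))  # each row is now ~1 at its 0.75 quantile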
def create_features(seg_id, seg, X):
    xc = pd.Series(seg['acoustic_data'].values)
    zc = np.fft.fft(xc)

    X.loc[seg_id, 'mean'] = xc.mean()
    X.loc[seg_id, 'std'] = xc.std()
    X.loc[seg_id, 'max'] = xc.max()
    X.loc[seg_id, 'min'] = xc.min()

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    X.loc[seg_id, 'Rmean'] = realFFT.mean()
    X.loc[seg_id, 'Rstd'] = realFFT.std()
    X.loc[seg_id, 'Rmax'] = realFFT.max()
    X.loc[seg_id, 'Rmin'] = realFFT.min()
    X.loc[seg_id, 'Imean'] = imagFFT.mean()
    X.loc[seg_id, 'Istd'] = imagFFT.std()
    X.loc[seg_id, 'Imax'] = imagFFT.max()
    X.loc[seg_id, 'Imin'] = imagFFT.min()
    X.loc[seg_id, 'Rmean_last_5000'] = realFFT[-5000:].mean()
    X.loc[seg_id, 'Rstd_last_5000'] = realFFT[-5000:].std()
    X.loc[seg_id, 'Rmax_last_5000'] = realFFT[-5000:].max()
    X.loc[seg_id, 'Rmin_last_5000'] = realFFT[-5000:].min()
    X.loc[seg_id, 'Rmean_last_15000'] = realFFT[-15000:].mean()
    X.loc[seg_id, 'Rstd_last_15000'] = realFFT[-15000:].std()
    X.loc[seg_id, 'Rmax_last_15000'] = realFFT[-15000:].max()
    X.loc[seg_id, 'Rmin_last_15000'] = realFFT[-15000:].min()

    X.loc[seg_id, 'mean_change_abs'] = np.mean(np.diff(xc))
    X.loc[seg_id, 'mean_change_rate'] = np.mean(np.nonzero((np.diff(xc) / xc[:-1]))[0])
    X.loc[seg_id, 'abs_max'] = np.abs(xc).max()
    X.loc[seg_id, 'abs_min'] = np.abs(xc).min()

    X.loc[seg_id, 'std_first_50000'] = xc[:50000].std()
    X.loc[seg_id, 'std_last_50000'] = xc[-50000:].std()
    X.loc[seg_id, 'std_first_10000'] = xc[:10000].std()
    X.loc[seg_id, 'std_last_10000'] = xc[-10000:].std()

    X.loc[seg_id, 'avg_first_50000'] = xc[:50000].mean()
    X.loc[seg_id, 'avg_last_50000'] = xc[-50000:].mean()
    X.loc[seg_id, 'avg_first_10000'] = xc[:10000].mean()
    X.loc[seg_id, 'avg_last_10000'] = xc[-10000:].mean()

    X.loc[seg_id, 'min_first_50000'] = xc[:50000].min()
    X.loc[seg_id, 'min_last_50000'] = xc[-50000:].min()
    X.loc[seg_id, 'min_first_10000'] = xc[:10000].min()
    X.loc[seg_id, 'min_last_10000'] = xc[-10000:].min()

    X.loc[seg_id, 'max_first_50000'] = xc[:50000].max()
    X.loc[seg_id, 'max_last_50000'] = xc[-50000:].max()
    X.loc[seg_id, 'max_first_10000'] = xc[:10000].max()
    X.loc[seg_id, 'max_last_10000'] = xc[-10000:].max()

    X.loc[seg_id, 'max_to_min'] = xc.max() / np.abs(xc.min())
    X.loc[seg_id, 'max_to_min_diff'] = xc.max() - np.abs(xc.min())
    X.loc[seg_id, 'count_big'] = len(xc[np.abs(xc) > 500])
    X.loc[seg_id, 'sum'] = xc.sum()

    X.loc[seg_id, 'mean_change_rate_first_50000'] = np.mean(np.nonzero((np.diff(xc[:50000]) / xc[:50000][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_last_50000'] = np.mean(np.nonzero((np.diff(xc[-50000:]) / xc[-50000:][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_first_10000'] = np.mean(np.nonzero((np.diff(xc[:10000]) / xc[:10000][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_last_10000'] = np.mean(np.nonzero((np.diff(xc[-10000:]) / xc[-10000:][:-1]))[0])

    X.loc[seg_id, 'q95'] = np.quantile(xc, 0.95)
    X.loc[seg_id, 'q99'] = np.quantile(xc, 0.99)
    X.loc[seg_id, 'q05'] = np.quantile(xc, 0.05)
    X.loc[seg_id, 'q01'] = np.quantile(xc, 0.01)

    X.loc[seg_id, 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
    X.loc[seg_id, 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
    X.loc[seg_id, 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
    X.loc[seg_id, 'abs_q01'] = np.quantile(np.abs(xc), 0.01)

    X.loc[seg_id, 'trend'] = add_trend_feature(xc)
    X.loc[seg_id, 'abs_trend'] = add_trend_feature(xc, abs_values=True)
    X.loc[seg_id, 'abs_mean'] = np.abs(xc).mean()
    X.loc[seg_id, 'abs_std'] = np.abs(xc).std()

    X.loc[seg_id, 'mad'] = xc.mad()
    X.loc[seg_id, 'kurt'] = xc.kurtosis()
    X.loc[seg_id, 'skew'] = xc.skew()
    X.loc[seg_id, 'med'] = xc.median()

    X.loc[seg_id, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
    X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()
    X.loc[seg_id, 'classic_sta_lta1_mean'] = classic_sta_lta(xc, 500, 10000).mean()
    X.loc[seg_id, 'classic_sta_lta2_mean'] = classic_sta_lta(xc, 5000, 100000).mean()
    X.loc[seg_id, 'classic_sta_lta3_mean'] = classic_sta_lta(xc, 3333, 6666).mean()
    X.loc[seg_id, 'classic_sta_lta4_mean'] = classic_sta_lta(xc, 10000, 25000).mean()
    X.loc[seg_id, 'Moving_average_700_mean'] = xc.rolling(window=700).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_1500_mean'] = xc.rolling(window=1500).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_3000_mean'] = xc.rolling(window=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_6000_mean'] = xc.rolling(window=6000).mean().mean(skipna=True)
    ewma = pd.Series.ewm
    X.loc[seg_id, 'exp_Moving_average_300_mean'] = (ewma(xc, span=300).mean()).mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_3000_mean'] = ewma(xc, span=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_30000_mean'] = ewma(xc, span=30000).mean().mean(skipna=True)
    no_of_std = 2
    X.loc[seg_id, 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
    X.loc[seg_id, 'MA_700MA_BB_high_mean'] = (
            X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']).mean()
    X.loc[seg_id, 'MA_700MA_BB_low_mean'] = (
            X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']).mean()
    X.loc[seg_id, 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
    X.loc[seg_id, 'MA_400MA_BB_high_mean'] = (
            X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']).mean()
    X.loc[seg_id, 'MA_400MA_BB_low_mean'] = (
            X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']).mean()
    X.loc[seg_id, 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()

    X.loc[seg_id, 'iqr'] = np.subtract(*np.percentile(xc, [75, 25]))
    X.loc[seg_id, 'q999'] = np.quantile(xc, 0.999)
    X.loc[seg_id, 'q001'] = np.quantile(xc, 0.001)
    X.loc[seg_id, 'ave10'] = stats.trim_mean(xc, 0.1)

    for windows in [10, 100, 1000]:
        x_roll_std = xc.rolling(windows).std().dropna().values
        x_roll_mean = xc.rolling(windows).mean().dropna().values

        X.loc[seg_id, 'ave_roll_std_' + str(windows)] = x_roll_std.mean()
        X.loc[seg_id, 'std_roll_std_' + str(windows)] = x_roll_std.std()
        X.loc[seg_id, 'max_roll_std_' + str(windows)] = x_roll_std.max()
        X.loc[seg_id, 'min_roll_std_' + str(windows)] = x_roll_std.min()
        X.loc[seg_id, 'q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        X.loc[seg_id, 'q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        X.loc[seg_id, 'q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        X.loc[seg_id, 'q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        X.loc[seg_id, 'av_change_rate_roll_std_' + str(windows)] = np.mean(
            np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()

        X.loc[seg_id, 'ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        X.loc[seg_id, 'std_roll_mean_' + str(windows)] = x_roll_mean.std()
        X.loc[seg_id, 'max_roll_mean_' + str(windows)] = x_roll_mean.max()
        X.loc[seg_id, 'min_roll_mean_' + str(windows)] = x_roll_mean.min()
        X.loc[seg_id, 'q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X.loc[seg_id, 'q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = np.mean(
            np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()
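As an alternative to calling np.quantile on the materialised rolling windows above, pandas can compute rolling quantiles directly. A brief hedged sketch (the probability is passed positionally because the keyword name has changed across pandas versions):

import numpy as np
import pandas as pd

xc = pd.Series(np.random.default_rng(8).normal(size=10000))

x_roll_std = xc.rolling(100).std().dropna().values
print(np.quantile(x_roll_std, 0.95))             # as in the feature block above

print(xc.rolling(100).std().quantile(0.95))      # pandas equivalent on the Series
print(xc.rolling(100).quantile(0.95).mean())     # rolling 95th percentile of the raw signal itself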
Example #34
0
    r.close()
    k.close()
    return(None)


if __name__ == "__main__":
    quart = args.groups
    treelist = LoadTrees(args.treefile, quart, args.outgroup, args.dlm)
    WriteTrees(treelist)
    taxdict = AgeAndSupport(treelist, quart)
    AgeStats(taxdict, quart)
    SupportStats(taxdict, quart)
    root_height, sum_support = FilterTree(treelist)
    distlist, splist = pairwiseDistance(treelist, quart)
    l = [i[0] for i in distlist]
    print("{} {}-{}".format(np.mean(l), np.quantile(l,.05),np.quantile(l,.95)))
    if args.windows:
        WindowStats(args.windows, taxdict, quart, root_height, sum_support, splist, distlist)

##test tree
#t='(rivulorum_F790:0.1188,(((longipalpusC_551_12634:6e-09,(longipalpusC_13:6e-09,(longipalpusC_16:6e-09,longipalpusC_551_12533:6e-09)0.921:6e-09)0.909:5e-09)0.014:0.00222105,(longipalpusC_15:6e-09,(longipalpusC_12:6e-09,(longipalpusC_11:0.00112065,(longipalpusC_4:6e-09,(parensis_KwaF762:5e-09,((parensis_KwaF761:6e-09,parensis_KwaF766:6e-09)0.92:6e-09,(parensis_KwaF767:0,parensis_KwaF768:0,parensis_KwaF769:0,parensis_KwaF835:0,parensis_KwaF851:0)1:6e-09)0:6e-09)0.583:6e-09)0.292:0.00110003)0:2.27e-07)0:5e-09)0.711:2.305e-05)0.955:0.00882347,(((((vaneedeni_KwaF782:6e-09,vaneedeni_KwaF780:6e-09)0.921:6e-09,vaneedeni_KwaF774:6e-09)0:6e-09,(vaneedeni_KwaF784:6e-09,vaneedeni_KwaF783:6e-09)0.767:6e-09)0.889:5e-09,(vaneedeni_KwaF775:6e-09,(vaneedeni_KwaF773:6e-09,vaneedeni_KwaF786:0.00112541)0.367:5e-09)1:7.08e-07)0.995:0.0102093,((funestuscf_MALAF105_7:0,funestuscf_MALAF99_4:0,funestuscf_MALF98_2:0)1:0.00447164,((funestus_MozF123:6e-09,((((funestus_MozF35:0,funestus_MozF804:0,funestus_Zam281:0)1:6e-09,funestus_TanF561:6e-09)0.936:5e-09,funestus_TanF601:6e-09)0.395:6e-09,funestus_MozF29:6e-09)0.405:0.00334382)0.646:6e-09,(funestus_GhaF264:6e-09,(funestus_Ken4590:6e-09,(funestus_GhaF265:6e-09,(funestus_Ugf399:6e-09,(funestus_Ugf403:6e-09,(funestus_MozF260:6e-09,funestus_Ugf401:6e-09)0.731:5e-09)0.85:6e-09)0.133:0.00222583)0.459:6e-09)0:5e-09)0.453:6e-09)0.894:0.00222803)0.789:6e-09)0.726:0.00356974)1:0.1188);'
#tree = PhyloTree(t)
#tree.set_species_naming_function(lambda node: node.name.split("_")[0])
#tree.set_outgroup( tree&'rivulorum_F790')
##
##tree.check_monophyly(["longipalpusC"], target_attr="species")
### 0 is bool
### 2 is problem nodes
##tree.get_monophyletic(values=["longipalpusC"], target_attr="species")
#tree.remove_child(child)
#tree.prune(nodes, preserve_branch_length=True)
Example #35
0
    def evaluate(
        self,
        iter_unit,
        num_iter,
        batch_size,
        warmup_steps=50,
        log_every_n_steps=1,
        is_benchmark=False,
        export_dir=None,
    ):

        if iter_unit not in ["epoch", "batch"]:
            raise ValueError(
                '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])'
                % iter_unit)

        if self.run_hparams.data_dir is None and not is_benchmark:
            raise ValueError('`data_dir` must be specified for evaluation!')

        if hvd_utils.is_using_hvd() and hvd.rank() != 0:
            raise RuntimeError('Multi-GPU inference is not supported')

        estimator_params = {}

        image_classifier = self._get_estimator(
            mode='validation',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
            gpu_id=self.run_hparams.gpu_id)

        if self.run_hparams.data_dir is not None:
            filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
                data_dir=self.run_hparams.data_dir,
                mode="validation",
                iter_unit=iter_unit,
                num_iter=num_iter,
                global_batch_size=batch_size,
            )

        else:
            num_epochs = 1
            num_decay_steps = -1
            num_steps = num_iter

        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            idx_filenames = runner_utils.parse_dali_idx_dataset(
                data_idx_dir=self.run_hparams.data_idx_dir, mode="validation")

        eval_hooks = []

        if hvd.rank() == 0:
            self.eval_logging_hook = hooks.BenchmarkLoggingHook(
                global_batch_size=batch_size,
                warmup_steps=warmup_steps,
                logging_steps=log_every_n_steps)
            eval_hooks.append(self.eval_logging_hook)

            print('Starting Model Evaluation...')
            print("Evaluation Epochs", num_epochs)
            print("Evaluation Steps", num_steps)
            print("Decay Steps", num_decay_steps)
            print("Global Batch Size", batch_size)

        def evaluation_data_fn():

            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    print("Using DALI input... ")

                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=False,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            elif self.run_hparams.data_dir is not None:
                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=False,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            else:
                print("Using Synthetic Data ...\n")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )

        try:
            eval_results = image_classifier.evaluate(
                input_fn=evaluation_data_fn,
                steps=num_steps,
                hooks=eval_hooks,
            )

            eval_throughput = self.eval_logging_hook.mean_throughput.value()
            eval_latencies = np.array(self.eval_logging_hook.latencies) * 1000
            eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
            eval_latencies_mean = np.mean(eval_latencies)

            dllogger.log(data={
                'top1_accuracy': float(eval_results['top1_accuracy']),
                'top5_accuracy': float(eval_results['top5_accuracy']),
                'eval_throughput': eval_throughput,
                'eval_latency_avg': eval_latencies_mean,
                'eval_latency_p90': eval_latencies_q[0],
                'eval_latency_p95': eval_latencies_q[1],
                'eval_latency_p99': eval_latencies_q[2],
            },
                         step=tuple())

            if export_dir is not None:
                dllogger.log(data={'export_dir': export_dir}, step=tuple())
                input_receiver_fn = data_utils.get_serving_input_receiver_fn(
                    batch_size=None,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    dtype=self.run_hparams.dtype)

                image_classifier.export_savedmodel(export_dir,
                                                   input_receiver_fn)

        except KeyboardInterrupt:
            print("Keyboard interrupt")

        print('Model evaluation finished')
Example #36
0
 def _uniform_sampler_(self, data, size, ax=-1):
     shape = np.mean(data, ax).shape + (size, )
     return lambda: np.quantile(data, 0.1, axis=-1)[
         ..., None] * np.random.rand(*list(shape)) + np.quantile(
             data, 0.9, axis=-1)[..., None]
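Read literally, the lambda above scales the 0.1 quantile by uniform noise and then adds the 0.9 quantile. If the intent is to sample uniformly between the two quantiles along the last axis, a hedged sketch of that variant (purely an assumption about the intent) would be:

import numpy as np

def uniform_between_quantiles(data, size, axis=-1):
    # draw `size` samples per slice, uniformly between the 0.1 and 0.9 quantiles along `axis`
    lo = np.quantile(data, 0.1, axis=axis)[..., None]
    hi = np.quantile(data, 0.9, axis=axis)[..., None]
    shape = lo.shape[:-1] + (size,)
    return lo + (hi - lo) * np.random.rand(*shape)

samples = uniform_between_quantiles(np.random.randn(4, 1000), size=5)
print(samples.shape)  # (4, 5)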
Example #37
0
geo_location_map_lat= geo_location_zip['Latitude']
geo_location_map_lat= geo_location_map_lat.to_dict()
geo_location_map_long = geo_location_zip['Longitude']
geo_location_map_long = geo_location_map_long.to_dict()
nyc_data['Longitude'] = nyc_data['ZIP CODE'].map(geo_location_map_long)
nyc_data['Latitude'] = nyc_data['ZIP CODE'].map(geo_location_map_lat)

# for visualization
nyc_data.loc[nyc_data['BOROUGH'] == 1, 'BOROUGH'] = 'Manhattan'
nyc_data.loc[nyc_data['BOROUGH'] == 2, 'BOROUGH'] = 'Bronx'
nyc_data.loc[nyc_data['BOROUGH'] == 3, 'BOROUGH'] = 'Brooklyn'
nyc_data.loc[nyc_data['BOROUGH'] == 4, 'BOROUGH'] = 'Queens'
nyc_data.loc[nyc_data['BOROUGH'] == 5, 'BOROUGH'] = 'Staten Island'

# create bins
bins = np.quantile(nyc_data['SALE PRICE'], [0.2, 0.4, 0.5, 0.6, 0.8, 1.0])
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
nyc_data['SALE_PRICE_BIN'] = pd.cut(nyc_data['SALE PRICE'], bins=bins, labels=labels, include_lowest=False)

# visualization
plt.style.use('ggplot')
f, (ax1, ax2)= plt.subplots(2, figsize = [12,12])
fig1 = sns.scatterplot(x = 'Longitude', y = 'Latitude', hue = 'BOROUGH',
                style = 'BOROUGH',data=nyc_data,ax=ax1)
fig1.set_title('GEO REAL ESTATE MAP BY DISTRICT')
fig1.legend(loc = 'upper left')
fig2 = sns.scatterplot(x = 'Longitude', y = 'Latitude', hue = 'SALE_PRICE_BIN',
                style = 'SALE_PRICE_BIN',data=nyc_data, ax= ax2)
fig2.set_title('GEO REAL ESTATE MAP BY PRICE')
fig2.legend(loc = 'upper left')
plt.show()
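For quantile-based binning like the above, pandas also offers pd.qcut, which computes the quantile edges itself. A short hedged sketch on a stand-in price column:

import numpy as np
import pandas as pd

prices = pd.Series(np.random.default_rng(9).lognormal(mean=13, sigma=1, size=1000), name='SALE PRICE')
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
price_bin = pd.qcut(prices, q=5, labels=labels)   # 5 equal-frequency bins from the empirical quantiles
print(price_bin.value_counts())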
Example #38
0
def array_quantile_global(arr, q):
    return np.quantile(arr, q)
Example #39
0
								if rls:
									receiveTime = int(rls[0].split()[0])
									delay = (receiveTime - sendTime) / 1000000.0
									delays.append(delay)
							if len(delays) != len(receiveLines):
								print("warning: did not find delay for all packets")
							if delays:
								avgdelays.append(np.mean(delays))		

					if pdrs:
						print(prot, pccr, ptbi, len(pdrs))
						pdrdata[(prot, pccr, ptbi)][0].append(x);
						pdrdata[(prot, pccr, ptbi)][1].append(np.mean(pdrs))
						pdrdata[(prot, pccr, ptbi)][2].append(np.mean(recs))
						#pdrdata[(prot, pccr, ptbi)][3].append(2*np.std(pdrs))
						pdrdata[(prot, pccr, ptbi)][3].append(np.quantile(pdrs, 0.05))
						pdrdata[(prot, pccr, ptbi)][4].append(np.quantile(pdrs, 0.95))
						
					if senderdcs:
						senderdcdata[(prot, pccr, ptbi)][0].append(x)
						senderdcdata[(prot, pccr, ptbi)][1].append(np.mean(senderdcs))
						#senderdcdata[(prot, pccr, ptbi)][2].append(2*np.std(senderdcs))
						senderdcdata[(prot, pccr, ptbi)][2].append(np.quantile(senderdcs, 0.05))
						senderdcdata[(prot, pccr, ptbi)][3].append(np.quantile(senderdcs, 0.95))
						recvrdcdata[(prot, pccr, ptbi)][0].append(x)
						recvrdcdata[(prot, pccr, ptbi)][1].append(np.mean(recvrdcs))
						#recvrdcdata[(prot, pccr, ptbi)][2].append(2*np.std(recvrdcs))
						recvrdcdata[(prot, pccr, ptbi)][2].append(np.quantile(recvrdcs, 0.05))
						recvrdcdata[(prot, pccr, ptbi)][3].append(np.quantile(recvrdcs, 0.95))
						
					if avgdelays: