Example #1
def get_sms_feats(df):
    df['request_datetime'] = pd.to_datetime(df['request_datetime'])
    df["hour"] = df['request_datetime'].dt.hour
    df["day"] = df['request_datetime'].dt.day

    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # number of distinct contacts and number of messages
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(sms_count="count",
                                                        sms_nunique="nunique")
    tmp["sms_rate"] = tmp["sms_count"] / tmp["sms_nunique"]
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    """短信下行比例
    """
    calltype2 = df[df["calltype_id"] == 2].copy()
    calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(
        calltype_2="count")
    phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
    phone_no_m[
        "calltype_rate"] = phone_no_m["calltype_2"] / phone_no_m["sms_count"]
    """短信时间
    """
    tmp = df.groupby("phone_no_m")["hour"].agg(
        hour_mode=lambda x: stats.mode(x)[0][0],
        hour_mode_count=lambda x: stats.mode(x)[1][0],
        hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    tmp = df.groupby("phone_no_m")["day"].agg(
        day_mode=lambda x: stats.mode(x)[0][0],
        day_mode_count=lambda x: stats.mode(x)[1][0],
        day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    return phone_no_m
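Most snippets on this page read the result of scipy.stats.mode through [0][0] (the mode value) and [1][0] (its count). A minimal compatibility sketch, assuming you might run them on a newer SciPy (keepdims was added in 1.9, and 1.11 made scalar results the default, which breaks that indexing):

import numpy as np
from scipy import stats

x = np.array([1, 2, 2, 3])

# SciPy < 1.9: stats.mode(x) returns length-1 arrays, so [0][0] / [1][0]
# extract the mode value and its count (the pattern used in these examples).

# SciPy >= 1.9: keepdims=True reproduces the old array-shaped result.
res = stats.mode(x, keepdims=True)
mode_value, mode_count = res.mode[0], res.count[0]

# SciPy >= 1.11 default (keepdims=False): plain scalars, no extra indexing.
res = stats.mode(x)
mode_value, mode_count = res.mode, res.count
print(mode_value, mode_count)  # 2 2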
Example #2
 def actionUpdate(self):
     x = np.arange(-np.pi, np.pi, np.pi / 10)
     y = eval(str(self.ui.lineEdit.text()))
     self.mpl.axes.plot(x, y, '--rx', linewidth=2)
     self.mpl.axes.set_title('Sine Function')
     self.mpl.draw()
     print(stats.mode([1, 2, 3, 3, 4, 5]))
Example #3
 def actionUpdate(self):
     x = np.arange(-np.pi, np.pi, np.pi / 10)
     if str(self.ui.lineEdit.text()) != "":
         y = eval(str(self.ui.lineEdit.text()))
         self.mpl.axes.plot(x, y, '--rx', linewidth=2)
         self.mpl.axes.set_title('Sine Function')
         self.mpl.draw()
         print(stats.mode([1, 2, 3, 3, 4, 5]))
Example #4
    def stat_mode(df, cate_fea, num_fea):
        aim_ = pd.concat([
            df.groupby(cate_fea)[num_].agg(
                {
                    num_ + '_mode': lambda x: stats.mode(x)[0][0],
                    num_ + '_mode_count': lambda x: stats.mode(x)[1][0]
                }) for num_ in num_fea
        ],
                         axis=1)
        aim_.reset_index(inplace=True)

        return aim_
Example #5
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min())/2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
        ]
    return output
Example #6
def arrayStatistics(numpy_array, missing_value=N.nan):
    if N.isfinite(missing_value):
        valid_values = numpy_array[N.where(numpy_array!=missing_value)]
        if numpy_array.dtype.kind == 'f':
            valid_values = valid_values[N.where(N.isfinite(valid_values))]
    else:
        valid_values = numpy_array[N.where(N.isfinite(numpy_array))]

    if len(valid_values) > 0:
        statistics =  { 'min' : N.min(valid_values),
                        'max' : N.max(valid_values),
                        'mean' : N.mean(valid_values),
                        'stddev' : N.std(valid_values),
                        'median' : N.median(valid_values),
                        'mode' : scipy_stats.mode(valid_values),
                        'missing' : len(numpy_array) - len(valid_values),
                      }
    else:
        statistics =  { 'min' : missing_value, 'max' : missing_value,
                        'mean' : missing_value, 'stddev' : 0.0,
                        'median' : missing_value,
                        'mode' : ( N.array([missing_value,]),
                                   N.array([len(numpy_array),]) ),
                        'missing' : len(numpy_array),
                      }
    return statistics
Example #7
def knn_classifier(X_train, y_train, X_validation, X_test, k):
    # Returns the labels for the validation and test data, predicted by the k-NN classifier trained on X_train and y_train
    # Input:
    # X_train - num_train x num_features matrix with features for the training data
    # y_train - num_train x 1 vector with labels for the training data
    # X_validation - num_test x num_features matrix with features for the validation data
    # X_test - num_test x num_features matrix with features for the test data
    # k - Number of neighbors to take into account
    # Output:
    # y_pred_validation - num_test x 1 predicted vector with labels for the validation data
    # y_pred_test - num_test x 1 predicted vector with labels for the test data

    X_test_val = np.vstack((X_validation, X_test))
    # Compute standardized euclidian distance of validation and test points to the other points
    D = cdist(X_test_val, X_train, metric='seuclidean')
    # Sort distances per row and return array of indices from low to high
    sort_ix = np.argsort(D, axis=1)
    # Get the k smallest distances
    sort_ix_k = sort_ix[:, :k]
    predicted_labels = y_train[sort_ix_k]
    # Predictions for each point is the mode of the K labels closest to the point
    predicted_labels = mode(predicted_labels, axis=1)[0]
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    
    return y_pred_validation, y_pred_test
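A small self-contained call of knn_classifier on random data, just to check the flow and output shapes; the feature dimensions and k below are illustrative assumptions, not values from the original project:

import numpy as np
from scipy.spatial.distance import cdist  # used inside knn_classifier
from scipy.stats import mode              # used inside knn_classifier

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 5))
y_train = rng.integers(0, 3, size=100)
X_validation = rng.normal(size=(20, 5))
X_test = rng.normal(size=(30, 5))

y_val_pred, y_test_pred = knn_classifier(X_train, y_train, X_validation, X_test, k=5)
# (20, 1) and (30, 1) with array-returning SciPy; (20,) and (30,) on SciPy >= 1.11
print(y_val_pred.shape, y_test_pred.shape)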
Example #8
def get_diff(set_val,time_slots,num_type,conf_lev):

    time_slots_utc = dtime_to_unix(time_slots)
    TIMELET_INV_seconds = (time_slots[1]-time_slots[0]).seconds
    diff_mean = list()

    for r, utc_t in enumerate(time_slots_utc):
        utc_t_s = utc_t
        utc_t_e = utc_t + TIMELET_INV_seconds
        idx = np.nonzero((set_val[0] >= utc_t_s) & (set_val[0] < utc_t_e))[0]

        if len(idx) < 2:
            diff_val = np.inf
        else:
            temp_val = abs(np.diff(set_val[1][idx]))
            upper_val = np.sort(temp_val)[int(np.floor(len(temp_val)*conf_lev)):]
            if len(upper_val) == 0:
                diff_val = np.inf
            else:
                if num_type == FLOAT_TYPE:
                    diff_val = np.mean(upper_val)
                elif num_type == INT_TYPE:
                    diff_val = int(stats.mode(upper_val)[0])
                else:
                    log.error('Sample type must be either INT or FLOAT')
                    raise NameError('Sample type must be either INT or FLOAT')

            #diff_val=max(abs(diff(set_val[1][idx])))
            #sort(abs(diff(set_val[1][idx])))[::-1]

        diff_mean.append(diff_val)

    #diff_mean=np.array(diff_mean)[:,np.newaxis]
    diff_mean = np.array(diff_mean)
    return diff_mean
Example #9
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min()) / 2),
        present("Range",
                np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode",
                stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value",
                normaltest(np_values)[1])
    ]
    return output
Example #10
def predict(clf2, test_set):
    uid = pd.DataFrame()
    # test_set = processing(trainSpan=(1, 30), label=False)
    uid["user_id"] = test_set["user_id"]
    test_set = test_set.drop(labels=["user_id"], axis=1)
    # if isinstance(selector,RFECV):
    #     test_set_new = selector.transform(test_set.values)
    # elif isinstance(selector,list):
    #     test_set_new = test_set[selector]
    # else:
    #     test_set_new = test_set
    print("begin to make predictions")
    res = clf2.predict(test_set.values)
    uid["y_hat"] = pd.Series(res)
    uid["label"] = uid.groupby(by=["user_id"])["y_hat"].transform(lambda x: stats.mode(x)[0][0])
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    uid_file = "result/uid_" + str_time + ".csv"
    uid.to_csv(uid_file,header=True,index=False)
    active_users = (uid.loc[uid["label"] == 1]).user_id.unique().tolist()
    print(len(active_users))
    print(active_users)
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    submission_file = "result/submission_" + str_time + ".csv"
    with open(submission_file, "a", newline="") as f:
        writer = csv.writer(f)
        for i in active_users:
            writer.writerow([i])
Example #11
    def fit_predict(self, X, y, **fit_params):
        self.X_ = X
        self.y_ = y

        # To keep the computation vectorized with NumPy, predictions on the training set itself are obtained by rolling the training set.
        X_roll = X
        distances = np.zeros((len(X), len(X) - 1))
        for i in range(len(self.X_) - 1):
            X_roll = np.roll(X_roll, -1, axis=0)
            # Compute pairwise distances by rolling the whole sample matrix
            distances[:, i] = np.power(
                np.sum(np.power(X - X_roll, self.p), axis=1), 1 / self.p)
        min_k_ind = np.argpartition(distances, self.k, axis=1)[:, :self.k]
        # distances[i, j] is the distance between x[i] and x[(i + j + 1) % len(X)], so the indices must be shifted back
        neighbors_ind = (min_k_ind +
                         np.arange(1,
                                   len(X) + 1).reshape(-1, 1)) % len(X)
        neighbors_labels = y[neighbors_ind]

        # Uniform-weight prediction
        if self.weights == 'uniform':
            return stats.mode(neighbors_labels, axis=1).mode.flatten()

        # Weighted prediction
        neighbors_distances = np.vstack(
            (distances[i][min_k_ind[i]] for i in range(len(X))))
        # Inverse-distance weights
        if self.weights == 'inverse':
            weights_ = 1 / neighbors_distances
        # Gaussian function of the distance
        if self.weights == 'Gaussian':
            weights_ = np.exp(-np.square(neighbors_distances) / 2)
        return self.weighted_predict(neighbors_labels, weights_)
Example #12
 def extractfromfits(self, filename, loc, size, sky=0.):
     """
     Extracts a subregion from a fits file  and converts it according
     to the astro and math convention: pixel (0,0) is at the bottom left
     """
     x, y = loc
     radius = int(size/2)
     r = size-radius * 2
     hdulist = pyfits.open(filename)  # open a FITS file
     if len(hdulist) != 1:
         raise RuntimeError("extractfromfits : len(hdulist) > 1 not allowed")
     fulldata = hdulist[0].data       # assumes the first extension is an image
     if x < 0 or y < 0 or x >= fulldata.shape[1] or y >= fulldata.shape[0]:
         raise RuntimeError("extractfromfits : bad extraction parameters")
     if x+radius+r >= fulldata.shape[1] or y+radius+r >= fulldata.shape[0] or x-radius < 0 or y-radius < 0:
         #TODO: set outside pixels to NaN
         print("outside")
     self.array = self.array + np.zeros(self.array.shape, dtype=np.float64)    # switch to 8 byte   
     self.array = fulldata[y-radius:y+radius+r, x-radius:x+radius+r].transpose()            # get values from the subsection 
     #    This transposition makes the pixel array coordinates (x,y) equal to those in the ds9 display etc.
     #    In other words, we are in the math and astro convention.
     #    x = horizontal, y = vertical, (0, 0) is bottom left.
     self.array[np.where(np.isnan(self.array))] = sky
     hdulist.close()
     if sky is None:
         #TODO: check...
         self.array -= stats.mode(self.array.ravel())[0][0]
     else:
         self.array -= sky
     self.setzscale()
Example #13
 def create_dataset(X, y, time_steps=1, step=1):
     Xs, ys = [], []
     for i in range(0, len(X) - time_steps, step):
         v = X.iloc[i:(i + time_steps)].values
         labels = y.iloc[i:i + time_steps]
         Xs.append(v)
         ys.append(stats.mode(labels)[0][0])
     return np.array(Xs), np.array(ys).reshape(-1, 1)
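A hedged usage sketch for create_dataset on a toy two-channel signal; the column names, window length, and step are assumptions, and the stats.mode(labels)[0][0] call inside the helper relies on the older array-returning mode (see the compatibility note under Example #1):

import numpy as np
import pandas as pd
from scipy import stats  # used inside create_dataset

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "x": rng.normal(size=100),
    "y": rng.normal(size=100),
    "activity": rng.integers(0, 2, size=100),
})

X_windows, y_windows = create_dataset(df[["x", "y"]], df["activity"],
                                      time_steps=10, step=5)
print(X_windows.shape, y_windows.shape)  # (18, 10, 2) (18, 1)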
Example #14
 def unsupervised_habitat_class_modes(self):
     hcm = {}
     for hab in self.habitats:
         md, cn = mode( self.unsupervised_habitat_class_dict[hab] )
         if len( md )==1:
             hcm[hab] = md[0]
         else:
             hcm[hab] = None
     return hcm
Example #15
 def unsupervised_habitat_class_modes(self):
     hcm = {}
     for hab in self.habitats:
         md, cn = mode(self.unsupervised_habitat_class_dict[hab])
         if len(md) == 1:
             hcm[hab] = md[0]
         else:
             hcm[hab] = None
     return hcm
Example #16
def calculate_weighted_loss(data):
    weighted_loss = 0
    if data.shape[0] != 0:
        label_column = data[:, -1]
        label_column = np.array(label_column.tolist())
        y_prediction_mode = stats.mode(label_column, axis=0)[0][0]
        y_prediction = [y_prediction_mode] * label_column.shape[0]
        weighted_loss = ch.custom_weighted_loss(label_column, np.array(y_prediction))
    return weighted_loss
Example #17
    def Aggregate(self,
                  sourceRaster,
                  outFName,
                  method=None,
                  numSourcePerTarget=10):

        tmpOutput = os.path.splitext(outFName)[0] + ".tif"
        tmpOutDataset = SpatialUtilities.SAHMRaster(tmpOutput)
        tmpOutDataset.pullParamsFromRaster(self.templateRaster.source)
        tmpOutDataset.createNewRaster()

        rows = int(sourceRaster.height)
        cols = int(sourceRaster.width)

        # loop of 'blocks' of data maybe.
        bSize = 2048  # source pixels
        # convert this to the nearest whole number of target pixels
        bSize = int(round(bSize / numSourcePerTarget) * numSourcePerTarget)
        if bSize == 0:
            bSize = int(numSourcePerTarget)

        for i in range(0, rows, bSize):
            if i + bSize < rows:
                numRows = bSize
            else:
                numRows = rows - i

            for j in range(0, cols, bSize):
                if j + bSize < cols:
                    numCols = bSize
                else:
                    numCols = cols - j

                data = sourceRaster.getBlock(j, i, numCols, numRows)

                if method is None:
                    method = "Mean"
                if method in ["Mean", "Max", "Min", "STD"]:
                    ans = self.rebin(data, (numRows // numSourcePerTarget,
                                            numCols // numSourcePerTarget),
                                     method)
                else:
                    X, Y = data.shape
                    x = X // numSourcePerTarget
                    y = Y // numSourcePerTarget
                    ndMask = data.reshape(
                        (x, numSourcePerTarget, y, numSourcePerTarget))
                    ndMask = ndMask.transpose([0, 2, 1, 3])
                    ndMask = ndMask.reshape(
                        (x * y, numSourcePerTarget * numSourcePerTarget))
                    ans = np.array(stats.mode(ndMask, 1)[0]).reshape(x, y)

                tmpOutDataset.putBlock(ans, int(j / numSourcePerTarget),
                                       int(i / numSourcePerTarget))

        tmpOutDataset.calcStats()
        tmpOutDataset.close()
Example #18
def HMM(X, Y, human_bounds):
    """Fit hidden markov model.

       Fit HMM to average data and cross-validate with the left-out subject using
       within-song and between-song average correlations.

       Parameters
       ----------
       X: voxel by time ndarray (2D)
       Y: voxel by time ndarray (2D)
       human_bounds: human-annotated event boundaries

       Returns
       -------
       within_across: within-event minus across-event correlation for the real
       and permuted event boundaries
    """

    # Fit to all but one subject
    nPerm = 1000
    within_across = np.zeros(nPerm + 1)
    K = len(human_bounds) + 1
    nTR = X.shape[1]

    # create events vector out of human bounds
    add_zero = np.append([0], human_bounds)
    full_human_bounds = np.append(add_zero, [nTR])
    diff_bounds = np.diff(full_human_bounds)
    events = np.zeros((nTR))

    for l in range(len(diff_bounds)):
        events[full_human_bounds[l]:full_human_bounds[l + 1]] = l

    max_event_length = stats.mode(events)[1][0]

    # compute timepoint by timepoint correlation matrix
    cc = np.corrcoef(Y.T)  # Should be a time by time correlation matrix

    # Create a mask to only look at values up to max_event_length
    local_mask = np.zeros(cc.shape, dtype=bool)
    for k in range(1, max_event_length):
        local_mask[np.diag(np.ones(cc.shape[0] - k, dtype=bool), k)] = True

    for p in range(nPerm + 1):
        same_event = events[:, np.newaxis] == events
        within = fisher_mean(cc[same_event * local_mask])
        across = fisher_mean(cc[(~same_event) * local_mask])
        within_across[p] = within - across

        np.random.seed(p)
        events = np.zeros(nTR, dtype=int)
        events[np.random.choice(nTR, K - 1, replace=False)] = 1
        events = np.cumsum(events)

    return within_across
Example #19
def HMM(X, Y, human_bounds):
    """Fit hidden markov model.

       Fit HMM to average data and cross-validate with the left-out subject using
       within-song and between-song average correlations.

       Parameters
       ----------
       X: voxel by time ndarray (2D)
       Y: voxel by time ndarray (2D)
       human_bounds: human-annotated event boundaries

       Returns
       -------
       within_across: within-event minus across-event correlation for the real
       and permuted event boundaries
       bounds: event boundaries derived from the fitted HMM
    """

    # Fit to all but one subject
    nPerm = 1000
    within_across = np.zeros(nPerm + 1)
    K = len(human_bounds) + 1
    nTR = X.shape[1]
    ev = brainiak.eventseg.event.EventSegment(K,
                                              split_merge=True,
                                              split_merge_proposals=3)
    ev.fit(X.T)
    events = np.argmax(ev.segments_[0], axis=1)
    bounds = np.where(np.diff(np.argmax(ev.segments_[0], axis=1)))[0]
    _, event_lengths = np.unique(events, return_counts=True)
    max_event_length = stats.mode(events)[1][0]

    # compute timepoint by timepoint correlation matrix
    cc = np.corrcoef(Y.T)  # Should be a time by time correlation matrix

    # Create a mask to only look at values up to max_event_length
    local_mask = np.zeros(cc.shape, dtype=bool)
    for k in range(1, max_event_length):
        local_mask[np.diag(np.ones(cc.shape[0] - k, dtype=bool), k)] = True

    for p in range(nPerm + 1):
        same_event = events[:, np.newaxis] == events
        within = fisher_mean(cc[same_event * local_mask])
        across = fisher_mean(cc[(~same_event) * local_mask])
        within_across[p] = within - across

        np.random.seed(p)
        perm_lengths = np.random.permutation(event_lengths)
        events = np.zeros(nTR, dtype=int)
        events[np.cumsum(perm_lengths[:-1])] = 1
        events = np.cumsum(events)

    return within_across, bounds
Example #20
def test_kpi_level_model(predict_result_file, final_result_file):
    df = pd.read_csv(predict_result_file, sep=',', dtype=str)
    df = df[df['predict_event'] == '1']
    mapping_dict = {'Biz': 0, 'Mon': 1, 'Ora': 2, 'Trd': 3, 'Other': 4}
    knn_model_list = []
    knn_model_list = kpi_level_model.test_KNN_model(cluster_data_dir)
    all_df = pd.DataFrame(columns=[
        'alertgroup', 'classifier', 'hostname', 'predict_event',
        'predict_level'
    ])
    for alertgroup, group in df.groupby('alertgroup'):
        column_list = [
            'cpu_max', 'cpu_min', 'mem_max', 'mem_min', 'cpu_max_1',
            'cpu_min_1', 'mem_max_1', 'mem_min_1', 'cpu_max_2', 'cpu_min_2',
            'mem_max_2', 'mem_min_2'
        ]
        data = group[column_list]
        kpi_predict_result = []
        for i in knn_model_list:
            kpi_predict_result.append(i.predict(data))
        print(kpi_predict_result)
        predict_results = np.zeros(len(group))
        df_res = pd.DataFrame(columns=['predict_level'])
        for idx in range(len(group)):
            sample_predict_vec = np.array([
                np.round(kpi_predict_result[0][idx]),
                np.round(kpi_predict_result[1][idx]),
                np.round(kpi_predict_result[2][idx]),
                np.round(kpi_predict_result[3][idx]),
                np.round(kpi_predict_result[4][idx])
            ])
            # print(sample_predict_vec)
            mode_prediction_res = stats.mode(sample_predict_vec)[0][0]  # mode of the 5 models' predictions
            print(mode_prediction_res)
            max_prediction_res = sample_predict_vec[np.argmax(sample_predict_vec)]  # max of the 5 models' predictions
            print(max_prediction_res)
            group_prediction_res = sample_predict_vec[mapping_dict[alertgroup]]  # group_prediction_val <= max_prediction_val; prediction of the business-line model for this record
            print(group_prediction_res)
            if (mode_prediction_res <= 2 and max_prediction_res <= 2):
                predict_results[idx] = group_prediction_res
            else:
                predict_results[idx] = max_prediction_res
            df_res.loc[idx] = int(predict_results[idx])

        new_df = group[[
            'alertgroup', 'classifier', 'hostname', 'predict_event'
        ]].reset_index(drop=True).join(df_res, how='outer')
        all_df = pd.concat([all_df, new_df])

    print(all_df)
    all_df.to_csv(final_result_file, sep=',', index=False)
Example #21
def average_predictions(models, test_features, np_value="int", method="mode"):
    predictions = np.column_stack([model.predict(test_features) for model in models])
    print(f"Predictions of first 5 rows: {predictions[:5]}")
    if method == "mode":
        averaged_predictions = stats.mode(predictions, axis=1)[0].astype(np_value)
    elif method == "mean":
        averaged_predictions = np.average(predictions, axis=1).astype(np_value)
    else:
        raise Exception("Method undefined")
    print(f"Averaged predictions of first 5 rows: {averaged_predictions[:5]}")
    return np.hstack(averaged_predictions)
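A usage sketch with two small scikit-learn classifiers standing in for the ensemble; the models and data are illustrative assumptions, not part of the original code:

import numpy as np
from scipy import stats  # used inside average_predictions
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_train, y_train, X_test = X[:150], y[:150], X[150:]

models = [
    LogisticRegression(max_iter=1000).fit(X_train, y_train),
    DecisionTreeClassifier(random_state=0).fit(X_train, y_train),
]
y_vote = average_predictions(models, X_test, np_value="int", method="mode")
print(y_vote.shape)  # (50,)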
Example #22
    def predict(self, examples):
        X = vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] )

        dist, ind = self.lsh.kneighbors(X)

        rows, columns = ind.shape
        for row in range(0, rows):
            for column in range(0, columns):
                ind[row, column] = self.Y[ind[row, column]]
                
        vals, counts = mode(ind, axis=1)
        
        return reshape(vals, (1, len(examples))).tolist()[0]
Example #23
def info_univariate(data, features_name):
    df_np = np.array(data)
    df_transposed = np.transpose(df_np)
    for f in range(0, len(df_transposed), 1):
        ds = sorted(df_transposed[f])
        moda = stats.mode(ds)
        print(
            'Feature: {}:\nMAX: --> {}\nMIN:  --> {}\nAVG:  --> {}\nMODE:  --> V:{} --> {}\nMed  --> {}\n'
            .format(features_name[f], np.max(df_transposed[f]),
                    np.min(df_transposed[f]),
                    round(np.mean(df_transposed[f]), 1), moda[0], moda[1],
                    np.median(ds)))
    plot_boxnotch_univariateanalysis(df_transposed, features_name)
    return
Example #24
def ensemble_voting(predictions, gold, dataset):
    stacked = numpy.stack(predictions, axis=0)
    modals = stats.mode(stacked, axis=0)[0].squeeze().astype(int)

    if dataset != "test":
        accuracy = acc(gold, modals)
        f1 = f1_macro(gold, modals)
        print("acc: ", accuracy)
        print("f1: ", f1)
    else:
        accuracy = 0
        f1 = 0

    return modals, accuracy, f1
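The voting step of ensemble_voting in isolation, on toy arrays (the acc/f1_macro scoring from the snippet is skipped here); stats.mode along axis 0 picks the per-sample majority label across models:

import numpy as np
from scipy import stats

predictions = [
    np.array([0, 1, 1, 2]),
    np.array([0, 1, 2, 2]),
    np.array([1, 1, 1, 2]),
]
stacked = np.stack(predictions, axis=0)                        # (3 models, 4 samples)
modals = stats.mode(stacked, axis=0)[0].squeeze().astype(int)  # majority vote per sample
print(modals)  # [0 1 1 2]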
Example #25
def density_categorical_accuracy(labels, predicted_labels, classes):
    assert (len(labels) == len(predicted_labels))
    if len(labels) == 0:
        return 0

    n_cluster = np.max(predicted_labels) + 1
    clusters = [[] for _ in range(n_cluster)]

    for label, predicted_label in zip(labels, predicted_labels):
        clusters[predicted_label].append(label)

    catacc = np.average([stats.mode(d)[1][0] / len(d) for d in clusters],
                        weights=[len(d) for d in clusters])
    corrected_catacc = (catacc - 1.0 / classes) / (1.0 - 1.0 / classes)

    return corrected_catacc
Example #26
def get_qda_oof_prediction(x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_ind,test_ind) in enumerate(skf.split(x_train,y_train)):
        model = QuadraticDiscriminantAnalysis()
        y_tr = y_train[train_ind]
        x_tr = x_train[train_ind]
        x_ts = x_train[test_ind]
        model.fit(x_tr,y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i,:] = model.predict(x_test)
        print("Test score {} ".format(f1_score(y_train[test_ind],oof_train[test_ind])))        
    oof_test = stats.mode(oof_test_skf,axis=0)[0]
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)
Example #27
    def predict(self, X):
        distances = np.zeros((len(X), len(self.X_)))
        for i in range(len(X)):
            distances[i, :] = np.power(
                np.sum(np.power(X[i] - self.X_, self.p), axis=1), 1 / self.p)
        neighbors_ind = np.argpartition(distances, self.k, axis=1)[:, :self.k]
        neighbors_labels = self.y_[neighbors_ind]

        if self.weights == 'uniform': return stats.mode(neighbors_labels, axis=1).mode.flatten()

        neighbors_distances = np.vstack(
            (distances[i][neighbors_ind[i]] for i in range(len(X))))
        if self.weights == 'inverse':
            weights_ = 1 / neighbors_distances
        if self.weights == 'Gaussian':
            weights_ = np.exp(-np.square(neighbors_distances) / 2)
        return self.weighted_predict(neighbors_labels, weights_)
Example #28
def get_feature(data_dict_samples,num_type):
    x_temp=[]
    for i,sample in enumerate(data_dict_samples):
        # If sample == [], np.std returns 0. To avoid a zero std, add an infinitesimal number
        if len(sample)==0: # Set inf if no sample is available
            x_temp.append(np.inf)                

        else:
            if num_type==INT_TYPE:
                x_temp.append(int(stats.mode(sample)[0]))                
            elif num_type==FLOAT_TYPE:
                x_temp.append(np.mean(sample))                
            else:
                raise NameError('Sample type must be either INT or FLOAT')

    x_temp=np.array(x_temp)[:,np.newaxis]
    return x_temp
Example #29
    def fit(self, trainExamples):
        self.expectedValues = {}
        
        for x in trainExamples:
            for (key, value) in x.items():
                if key != "Image":
                    if not key in self.expectedValues:
                        self.expectedValues[key] = []
                    
                    if len(value) > 0:
                        self.expectedValues[key].append(round(float(value)/0.5,0)*0.5)
                        
        for key in self.expectedValues.keys():
            self.expectedValues[key], _ = mode(self.expectedValues[key])
            self.expectedValues[key] = self.expectedValues[key][0]
            

        return self
Example #30
def get_create_feature(row):
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    # feature['create_count'] = len(row)
    diff_day = np.diff(row['day'])
    if len(diff_day) != 0:
        # feature['create_day_diff_mean'] = np.mean(diff_day)
        # feature['create_day_diff_std'] = np.std(diff_day)
        # feature['create_day_diff_min'] = np.min(diff_day)
        # feature['create_day_diff_mode'] = stats.mode(interval_data)[0][0]
        feature['create_day_diff_ske'] = stats.skew(diff_day)
        feature['create_day_diff_kur'] = stats.kurtosis(diff_day)
        # feature['create_day_diff_max'] = np.max(diff_day)
        feature['create_day_last'] = diff_day[-1]
        feature['create_sub_register'] = np.subtract(np.max(row['max_day']),
                                                     np.max(row['day']))
        feature['create_mode'] = stats.mode(row['day'])[0][0]
        return feature
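A hedged sketch of feeding get_create_feature one user's rows at a time via groupby-apply; the toy frame below (columns user_id, day, max_day) is an assumption about the expected input schema, and the [0][0] mode indexing again assumes an older, array-returning SciPy:

import numpy as np
import pandas as pd
from scipy import stats  # used inside get_create_feature

log = pd.DataFrame({
    "user_id": [1, 1, 1, 1, 2, 2, 2],
    "day":     [3, 5, 5, 9, 2, 2, 8],
    "max_day": [30] * 7,
})
features = log.groupby("user_id").apply(get_create_feature)
print(features)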
Example #31
def get_sgd_oof_prediction(SEED,x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_ind,test_ind) in enumerate(skf.split(x_train,y_train)):
        model = SGDClassifier(max_iter=100,random_state=SEED,loss="squared_hinge",alpha=0.009,penalty='l1')
        y_tr = y_train[train_ind]
        scaler = StandardScaler()
        x_tr = scaler.fit_transform(x_train[train_ind])
        x_ts = scaler.transform(x_train[test_ind])
        x_test_s = scaler.transform(x_test)
        model.fit(x_tr,y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i,:] = model.predict(x_test_s)
        print("Test score {} ".format(f1_score(y_train[test_ind],oof_train[test_ind])))
        
    oof_test = stats.mode(oof_test_skf,axis=0)[0]
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)
Example #32
def get_log_oof_prediction(SEED,x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_ind,test_ind) in enumerate(skf.split(x_train,y_train)):
        model = LogisticRegression(random_state=SEED,C=0.8252042855888113,penalty='l1',verbose=2)
        y_tr = y_train[train_ind]
        scaler = StandardScaler()
        x_tr = scaler.fit_transform(x_train[train_ind])
        x_ts = scaler.transform(x_train[test_ind])
        x_test_s = scaler.transform(x_test)
        model.fit(x_tr,y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i,:] = model.predict(x_test_s)
        print("Test score {} ".format(f1_score(y_train[test_ind],oof_train[test_ind])))
        
    oof_test = stats.mode(oof_test_skf,axis=0)[0]
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)
Example #33
    def process(img, head, gray_old):
        if shot_frame:
            cv2.imwrite(folder_depth + head + f"{loop:05d}.jpg", img)
        img = cv2.resize(img, resize_shape, interpolation=cv2.INTER_LINEAR)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.medianBlur(gray, 3)
        if mix_image:
            diff = cv2.absdiff(gray, gray_old)
            th = stats.mode(diff, axis=None)[0][0]
            diff_valid = diff > th + gray_diff_th
            if gray_mode:
                diff[np.logical_not(diff_valid)] = 0
                return diff, gray

            if np.sum(diff_valid) > change_th: return None, gray
            return gray, gray
        elif gray_mode:
            return gray, gray
        return img, gray
Example #34
def split_data_into_steps(data, N_TIME_STEPS, N_FEATURES, step, RANDOM_SEED):
    segments = []
    labels = []
    activities = set()
    for i in range(0, len(data) - N_TIME_STEPS, step):
        xs = data['x'].values[i: i + N_TIME_STEPS]
        ys = data['y'].values[i: i + N_TIME_STEPS]
        zs = data['z'].values[i: i + N_TIME_STEPS]
        label = stats.mode(data['activity'][i: i + N_TIME_STEPS])[0][0]
        segments.append([xs, ys, zs])
        labels.append(label)
        activities.add(label)

    reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(-1, N_TIME_STEPS, N_FEATURES)
    labels = np.asarray(pd.get_dummies(labels), dtype=np.float32)

    validation_split = get_validation_split()
    x_train, x_test, y_train, y_test = train_test_split(reshaped_segments, labels, test_size=validation_split,
                                                        random_state=RANDOM_SEED)
    return x_train, x_test, y_train, y_test, activities
Example #35
def test_voting(model, ipca, tensor, labels, times, interval_len, cnt):
    ok = 0.
    total = 0.
    with tf.Session() as sess:
        for i in range(cnt):
            x, y = prepate_data_for_voting(sess, ipca, tensor, labels, times, interval_len)
            num, width, temp, feat = x.shape
            x = np.reshape(x, (-1, x.shape[2] * x.shape[3]))

            x = (x - ipca.mean_my) / ipca.var_my
            x = ipca.transform(x)
            y_pred = model.predict(x)
            y_pred = np.reshape(y_pred, (-1, width))
            y_pred = stats.mode(y_pred, axis=1)[0]
            y_pred = np.reshape(y_pred, (-1))
            y = np.argmax(y, axis=1)
            ok += np.sum(y == y_pred, axis=0)
            total += y.shape[0]

    return ok / total
Example #36
def get_feature(data_dict_samples,num_type):
    x_temp = []
    for i, sample in enumerate(data_dict_samples):

        # If sample == [], np.std returns 0. To avoid a zero std, add an infinitesimal number

        # Set inf if no sample is available
        if len(sample) == 0:
            x_temp.append(np.inf)                

        else:
            if num_type == INT_TYPE:
                x_temp.append(int(stats.mode(sample)[0]))                
            elif num_type == FLOAT_TYPE:
                x_temp.append(np.mean(sample))                
            else:
                raise NameError('Sample type must be either INT or FLOAT')

    x_temp = np.array(x_temp)[:, np.newaxis]
    return x_temp
Example #37
 def train(self, input_data, target_data):
     (self._most_frequent_value, ), _ = mode(target_data, axis=0)
     self._target_type = target_data.dtype
Example #38
    #A special column
    #This would be obsolete with:
    #from matplotlib.dates import MONDAY, MonthLocator, WeekdayLocator, DateFormatter
    time = getTime(DateTimeUT)
    
    #Just an empty class
    class Dummy(object): pass
    
    #Lets get some statistics
    tdfst = Dummy()
    tdfStat = []
    
    tdfst.name, tdfst.median, tdfst.max, tdfst.min, tdfst.mean, tdfst.stdev = \
    "Seeing", median(medianFWHM), max(medianFWHM), min(medianFWHM), mean(medianFWHM), std(medianFWHM)
    
    tdfst.mode = mode(medianFWHM)
 
    tdfStat.append(tdfst)
    
    if opts.verbose:
        print()
        print(("%5s" + "%11s"*6) % ("Name", "Median", "Max", "Min", "Mean", "Stdev", "Mode"))
        frmt = "%5s" + "%11.2f"*6
        print(frmt % (tdfst.name, tdfst.median, tdfst.max, tdfst.min, tdfst.mean, tdfst.stdev, tdfst.mode[0]))
        print()
    
    #Calculates some 2D correlations
    WDCorr = spearmanr(medianFWHM, WindDirectionDeg)
    Humidity = spearmanr(medianFWHM, HumidityPercent)
    Pressure = spearmanr(medianFWHM, PressureHPA)
Example #39
def open_behavioural(path, subj, **kwargs):
    ############# BOLOGNA ##################
    
    dropped_trials = []
    behavioural_data = []
    for arg in kwargs:
        if arg == 'dropped_trials':
            dropped_trials = np.int_(kwargs[arg].split(','))
        if arg == 'behavioural_data':
            behavioural_data = kwargs[arg].split(',')
    
    import xlrd
    fn = os.path.join(path, subj)
    
    book = xlrd.open_workbook(fn) #Open workbook
    sh = book.sheet_by_index(0) #Choose sheet
    
    labels = sh.row_values(0)
    labels = [str(l).lower() for l in labels]
    l_array = np.array(labels, dtype=str)
    
    indexes = []
    data_tot = []
    dtype = []
    for field in behavioural_data:
        index = np.nonzero(l_array == str.lower(field))[0][0]
        
        data = sh.col_values(int(index))[1:]
        print(field)
        type_ = mode([x.__class__ for x in data])[0][0]
        if type_ == str:
            data = [x.__class__.lower(x) for x in data]
            t = (field, np.str_, 45)
        else:
            #print data
            data = [(int(x) if (x != 'NULL') and (x != '') else 0) for x in data]
            t = (field, np.int_, 1)
        
        dtype.append(t)
        data_tot.append(data)
    
    data_tot.append(list(range(1, len(sh.col_values(0)[1:]) + 1)))
    dtype.append(('TrialNo.', np.int_, 1))
    
    '''    
    behavioural = np.array(zip(
                               sh.col_values(6)[1:], #Condition Label
                               sh.col_values(19)[1:],
                               np.float_([(int(x) if x else 0) for x in sh.col_values(18)[1:]]), #Accuracy
                               np.int_([(int(x) if x else 0) for x in sh.col_values(4)[1:]]),
                               np.arange(len(sh.col_values(0)[1:]))+1 #Combination
                            ), 
                           dtype=[('Condition', np.str_,2),
                                  ('SlideImage', np.str_,10),
                                  ('Accuracy', np.int_, 1),
                                  ('Combination', np.int_, 1),
                                  ('TrialNo.', np.int_, 1)]
                           )
    '''
    
    behavioural = np.array(list(zip(*data_tot)), dtype=dtype)
    
    
    if len(dropped_trials) > 0:
        mask = 0
        for trial in dropped_trials:
            mask = mask + np.int_(behavioural['TrialNo.'] == trial)
    
        behavioural = behavioural[~np.bool_(mask)]
    
    return behavioural
Example #40
Southwest, 4.79, 2.71
Wales, 5.27, 3.53
Scotland, 6.08, 4.51
Northern Ireland, 4.02, 4.56'''

data = data.splitlines()
data = [i.split(', ') for i in data]

column_names = data[0]
data_rows = data[1::]
df = pd.DataFrame(data_rows, columns=column_names)

df['Alcohol'] = df['Alcohol'].astype(float)
df['Tobacco'] = df['Tobacco'].astype(float)

print "The mean for the Alcohol dataset is", df['Alcohol'].mean() 
print  "The median for the Alcohol dataset is", df['Alcohol'].median() 
# NOT SURE HOW TO MODIFY THE SCRIPT TO ONLY RETURN THE FIRST ARRAY
# ALSO NOT SURE HOW TO IMPROVE UPON THE VALUE RETURNED SO THAT IF THE FREQUENCY ARRAY RETURNED IS 1, "there is no mode" is the response returned.
print  "The mode for the Alcohol dataset is", stats.mode(df['Alcohol']) 
print "The range for the Alcohol dataset is", max(df['Alcohol']) - min(df['Alcohol'])
print "The standard deviation for the Alcohol dataset is", df['Alcohol'].std() 
print "The variance for the Alcohol dataset is", df['Alcohol'].var() 

print  "The mean for the Tobacco dataset is", df['Tobacco'].mean() 
print "The median for the Tobacco dataset is", df['Tobacco'].median() 
print "The mode for the Tobacco dataset is", stats.mode(df['Tobacco']) 
print "The range for the Tobacco dataset is", max(df['Tobacco']) - min(df['Tobacco'])
print "The standard deviation for the Tobacco dataset is", df['Tobacco'].std() 
print "The variance for the Tobacco dataset is", df['Tobacco'].var()