import numpy as np


def test_train(x_yarr, offset=0, x_split=0.9, nshuffles=1, x_cols=1, label_cols=-1, norm=True):
    """
    Split the data into train and test data.

    Args:
        x_yarr (numpy.array): An array containing the data to separate
        offset (int): the offset of the data to use as test; if 0, the end of the array is used
        x_split (float): percent of the data to use as training
        nshuffles (int): how many times to shuffle the data
        x_cols (int): the column axis starting point of the data in x_yarr
        label_cols (int): the column axis starting point of the labels in x_yarr
        norm (bool): normalize the data if True

    Returns:
        x_train (numpy.array): x columns for training
        x_test (numpy.array): x columns for testing
        y_train (numpy.array): y columns for training
        y_test (numpy.array): y columns for testing
        test_data (numpy.array): all the columns of the test data (both x_test and y_test)
        m_t (numpy.array): mean of the columns
        s_t (numpy.array): std of the columns
    """
    # Shuffle in place before splitting.
    for i in range(nshuffles):
        np.random.shuffle(x_yarr)

    if offset == 0:
        # Use the tail of the array as the test set.
        train_size = int(x_yarr.shape[0] * x_split)
        train_data = x_yarr[:train_size]
        test_data = x_yarr[train_size:]
    else:
        # Use the offset-th fold of the array as the test set.
        test_size = int(x_yarr.shape[0] * (1 - x_split))
        test_data = x_yarr[offset * test_size:(offset + 1) * test_size]
        train_data = np.array(x_yarr, copy=True)
        train_data = np.delete(train_data, slice(offset * test_size, (offset + 1) * test_size), axis=0)

    if norm:
        # Normalize the test set with the training set's statistics.
        x_train, m_t, s_t = normalize(train_data[:, x_cols:label_cols])
        x_test, m_t, s_t = normalize(test_data[:, x_cols:label_cols], m_t, s_t)
        test_data[:, x_cols:label_cols] = x_test
    else:
        x_train = train_data[:, x_cols:label_cols]
        x_test = test_data[:, x_cols:label_cols]
        m_t = None
        s_t = None

    y_train = train_data[:, label_cols]
    y_test = test_data[:, label_cols]
    return x_train, x_test, y_train, y_test, test_data, m_t, s_t
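# A minimal usage sketch for test_train (the file name "data.npy" is hypothetical;
# x_yarr is assumed to be a 2-D array whose first column is an id, middle columns
# are features, and last column is the label):
#
#     data = np.load("data.npy")
#     x_train, x_test, y_train, y_test, test_data, m_t, s_t = test_train(
#         data, offset=0, x_split=0.9, nshuffles=1)
#     print(x_train.shape, y_train.shape)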
from os.path import dirname, join


def convert_to_corpus(name, rows):
    # Write rows in the fastText supervised-learning format:
    # "__label__<label> <normalized text>" per line.
    file = join(dirname(dirname(__file__)), "data", "{}.txt".format(name))
    with open(file, "w") as f:
        for row in rows:
            label = '__label__' + row['label'].replace(" ", "_")
            text = row['text'].replace("\r\n", " ")
            text = normalize(text)
            print(text)
            f.write(label + " " + text + "\n")
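# A minimal usage sketch for convert_to_corpus (the row contents and the "train"
# corpus name are hypothetical; each row needs 'label' and 'text' keys):
#
#     rows = [{'label': 'sports news', 'text': 'The match ended 2-1.'},
#             {'label': 'weather', 'text': 'Heavy rain expected tomorrow.'}]
#     convert_to_corpus("train", rows)  # writes data/train.txt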
def generate_stat_feature(station_id: int, forecast_date: pd.Timestamp, df: pd.DataFrame, days, name):
    """
    Use the assigned length of history data to build statistical features.
    """
    history_list = []
    for i in range(days, 0, -1):
        try:
            # First 24 rows of each previous day (hourly observations).
            history_list.append(df.loc[station_id, forecast_date - pd.DateOffset(days=i)].iloc[:24])
        except KeyError:
            # Skip days that are missing from the index.
            pass
    h = normalize(pd.concat(history_list + [df.loc[station_id, forecast_date]]))
    h_max = h.max()
    h_min = h.min()
    h_mean = h.mean()
    h_var = h.var()
    # Suffix each column name so features from different windows can be concatenated.
    h_max.index = h_max.index + '_{}_max'.format(name)
    h_min.index = h_min.index + '_{}_min'.format(name)
    h_mean.index = h_mean.index + '_{}_mean'.format(name)
    h_var.index = h_var.index + '_{}_var'.format(name)
    return pd.concat([h_max, h_min, h_mean, h_var])
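# A minimal usage sketch for generate_stat_feature (the station id, date, window
# length, and feature-name suffix are hypothetical; df is assumed to be indexed by
# (station_id, date) as elsewhere in this module):
#
#     stats_3d = generate_stat_feature(90001, pd.Timestamp('2018-10-28'), df,
#                                      days=3, name='3d')
#     print(stats_3d.head())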
def generate_one_set(station_id: int, forecast_date: pd.Timestamp, df: pd.DataFrame, previous_days=2, predict=False):
    """
    Concatenate the forecast date's data with the previous days' data to form one training set.
    """
    forecast_date = pd.Timestamp(forecast_date)
    history_list = []
    for i in range(previous_days, 0, -1):
        history_list.append(df.loc[station_id, forecast_date - pd.DateOffset(days=i)].iloc[:24])
    history = pd.concat(history_list + [df.loc[station_id, forecast_date]])
    history = check_invalid(history)
    history = history.interpolate(method='linear', limit=8, limit_direction='both')
    # history = fill_nan_with_m(history)
    if predict:
        # At prediction time only the observed part of the window has to be complete.
        assert ~history.iloc[:(24 * previous_days + 4)].isnull().values.any(), \
            'Empty data found in station {} date {}'.format(station_id, forecast_date)
    else:
        assert ~history.isnull().values.any(), \
            'Empty data found in station {} date {}'.format(station_id, forecast_date)
    history = normalize(history)
    history_obs = history.iloc[:(24 * previous_days + 4)][obs_names]
    history_m = history[m_names]
    prediction = history.iloc[(24 * previous_days + 4):][['t2m_obs', 'rh2m_obs', 'w10m_obs']]
    return history_obs, history_m, prediction
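# A minimal usage sketch for generate_one_set (the station id and date are
# hypothetical; obs_names, m_names, check_invalid and normalize are assumed to be
# defined alongside this function):
#
#     obs, m, target = generate_one_set(90001, '2018-10-28', df, previous_days=2)
#     # obs:    observed columns for the 24 * previous_days + 4 known hours
#     # m:      model (forecast) columns for the whole window
#     # target: t2m/rh2m/w10m observations for the hours to be predicted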
""" problem1_df = df["vidsWatched"] >= 5 problem1_df = df[problem1_df] print(problem1_df.head()) xy = problem1_df.drop(['VidID', 's', 's_rel_avg', 's_tot_avg', 'stdPBR'], axis=1) return xy dft, dfs = readdata("data-sets/behavior-performance.txt") xy = filter(dfs) k = find_k(plot=False) xy2 = xy.to_numpy() xy2, m, s = normalize(xy2[:, 1:]) kmeans = KMeans(n_clusters=k) #number of clusters kmeans.fit(xy2) centers = kmeans.cluster_centers_ figure2 = plt.figure(figsize=(10, 10)) plt.subplots_adjust(bottom=.05, top=0.91, hspace=.5, wspace=.5, left=.01, right=.99) count = 1 graph_set = [] for col in range(centers.shape[-1]): for two in range(centers.shape[-1]):
import fasttext

from load_data import normalize

PATH = 'example.txt'

if __name__ == '__main__':
    classifier = fasttext.load_model('snapshots/model.bin')
    with open(PATH, errors='ignore') as f:
        text = f.read()
    text = normalize(text)
    # Return the top-3 labels and their probabilities.
    predict = classifier.predict(text, k=3)
    print(predict)
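# A minimal sketch of how 'snapshots/model.bin' could be produced with fastText's
# supervised trainer (the corpus path and hyperparameters here are assumptions, not
# taken from this repository; the input file is expected to be in the
# "__label__<label> <text>" format written by convert_to_corpus):
#
#     import fasttext
#     model = fasttext.train_supervised(input='data/train.txt', epoch=25, wordNgrams=2)
#     model.save_model('snapshots/model.bin')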