import streamlit as st

import preprocessing
import scraper


def homepage():
    st.title("Enter an Amazon product URL to fetch data")
    url = st.text_input("")
    if st.button('Fetch Data'):
        # Scrape all reviews for the given product, show a preview, and
        # persist the raw data before cleaning.
        all_reviews_df, product_title = scraper.scraper(url)
        if all_reviews_df is not None:
            st.dataframe(all_reviews_df.head())
            title = preprocessing.product_name(product_title)
            all_reviews_df.to_csv(f"./Amazon Reviews/{title}.csv")
            preprocessing.clean_data(all_reviews_df, title)
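# homepage() relies on preprocessing.clean_data, which is not shown in this
# fragment. A minimal sketch of what such a cleaner might look like for
# review data follows; the column name 'Review' and the output path are
# assumptions for illustration, not the project's actual implementation.
import re

import pandas as pd


def clean_data(df: pd.DataFrame, title: str) -> pd.DataFrame:
    """Drop empty reviews, normalise whitespace, and save the cleaned frame."""
    df = df.dropna(subset=['Review'])  # assumed review-text column
    df['Review'] = df['Review'].map(lambda s: re.sub(r'\s+', ' ', str(s)).strip())
    df.to_csv(f"./Amazon Reviews/{title}_cleaned.csv", index=False)
    return df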
import numpy as np


def predict_all(X, weights, degs, medians, unwanted):
    """
    Predicts all labels given raw data. This means it also separates,
    cleans and expands the data.
    """
    predictions = np.ones((X.shape[0],))
    # For every jet number:
    for i in range(4):
        # Get the test rows belonging to this jet number.
        x, ids = data_of_jet_num(X, None, i)
        # Clean the test data and expand its features.
        clean_x = clean_data(x, medians[i], unwanted[i])
        expanded_x = add_features(clean_x, degs[i])
        # Predict the labels and write them back at the original indices.
        p = predict(weights[i], expanded_x)
        predictions[ids] = p
    return predictions
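# predict is used above but not defined in this fragment. For linear models
# of this kind, a common definition maps the sign of the score x @ w to the
# two class labels; this is a sketch under that assumption, not necessarily
# the helper used here.
import numpy as np


def predict(weights, x):
    """Return -1/1 labels from the sign of the linear score x @ w."""
    scores = x @ weights
    return np.where(scores >= 0, 1, -1)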
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import preprocessing as pr

# Fix the random seed for reproducibility.
np.random.seed(7)

data_site_hourly = pd.read_csv(r"data_site/data_1485990000000.csv", low_memory=False)
features_hourly = ["Date", pr.PowerPV, pr.Temperature, pr.Irradiation]
data_prepared_hourly = pr.prepare_date(data_site_hourly, features_hourly)
data_cleaned_hourly = pr.clean_data(data_prepared_hourly)
# data = data.reset_index()

# Drop the date variable.
data = data_model.copy()
data = data.reset_index()
data = data.drop(['Date'], axis=1)

# Dimensions of the dataset.
n = data.shape[0]
p = data.shape[1]

# Make data a np.array.
data = data.values
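# The fragment stops right after the data is turned into an array. A
# plausible continuation, sketched here, splits train/test and fits the
# MinMaxScaler on the training portion only; the 80/20 split ratio is an
# assumption, not taken from the original script.
train_end = int(np.floor(0.8 * n))
data_train = data[:train_end, :]
data_test = data[train_end:, :]

scaler = MinMaxScaler()
data_train = scaler.fit_transform(data_train)
# Reuse the training statistics on the test set to avoid leakage.
data_test = scaler.transform(data_test)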
import math

from keras.models import Sequential
from keras.layers import Conv1D, Flatten, MaxPool1D, Reshape

# Learning hyper-parameters
learning_rate = 0.01
n_hidden = 512
total_epoch = 500
n_step = CHUNK  # the length of the input sequence
# n_input = n_class = v_len  # the size of each input
n_input = 1
n_class = v_len
batch_size = 1

""" Pre-processing """
# Removes unnecessary "NONE" chunks and encodes y values.
X_train, y_train = clean_data(training_data, v_map, chunk_size=CHUNK)
number_of_batches = int(math.ceil(len(X_train) / batch_size))

""" Model """
model = Sequential()
model.add(Conv1D(filters=30, kernel_size=30, strides=1, padding="causal",
                 input_shape=(512, 1)))
model.add(MaxPool1D(pool_size=2))
model.add(Flatten())
model.add(Reshape((7680, 1)))
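# The fragment ends before the model is completed, compiled, or trained. A
# minimal sketch of the remaining steps follows; the recurrent head, loss,
# and optimizer choices are assumptions based on the hyper-parameters
# declared above (n_hidden, n_class, learning_rate), not the original code.
from keras.layers import LSTM, Dense
from keras.optimizers import Adam

model.add(LSTM(n_hidden))
model.add(Dense(n_class, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=learning_rate),
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=total_epoch)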
import matplotlib.pyplot as plt
import seaborn as sns

import preprocessing


def get_pair_plot(df):
    df = preprocessing.clean_data(df)
    print("plotting")
    sns.pairplot(df, hue="cause")
    plt.show()


def get_joint_plot(df):
    df = preprocessing.clean_data(df)
    print("plotting")
    sns.jointplot(x="age", y="time", data=df)
    plt.show()


def get_count_plot(df):
    df = preprocessing.clean_data(df)
    print("plotting")
    sns.countplot(x="month", hue="race", data=df)
    plt.show()


def get_dist_plot(df):
    df = preprocessing.clean_data(df)
    print("plotting")
    # sns.distplot is deprecated (removed in seaborn 0.14);
    # histplot with a KDE overlay is the modern equivalent.
    sns.histplot(df['time'], kde=True)
    plt.show()
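# A quick usage sketch for the four helpers above; the CSV path and the
# presence of the expected columns ('cause', 'age', 'time', 'month', 'race')
# are assumptions for illustration only.
import pandas as pd

df = pd.read_csv("data/incidents.csv")
get_count_plot(df)
get_dist_plot(df)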
def autolabel(rects):
    """Attach a text label above each bar, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

##############################################################################

reviews = clean_data(reviews)
discrete_ratings_distribution = reviews[['Rating']].value_counts()

# Plot of the distribution of ratings.
labels = discrete_ratings_distribution.index.get_level_values(0).values
distribution = discrete_ratings_distribution.values
x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 6))
rect = ax.bar(x - width / 2, distribution, width, label='Ratings Count',
              color='#008891')
autolabel(rect)

fig.tight_layout()
fig.savefig('plots/Ratings-Distribution.png', facecolor='w')
plt.show()
).apply(lambda x: float(x[0].replace(',', '.')))
check_timeindex_subscribers(subs)

data_weather_darksky = prw.get_all_weather_darksky()
pr.check_timeindex(data_weather_darksky)

subs_BC = subs.drop_duplicates(['NB BC'], keep='last')
subs_MC = subs.drop_duplicates(['NB MC'], keep='last')
# pr.plot_data(subs_BC, subs_BC.columns, 4, 0)

data_site_hourly = pr.get_all_data_site()

# FOR BC
features_hourly_BC = ["Date", pr.PowerServiceBC]
data_prepared_hourly_BC = pr.prepare_date(data_site_hourly, features_hourly_BC)
data_cleaned_hourly_BC = pr.clean_data(data_prepared_hourly_BC, pr.PowerServiceBC, "mean")
pr.check_timeindex(data_cleaned_hourly_BC)

data_merged = data_cleaned_hourly_BC.join(subs['NB BC'], how='outer')
merged_backfill_BC = data_merged.fillna(method='ffill')
merged_backfill_BC.dropna(axis=0, how='any', inplace=True)
pr.plot_data(merged_backfill_BC, merged_backfill_BC.columns,
             "Consumption vs consumers", mode=0)
pr.check_timeindex(merged_backfill_BC)

# FOR MC
features_hourly_MC = ["Date", pr.PowerServiceMC]
data_prepared_hourly_MC = pr.prepare_date(data_site_hourly, features_hourly_MC)
data_cleaned_hourly_MC = pr.clean_data(data_prepared_hourly_MC, pr.PowerServiceMC, "mean")
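# pr.clean_data is called above with a column name and the strategy "mean".
# A sketch of what such a helper might do follows; the hourly resampling and
# the mean imputation are assumptions inferred from the call sites, not the
# actual preprocessing module.
import pandas as pd


def clean_data(df: pd.DataFrame, column: str, strategy: str = "mean") -> pd.DataFrame:
    """Resample to hourly means and impute missing values in `column`."""
    df = df.resample('H').mean()
    if strategy == "mean":
        df[column] = df[column].fillna(df[column].mean())
    return df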
import joblib
import numpy as np
import pandas as pd
from keras.models import model_from_json

import preprocessing as pr

loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# Load the weights into the new model.
model.load_weights("model.h5")
print("Loaded model from disk")

scaler = joblib.load('scaler_keras_model.save')
print("Loaded scaler from disk")

# Daily data
data_site_check = pd.read_csv(r"data_site/data_1522997834000.csv", low_memory=False)
features_check = ["Date", pr.PowerPV]
data_prepared_check = pr.prepare_date(data_site_check, features_check)
data_cleaned_check = pr.clean_data(data_prepared_check)

data = data_cleaned_check.reset_index()
# Drop the date variable.
data = data.drop(['Date'], axis=1)
# Make data a np.array and scale it with the saved scaler.
data = data.values
data = scaler.transform(data)

dataX = []
value_start = data[0:1]
obs = value_start
for i in range(0, 12):
    # pd.np is deprecated; use numpy directly.
    obs_reshaped = np.reshape(obs, (obs.shape[0], 1, obs.shape[1]))
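    # The fragment cuts off here. A plausible completion of the recursive
    # 12-step forecast is sketched below (an assumption about the original
    # intent, not recovered code): predict one step, store the result, and
    # feed the prediction back in as the next observation.
    prediction = model.predict(obs_reshaped)
    dataX.append(prediction[0])
    obs = prediction

# Undo the scaling to return the predictions to the original units.
predictions = scaler.inverse_transform(np.array(dataX))
print(predictions)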