def one_run(projects_train, projects_test, K, outlier_threshold, granularity):
    """Run the mixture-of-least-squares experiment over an increasing time window.

    :param projects_train: training projects; each exposes a ``money`` sequence
        indexable by a list of time steps (presumably a numpy array — TODO confirm)
    :param projects_test: test projects, same structure
    :param K: number of mixture components for ``LeastSquaresMixture``
    :param outlier_threshold: projects whose money exceeds this (at the end or
        at any sampled step) are discarded
    :param granularity: subsampling granularity passed to ``subsample``
    :return: four lists (rmse_failed, rmse_success, rmse, accuracy), one entry
        per relative time step
    """
    rmse_failed_run = []
    rmse_success_run = []
    rmse_run = []
    accuracy_run = []
    relative_time = np.linspace(0.025, 1, 20)
    bar = ProgressBar(end_value=len(relative_time), text="Time steps", count=True)
    bar.start()
    for i, rel_t in enumerate(relative_time):
        # Observation window: sample the money series up to the current relative time.
        t = int(rel_t * 999)
        samples = subsample(t, granularity)
        t = len(samples)  # actual number of sampled steps (feature dimension)
        T = 999  # index of the final time step

        # Remove outliers: keep only projects below the threshold both at the
        # end and at every sampled step.
        projects_train_filtered = [
            p for p in projects_train
            if np.all((p.money[T] - outlier_threshold) <= 0)
            and np.all((p.money[samples] - outlier_threshold) <= 0)
        ]
        projects_test_filtered = [
            p for p in projects_test
            if np.all((p.money[T] - outlier_threshold) <= 0)
            and np.all((p.money[samples] - outlier_threshold) <= 0)
        ]

        # Build the (n_projects, t) design matrices directly; this replaces the
        # original np.ndarray(shape=..., buffer=...) construction, which is
        # equivalent here but easy to get wrong.
        X_train = np.array([p.money[samples] for p in projects_train_filtered],
                           dtype=float).reshape(len(projects_train_filtered), t)
        y_train = np.expand_dims(
            np.array([p.money[T] for p in projects_train_filtered]), axis=1)
        X_test = np.array([p.money[samples] for p in projects_test_filtered],
                          dtype=float).reshape(len(projects_test_filtered), t)
        y_test = np.expand_dims(
            np.array([p.money[T] for p in projects_test_filtered]), axis=1)

        # Hyperparameters (values kept from the original experiment).
        beta = 0.0001
        epsilon = 1e0
        lam = 0
        iterations = 50
        random_restarts = None

        mls = LeastSquaresMixture(X_train, y_train,
                                  K=K, beta=beta, lam=lam,
                                  iterations=iterations, epsilon=epsilon,
                                  random_restarts=random_restarts)
        mls.train(verbose=False)
        rmse_failed, rmse_success, rmse, accuracy = mls.evaluate(X_test, y_test,
                                                                 verbose=False)

        rmse_failed_run.append(rmse_failed)
        rmse_success_run.append(rmse_success)
        rmse_run.append(rmse)
        accuracy_run.append(accuracy)
        bar.update(i)
    # NOTE(review): removed a leftover debug print of accuracy_run and dead
    # commented-out normalization code.
    return rmse_failed_run, rmse_success_run, rmse_run, accuracy_run
def one_run(projects_train, projects_test, outlier_threshold):
    """Run the GP-regression experiment over an increasing observation time.

    :param projects_train: training projects; each exposes ``money`` (indexable
        time series) and ``goal`` (scalar) — TODO confirm exact types
    :param projects_test: test projects, same structure
    :param outlier_threshold: projects whose (money * goal) exceeds this at the
        sampled step or at the end are discarded
    :return: (rmse_run, accuracy_run) lists, one entry per relative time step
    """
    rmse_run = []
    accuracy_run = []
    relative_time = np.linspace(0.025, 1, 20)
    bar = ProgressBar(end_value=len(relative_time), text="Time steps", count=True)
    bar.start()
    for i, rel_t in enumerate(relative_time):
        # BUG FIX: `rel_t * 1000 - 1` is a float and cannot be used as an
        # index; round to the nearest step to avoid float-truncation
        # off-by-one before casting.
        samples = int(round(rel_t * 1000)) - 1
        t = 1    # single feature: money at time `samples`
        T = 999  # index of the final time step
        ARD = False

        # BUG FIX: filter into fresh locals instead of rebinding the input
        # lists — the original shrank projects_train/projects_test
        # cumulatively across loop iterations.
        projects_train_filtered = [
            p for p in projects_train
            if p.money[T] * p.goal < outlier_threshold
            and p.money[samples] * p.goal < outlier_threshold
        ]
        projects_test_filtered = [
            p for p in projects_test
            if p.money[T] * p.goal < outlier_threshold
            and p.money[samples] * p.goal < outlier_threshold
        ]

        X_train = np.array([p.money[samples] * p.goal
                            for p in projects_train_filtered],
                           dtype=float).reshape(len(projects_train_filtered), t)
        y_train = np.expand_dims(
            np.array([p.money[T] * p.goal for p in projects_train_filtered]),
            axis=1)
        X_test = np.array([p.money[samples] * p.goal
                           for p in projects_test_filtered],
                          dtype=float).reshape(len(projects_test_filtered), t)
        y_test = np.expand_dims(
            np.array([p.money[T] * p.goal for p in projects_test_filtered]),
            axis=1)

        kernel = GPy.kern.RBF(input_dim=t, ARD=ARD)
        m = GPy.models.GPRegression(X_train, y_train, kernel)
        m.optimize()

        # The original passed the filtered test set here (it had rebound the
        # name); keep that behavior explicitly.
        rmse, accuracy = evaluate(X_test, y_test, projects_test_filtered, m)
        rmse_run.append(rmse)
        accuracy_run.append(accuracy)
        bar.update(i)
    return rmse_run, accuracy_run
class MongoDBConverter:
    """Imports JSON-lines datasets (businesses, users, reviews) into MongoDB."""

    def __init__(self):
        self.db = MongoClient(
            Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE]
        # NOTE(review): attribute name typo ("progess") kept for compatibility
        # with any external code that may reference it.
        self.progess_bar = ProgressBar()

    def create_review_db(self):
        """Attach each review line of the dataset to its business document."""
        print_header("Creating Reviews")
        done = 0
        dataset_file = Settings.REVIEW_DATASET_FILE
        business_collection = self.db[Settings.BUSINESS_COLLECTION]

        # Collect ids of reviews already stored so re-runs do not duplicate.
        review_id_hashes = set()
        for business in business_collection.find():
            if 'reviews' in business:
                for review in business['reviews']:
                    review_id_hashes.add(review['review_id'])

        self.progess_bar.start()
        count = self._count_lines(dataset_file)
        with open(dataset_file, 'r', encoding='utf-8') as dataset:
            # The original skips the first line — presumably a header;
            # TODO confirm against the dataset format.
            next(dataset)
            for line in dataset:
                try:
                    # BUG FIX: json.loads(..., encoding=...) raises TypeError
                    # on Python 3.9+; the encoding belongs on open() instead.
                    data = json.loads(line)
                except ValueError:
                    print('Oops!')
                    # BUG FIX: skip the malformed line; the original fell
                    # through and reused stale (or undefined) `data`.
                    continue
                if data["type"] == "review":
                    business_id = data['business_id']
                    business = business_collection.find_one(
                        {'business_id': business_id})
                    assert (business is not None)
                    business['reviews'] = business.get('reviews', [])
                    if data['review_id'] not in review_id_hashes:
                        business['reviews'].append(data)
                    business_collection.update_one(
                        {'business_id': business_id},
                        {"$set": business},
                        upsert=True)
                    done += 1
                    self.progess_bar.print_progress(done, count)

    def add_business_data_collection(self):
        """Load the business dataset into its collection (idempotent)."""
        print_header("Adding Business Data")
        self._import_dataset(Settings.BUSINESS_DATASET_FILE,
                             Settings.BUSINESS_COLLECTION,
                             expected_type='business',
                             index_field='business_id',
                             error_message="Error in Business json file")

    def add_user_data_collection(self):
        """Load the user dataset into its collection (idempotent)."""
        print_header("Adding User Data")
        # BUG FIX: the error message previously said "Business" for the
        # user dataset file.
        self._import_dataset(Settings.USER_DATASET_FILE,
                             Settings.USER_COLLECTION,
                             expected_type='user',
                             index_field='user_id',
                             error_message="Error in User json file")

    def _count_lines(self, path):
        """Count the lines of a file, for progress reporting."""
        with open(path, 'r', encoding='utf-8') as f:
            return sum(1 for _ in f)

    def _import_dataset(self, dataset_file, collection_name, expected_type,
                        index_field, error_message):
        """Bulk-load a JSON-lines dataset into a collection unless it already
        contains data; shared by the business and user loaders."""
        collection = self.db[collection_name]
        if (collection_name in self.db.collection_names()
                and collection.count() > 0):
            print("Data already present.... Skipping")
            return
        self.progess_bar.start()
        count = self._count_lines(dataset_file)
        done = 0
        with open(dataset_file, 'r', encoding='utf-8') as dataset:
            for line in dataset:
                try:
                    data = json.loads(line)
                except ValueError:
                    print(error_message)
                    # BUG FIX: skip the malformed line instead of falling
                    # through with stale/undefined `data`.
                    continue
                assert (data['type'] == expected_type)
                # NOTE(review): collection_names / count / insert are
                # deprecated in pymongo 3+ (list_collection_names,
                # count_documents, insert_one) — kept because the installed
                # pymongo version is not visible here; confirm before
                # modernizing.
                collection.insert(data)
                done += 1
                self.progess_bar.print_progress(done, count,
                                                prefix='Progress:',
                                                suffix='Complete')
        collection.create_index(index_field)
def _one_run(projects_train, projects_test, relative_time, features,
             outlier_threshold, normalized, granularity):
    """
    Run the linear-regression experiment once, sweeping the observation time.

    :param projects_train: Training projects set
    :param projects_test: Test projects set
    :param relative_time: Relative time steps, used as the x axis
    :param features: Feature extractor applied to the sampled money series
    :param outlier_threshold: Threshold above which projects are discarded
    :param normalized: Whether to use normalized money
    :param granularity: Level of granularity for subsampling
    :return: RMSE and accuracy lists, one entry per time step
    """
    rmse_run, accuracy_run = [], []
    bar = ProgressBar(end_value=len(relative_time), text="Time steps",
                      count=True)
    bar.start()

    def _within_threshold(project):
        # Keep only projects whose money never exceeds the threshold.
        return np.all([(m - outlier_threshold) <= 0 for m in project.money])

    train_kept = [p for p in projects_train if _within_threshold(p)]
    test_kept = [p for p in projects_test if _within_threshold(p)]

    for step, rel_t in enumerate(relative_time):
        # Sample the money series up to the current relative time.
        horizon = int(rel_t * 999)
        samples = subsample(horizon, granularity)
        n_samples = 1
        T = 999

        X_train = np.ndarray(
            shape=(len(train_kept), n_samples),
            buffer=np.array([features(p.money[samples]) for p in train_kept]),
            dtype=float)
        y_train = np.expand_dims(
            np.array([p.money[T] for p in train_kept]), axis=1)
        X_test = np.ndarray(
            shape=(len(test_kept), n_samples),
            buffer=np.array([features(p.money[samples]) for p in test_kept]),
            dtype=float)
        y_test = np.expand_dims(
            np.array([p.money[T] for p in test_kept]), axis=1)

        model = linear_model.LinearRegression()
        model.fit(X_train, y_train)

        # NOTE: the unfiltered test set is passed here, matching the
        # original call.
        rmse, accuracy = _evaluate(X_test, y_test, projects_test, model,
                                   normalized)
        rmse_run.append(rmse)
        accuracy_run.append(accuracy)
        bar.update(step)
    return rmse_run, accuracy_run
def one_run(projects_train, projects_test, K, outlier_threshold, granularity):
    """Sweep the observation time and score a LeastSquaresMixture at each step.

    :param projects_train: training projects (each with a ``money`` series)
    :param projects_test: test projects
    :param K: number of mixture components
    :param outlier_threshold: projects whose money exceeds this are dropped
    :param granularity: subsampling granularity
    :return: four per-step lists: rmse_failed, rmse_success, rmse, accuracy
    """
    failed_scores, success_scores, overall_scores, hit_rates = [], [], [], []
    time_grid = np.linspace(0.025, 1, 20)
    progress = ProgressBar(end_value=len(time_grid), text="Time steps",
                           count=True)
    progress.start()
    for step, fraction in enumerate(time_grid):
        # Sample the series up to the current fraction of the full timeline.
        horizon = int(fraction * 999)
        samples = subsample(horizon, granularity)
        n_features = len(samples)
        T = 999

        def _in_bounds(project):
            # Below threshold both at the end and at every sampled step.
            return (np.all((project.money[T] - outlier_threshold) <= 0)
                    and np.all((project.money[samples] - outlier_threshold) <= 0))

        kept_train = [p for p in projects_train if _in_bounds(p)]
        kept_test = [p for p in projects_test if _in_bounds(p)]

        X_train = np.ndarray(
            shape=(len(kept_train), n_features),
            buffer=np.array([p.money[samples] for p in kept_train]),
            dtype=float)
        y_train = np.expand_dims(
            np.array([p.money[T] for p in kept_train]), axis=1)
        X_test = np.ndarray(
            shape=(len(kept_test), n_features),
            buffer=np.array([p.money[samples] for p in kept_test]),
            dtype=float)
        y_test = np.expand_dims(
            np.array([p.money[T] for p in kept_test]), axis=1)

        # Hyperparameters inlined; values unchanged from the experiment setup.
        mixture = LeastSquaresMixture(X_train, y_train,
                                      K=K,
                                      beta=0.0001,
                                      lam=0,
                                      iterations=50,
                                      epsilon=1e0,
                                      random_restarts=None)
        mixture.train(verbose=False)
        rmse_failed, rmse_success, rmse, accuracy = mixture.evaluate(
            X_test, y_test, verbose=False)

        failed_scores.append(rmse_failed)
        success_scores.append(rmse_success)
        overall_scores.append(rmse)
        hit_rates.append(accuracy)
        progress.update(step)
    print(hit_rates)
    return failed_scores, success_scores, overall_scores, hit_rates