def one_run(projects_train, projects_test, K, outlier_threshold, granularity):
    """Run the mixture-of-least-squares experiment over 20 relative time steps.

    At each step the money series are subsampled up to the current relative
    time, outliers are removed, and a K-component LeastSquaresMixture is
    trained to predict the final amount; per-step metrics are accumulated.

    :param projects_train:      Training projects set
    :param projects_test:       Test projects set
    :param K:                   Number of mixture components
    :param outlier_threshold:   Threshold to discard outliers
    :param granularity:         Level of granularity passed to subsample
    :return:                    Lists of failed/success/overall RMSE and accuracy
    """
    rmse_failed_run, rmse_success_run = [], []
    rmse_run, accuracy_run = [], []
    relative_time = np.linspace(0.025, 1, 20)
    bar = ProgressBar(end_value=len(relative_time), text="Time steps", count=True)
    bar.start()

    T = 999  # index of the final time step

    def _keep(p, samples):
        # Keep projects whose money stays below the threshold both at the
        # final step and at every sampled step.
        return (np.all((p.money[T] - outlier_threshold) <= 0)
                and np.all((p.money[samples] - outlier_threshold) <= 0))

    for step, rel_t in enumerate(relative_time):
        # Data: subsample the time axis up to the current relative time.
        samples = subsample(int(rel_t * 999), granularity)
        n_features = len(samples)

        # Remove outliers
        train_kept = [p for p in projects_train if _keep(p, samples)]
        test_kept = [p for p in projects_test if _keep(p, samples)]

        X_train = np.ndarray(shape=(len(train_kept), n_features),
                             buffer=np.array([p.money[samples] for p in train_kept]),
                             dtype=float)
        y_train = np.expand_dims(np.array([p.money[T] for p in train_kept]), axis=1)
        X_test = np.ndarray(shape=(len(test_kept), n_features),
                            buffer=np.array([p.money[samples] for p in test_kept]),
                            dtype=float)
        y_test = np.expand_dims(np.array([p.money[T] for p in test_kept]), axis=1)

        #X_max = np.max(X_train, axis=0)
        #X_train = X_train / X_max[np.newaxis, :]
        #X_test = X_test / X_max[np.newaxis, :]

        # Hyperparameters
        beta = 0.0001
        epsilon = 1e0
        lam = 0
        iterations = 50
        random_restarts = None

        mls = LeastSquaresMixture(X_train, y_train,
                                  K=K, beta=beta, lam=lam,
                                  iterations=iterations, epsilon=epsilon,
                                  random_restarts=random_restarts)
        mls.train(verbose=False)
        #print(mls)

        rmse_failed, rmse_success, rmse, accuracy = mls.evaluate(
            X_test, y_test, verbose=False)
        rmse_failed_run.append(rmse_failed)
        rmse_success_run.append(rmse_success)
        rmse_run.append(rmse)
        accuracy_run.append(accuracy)

        bar.update(step)

    print(accuracy_run)

    return rmse_failed_run, rmse_success_run, rmse_run, accuracy_run
# Exemplo n.º 2
# 0
def one_run(projects_train, projects_test, outlier_threshold):
    """Run the GP-regression experiment over 20 relative time steps.

    At each step a single money sample (taken at the current relative time)
    is used as the one-dimensional input of a GPy RBF regression predicting
    the final amount, and the per-step RMSE / accuracy are collected.

    :param projects_train:     Training projects set
    :param projects_test:      Test projects set
    :param outlier_threshold:  Threshold to discard outliers (on money * goal)
    :return:                   RMSE and accuracy lists for this experiment
    """
    rmse_run = []
    accuracy_run = []
    relative_time = np.linspace(0.025, 1, 20)
    bar = ProgressBar(end_value=len(relative_time), text="Time steps", count=True)
    bar.start()
    for i, rel_t in enumerate(relative_time):
        # n_samples = 1
        # t0 = 1
        # t1 = 500
        # samples = subsample(t0, t1, n_samples)
        # BUG FIX: `rel_t * 1000 - 1` is a non-integral float for most
        # linspace values, and indexing p.money with a float fails on
        # modern numpy/lists. Truncate to int (matches legacy numpy's
        # float-index truncation).
        samples = int(rel_t * 1000 - 1)
        t = 1    # single input feature
        T = 999  # index of the final time step
        ARD = False

        # NOTE(review): this rebinds the *parameters*, so outlier filtering
        # accumulates across time steps — looks intentional, but verify.
        projects_train = [p for p in projects_train if
                          p.money[T] * p.goal < outlier_threshold and p.money[samples] * p.goal < outlier_threshold]
        projects_test = [p for p in projects_test if
                         p.money[T] * p.goal < outlier_threshold and p.money[samples] * p.goal < outlier_threshold]

        X_train = np.ndarray(shape=(len(projects_train), t),
                             buffer=np.array([p.money[samples] * p.goal for p in projects_train]), dtype=float)
        y_train = np.expand_dims(np.array([p.money[T] * p.goal for p in projects_train]), axis=1)
        X_test = np.ndarray(shape=(len(projects_test), t),
                            buffer=np.array([p.money[samples] * p.goal for p in projects_test]), dtype=float)
        y_test = np.expand_dims(np.array([p.money[T] * p.goal for p in projects_test]), axis=1)

        kernel = GPy.kern.RBF(input_dim=t, ARD=ARD)
        m = GPy.models.GPRegression(X_train, y_train, kernel)
        m.optimize()

        rmse, accuracy = evaluate(X_test, y_test, projects_test, m)
        rmse_run.append(rmse)
        accuracy_run.append(accuracy)

        bar.update(i)

    return rmse_run, accuracy_run
class MongoDBConverter:
    """Imports the review/business/user JSON dataset files into MongoDB.

    Connection string, database and collection names, and dataset file
    paths are all taken from the project-level ``Settings`` object.
    """

    def __init__(self):
        # Handle to the reviews database; configuration comes from Settings.
        self.db = MongoClient(
            Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE]
        self.progess_bar = ProgressBar()

    def create_review_db(self):
        """Attach each review line of the dataset to its business document.

        Reviews already embedded in a business document (matched by
        ``review_id``) are skipped, so the method is safe to re-run.
        """
        print_header("Creating Reviews")
        done = 0
        dataset_file = Settings.REVIEW_DATASET_FILE
        business_collection = self.db[Settings.BUSINESS_COLLECTION]

        # Find all the businesses and their reviews and collect the
        # review_ids into a set for de-duplication.
        review_id_hashes = set()
        for business in business_collection.find():
            if 'reviews' in business:
                for review in business['reviews']:
                    review_id_hashes.add(review['review_id'])

        self.progess_bar.start()
        # First pass over the file: count lines for progress reporting.
        with open(dataset_file, 'r') as dataset:
            count = sum(1 for _ in dataset)

        with open(dataset_file, 'r') as dataset:
            next(dataset)  # skip the first line, as the original did
            for line in dataset:
                try:
                    # BUG FIX: json.loads() no longer accepts an `encoding`
                    # keyword (removed in Python 3.9; it was ignored before).
                    data = json.loads(line)
                except ValueError:
                    # BUG FIX: previously fell through with `data` unbound
                    # (or stale from the prior line); skip the bad line.
                    print('Oops!')
                    done += 1
                    self.progess_bar.print_progress(done, count)
                    continue

                # Insert into DB
                if data["type"] == "review":
                    business_id = data['business_id']
                    business = business_collection.find_one(
                        {'business_id': business_id})

                    # NOTE(review): assert is stripped under `python -O`;
                    # kept for parity with the other loaders.
                    assert (business is not None)

                    add_review = data['review_id'] not in review_id_hashes

                    business['reviews'] = business.get('reviews', [])
                    if add_review:
                        business['reviews'].append(data)
                        business_collection.update_one(
                            {'business_id': business_id}, {"$set": business},
                            upsert=True)

                done += 1
                self.progess_bar.print_progress(done, count)

    def add_business_data_collection(self):
        """Load the business dataset into its collection (idempotent).

        Skips the import entirely when the collection already has data;
        creates an index on ``business_id`` afterwards.
        """
        print_header("Adding Business Data")

        dataset_file = Settings.BUSINESS_DATASET_FILE

        add_businesses = True

        if Settings.BUSINESS_COLLECTION in self.db.collection_names():
            business_collection = self.db[Settings.BUSINESS_COLLECTION]
            if business_collection.count() > 0:
                add_businesses = False
                print("Data already present.... Skipping")

        if add_businesses:
            self.progess_bar.start()
            # Count lines first for progress reporting.
            with open(dataset_file, 'r') as dataset:
                count = sum(1 for _ in dataset)

            business_collection = self.db[Settings.BUSINESS_COLLECTION]
            with open(dataset_file, 'r') as dataset:
                done = 0
                for line in dataset:
                    try:
                        # BUG FIX: dropped the removed `encoding` keyword.
                        data = json.loads(line)
                    except ValueError:
                        # BUG FIX: skip malformed lines instead of reusing
                        # an unbound/stale `data`.
                        print("Error in Business json file")
                        done += 1
                        self.progess_bar.print_progress(done,
                                                        count,
                                                        prefix='Progress:',
                                                        suffix='Complete')
                        continue

                    # Insert into DB
                    assert (data['type'] == 'business')
                    # NOTE(review): Collection.insert is deprecated in
                    # recent PyMongo (insert_one); left as-is to match the
                    # installed driver version.
                    business_collection.insert(data)

                    done += 1
                    self.progess_bar.print_progress(done,
                                                    count,
                                                    prefix='Progress:',
                                                    suffix='Complete')

            business_collection.create_index('business_id')

    def add_user_data_collection(self):
        """Load the user dataset into its collection (idempotent).

        Skips the import entirely when the collection already has data;
        creates an index on ``user_id`` afterwards.
        """
        print_header("Adding User Data")
        dataset_file = Settings.USER_DATASET_FILE

        add_users = True

        if Settings.USER_COLLECTION in self.db.collection_names():
            user_collection = self.db[Settings.USER_COLLECTION]
            if user_collection.count() > 0:
                add_users = False
                print("Data already present.... Skipping")

        if add_users:
            self.progess_bar.start()
            # Count lines first for progress reporting.
            with open(dataset_file, 'r') as dataset:
                count = sum(1 for _ in dataset)

            user_collection = self.db[Settings.USER_COLLECTION]
            with open(dataset_file, 'r') as dataset:
                done = 0
                for line in dataset:
                    try:
                        # BUG FIX: dropped the removed `encoding` keyword.
                        data = json.loads(line)
                    except ValueError:
                        # BUG FIX: message previously said "Business";
                        # also skip malformed lines instead of reusing
                        # an unbound/stale `data`.
                        print("Error in User json file")
                        done += 1
                        self.progess_bar.print_progress(done,
                                                        count,
                                                        prefix='Progress:',
                                                        suffix='Complete')
                        continue

                    # Insert into DB
                    assert (data['type'] == 'user')
                    user_collection.insert(data)

                    done += 1
                    self.progess_bar.print_progress(done,
                                                    count,
                                                    prefix='Progress:',
                                                    suffix='Complete')

            user_collection.create_index('user_id')
# Exemplo n.º 4
# 0
def _one_run(projects_train, projects_test, relative_time, features,
             outlier_threshold, normalized, granularity):
    """
    Run the experiment once for an increasing time for some given parameters.

    :param projects_train:      Training projects set
    :param projects_test:       Test projects set
    :param relative_time:       Relative time, used as x axis
    :param features:            Features to extract from money time series
    :param outlier_threshold:   Threshold to discard outliers
    :param normalized:          Whether to use normalized money
    :param granularity:         Level of granularity
    :return:                    RMSE and accuracy for this experiment
    """
    rmse_run = []
    accuracy_run = []
    bar = ProgressBar(end_value=len(relative_time),
                      text="Time steps",
                      count=True)
    bar.start()

    # Remove outliers: keep projects whose money never exceeds the threshold.
    projects_train_filtered = [
        p for p in projects_train
        if np.all([(m - outlier_threshold) <= 0 for m in p.money])
    ]
    projects_test_filtered = [
        p for p in projects_test
        if np.all([(m - outlier_threshold) <= 0 for m in p.money])
    ]

    for i, rel_t in enumerate(relative_time):
        # Data
        t = int(rel_t * 999)
        samples = subsample(t, granularity)
        # Assumes `features` maps the sampled series to a single scalar
        # (n_samples = 1) — TODO confirm for every feature extractor used.
        n_samples = 1
        T = 999  # index of the final time step

        X_train = np.ndarray(shape=(len(projects_train_filtered), n_samples),
                             buffer=np.array([
                                 features(p.money[samples])
                                 for p in projects_train_filtered
                             ]),
                             dtype=float)
        y_train = np.expand_dims(np.array(
            [p.money[T] for p in projects_train_filtered]),
                                 axis=1)
        X_test = np.ndarray(shape=(len(projects_test_filtered), n_samples),
                            buffer=np.array([
                                features(p.money[samples])
                                for p in projects_test_filtered
                            ]),
                            dtype=float)
        y_test = np.expand_dims(np.array(
            [p.money[T] for p in projects_test_filtered]),
                                axis=1)

        m = linear_model.LinearRegression()
        m.fit(X_train, y_train)

        # BUG FIX: pass the *filtered* test projects so they line up
        # row-for-row with X_test / y_test (previously the unfiltered
        # list was passed).
        rmse, accuracy = _evaluate(X_test, y_test, projects_test_filtered, m,
                                   normalized)
        rmse_run.append(rmse)
        accuracy_run.append(accuracy)

        bar.update(i)

    return rmse_run, accuracy_run
def one_run(projects_train, projects_test, K, outlier_threshold, granularity):
    """Run the mixture-of-least-squares experiment across 20 time steps.

    For each relative time, outliers are dropped, the subsampled money
    series are regressed against the final amount with a K-component
    least-squares mixture, and the per-step metrics are accumulated.

    :param projects_train:      Training projects set
    :param projects_test:       Test projects set
    :param K:                   Number of mixture components
    :param outlier_threshold:   Threshold to discard outliers
    :param granularity:         Level of granularity passed to subsample
    :return:                    Lists of failed/success/overall RMSE and accuracy
    """
    results = {"rmse_failed": [], "rmse_success": [],
               "rmse": [], "accuracy": []}
    relative_time = np.linspace(0.025, 1, 20)
    bar = ProgressBar(end_value=len(relative_time),
                      text="Time steps",
                      count=True)
    bar.start()

    final_idx = 999  # index of the last time step

    def within_threshold(project, sample_idx):
        # Keep projects whose money stays below the threshold at the final
        # step and at every sampled step.
        return (np.all((project.money[final_idx] - outlier_threshold) <= 0)
                and np.all(
                    (project.money[sample_idx] - outlier_threshold) <= 0))

    def design_matrix(kept, sample_idx, width):
        return np.ndarray(
            shape=(len(kept), width),
            buffer=np.array([p.money[sample_idx] for p in kept]),
            dtype=float)

    def targets(kept):
        return np.expand_dims(
            np.array([p.money[final_idx] for p in kept]), axis=1)

    for step, rel_t in enumerate(relative_time):
        # Data: subsample the time axis up to the current relative time.
        samples = subsample(int(rel_t * 999), granularity)
        width = len(samples)

        # Remove outliers
        kept_train = [p for p in projects_train
                      if within_threshold(p, samples)]
        kept_test = [p for p in projects_test
                     if within_threshold(p, samples)]

        X_train = design_matrix(kept_train, samples, width)
        y_train = targets(kept_train)
        X_test = design_matrix(kept_test, samples, width)
        y_test = targets(kept_test)

        #X_max = np.max(X_train, axis=0)
        #X_train = X_train / X_max[np.newaxis, :]
        #X_test = X_test / X_max[np.newaxis, :]

        # Hyperparameters
        mls = LeastSquaresMixture(X_train,
                                  y_train,
                                  K=K,
                                  beta=0.0001,
                                  lam=0,
                                  iterations=50,
                                  epsilon=1e0,
                                  random_restarts=None)
        mls.train(verbose=False)
        #print(mls)

        rmse_failed, rmse_success, rmse, accuracy = mls.evaluate(
            X_test, y_test, verbose=False)
        results["rmse_failed"].append(rmse_failed)
        results["rmse_success"].append(rmse_success)
        results["rmse"].append(rmse)
        results["accuracy"].append(accuracy)

        bar.update(step)

    print(results["accuracy"])

    return (results["rmse_failed"], results["rmse_success"],
            results["rmse"], results["accuracy"])