Example #1
def main(show_progress, *args, **kwargs):
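    """
    Fetch metadata for every dataset exposed by the API, paging through the
    results, and store a Dataset record (plus its matching Resource records)
    under a new fetch_index.

    Names such as Dataset, Resource, make_request, default_requests_session,
    URL, DEFAULT_PARAMS, DATA_FORMAT, REQUEST_DELAY and trim_char_data are
    assumed to be defined elsewhere in the module this snippet comes from.
    """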

    # Create a new fetch index for the records fetched.
    last_fetch_index = Dataset.select(fn.Max(Dataset.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Set up progress bar
    if show_progress:
        progress_bar = ProgressBar(widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Fetched metadata for ', Counter(), ' datasets.'
        ])
        progress_bar.start()

    # Fetch all pages of datasets
    datasets_fetched = 0
    last_page = False
    while not last_page:

        params = DEFAULT_PARAMS.copy()
        params['start'] = datasets_fetched
        resp = make_request(default_requests_session.get, URL, params=params).json()

        if not resp['success']:
            logging.error("Request to URL %s was unsuccessful", URL)
            # Don't try to parse a failed response; stop paging here.
            break

        result = resp['result']
        num_datasets = len(result['results'])
        datasets_fetched += num_datasets

        if show_progress:
            # We can finally initialize the total number of datasets expected
            # only after we get the first round of results.
            progress_bar.maxval = result['count']
            progress_bar.update(datasets_fetched)

        for dataset in result['results']:

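            # Record the dataset, tagged with this run's fetch index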
            dataset_record = Dataset.create(
                dataset_id=dataset['id'],
                title=trim_char_data(dataset['title']),
                license_title=trim_char_data(dataset['license_title']),
                fetch_index=fetch_index,
            )

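            # Keep only resources whose format matches the configured DATA_FORMAT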
            for resource in dataset['resources']:
                if resource['format'] == DATA_FORMAT:
                    Resource.create(
                        resource_id=resource['id'],
                        dataset=dataset_record,
                        format=resource['format'],
                        url=resource['url'],
                    )

        time.sleep(REQUEST_DELAY)  # pause between fetches to be respectful to the API
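        # The API reports the total count, so stop once we've fetched them all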
        last_page = datasets_fetched >= result['count']

    if show_progress:
        progress_bar.finish()
Example #2
    def update_model(self):
        """
        Update XGboost model (gbm), using relevant data.
        This function using model Films and Users, please don't change their.
        :return: accuracy of the new model
        """

        # load the list of users
        count = Users.select().count()
        users = []
        for i in range(0, count, 100):
            usrs = Users.select().offset(i).limit(100).execute()
            for u in usrs:
                users.append(model_to_dict(u))

        # collect dataset
        dataset = []
        for i in range(0, count, 200):
            data = Dataset.select().order_by(
                Dataset.id).offset(i).limit(200).execute()
            for d in data:
                dataset.append(model_to_dict(d))
        dataset = self.filtr(dataset)
        dataset = [{
            "data": self.full_data(d["film"], d["user"]),
            "result": d["result"]
        } for d in dataset]

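        # Feature matrix and binary target: any positive result counts as class 1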
        X = [d["data"] for d in dataset]
        Y = [int(d["result"] > 0) for d in dataset]

        from sklearn.preprocessing import normalize
        X = normalize(X)

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.1,
                                                            random_state=42)

        # train the new model
        if MODEL == "gbm":
            model = xgb.XGBClassifier(
                max_depth=7,
                n_estimators=1600,
                learning_rate=0.01,
                subsample=0.3,
                # gamma=300,
                colsample_bytree=0.3).fit(X_train, y_train)

        else:
            pool = Pool(X_train, y_train)
            model = CatBoostClassifier(iterations=1600,
                                       learning_rate=0.01,
                                       depth=5,
                                       random_seed=7)
            model.fit(pool)

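        # Persist the trained model to disk (the not_test decorator is assumed
        # to exclude this step when running under tests).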
        @not_test
        def save():  # save model
            import pickle
            with open(PATH_TO_DIR + "model", "wb") as f:
                pickle.dump(model, f)

        # compute ROC AUC on the held-out test set
        predictions = model.predict_proba(X_test)[:, 1]
        from sklearn.metrics import roc_auc_score
        test = roc_auc_score(y_test, predictions)

        return test