import logging
import time

from peewee import fn
from progressbar import ProgressBar, Percentage, Bar, RotatingMarker, ETA, Counter

# Project-level names assumed to be defined elsewhere in this module:
# Dataset, Resource, make_request, default_requests_session, trim_char_data,
# URL, DEFAULT_PARAMS, DATA_FORMAT, REQUEST_DELAY.


def main(show_progress, *args, **kwargs):

    # Create a new fetch index for the records fetched.
    last_fetch_index = Dataset.select(fn.Max(Dataset.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Set up progress bar.
    if show_progress:
        progress_bar = ProgressBar(widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Fetched metadata for ', Counter(), ' datasets.'
        ])
        progress_bar.start()

    # Fetch all pages of datasets.
    datasets_fetched = 0
    last_page = False
    while not last_page:

        params = DEFAULT_PARAMS.copy()
        params['start'] = datasets_fetched
        resp = make_request(default_requests_session.get, URL, params=params).json()
        if not resp['success']:
            logging.error("Request to URL %s was unsuccessful", URL)
            # Stop paginating: the response carries no usable 'result' payload.
            break

        result = resp['result']
        num_datasets = len(result['results'])
        datasets_fetched += num_datasets

        if show_progress:
            # We can finally initialize the total number of datasets expected
            # only after we get the first round of results.
            progress_bar.maxval = result['count']
            progress_bar.update(datasets_fetched)

        for dataset in result['results']:

            dataset_record = Dataset.create(
                dataset_id=dataset['id'],
                title=trim_char_data(dataset['title']),
                license_title=trim_char_data(dataset['license_title']),
                fetch_index=fetch_index,
            )

            for resource in dataset['resources']:
                if resource['format'] == DATA_FORMAT:
                    Resource.create(
                        resource_id=resource['id'],
                        dataset=dataset_record,
                        format=resource['format'],
                        url=resource['url'],
                    )

        # Enforce a pause between each fetch to be respectful to the API.
        time.sleep(REQUEST_DELAY)
        last_page = datasets_fetched >= result['count']

    if show_progress:
        progress_bar.finish()
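# --- Hypothetical sketch, not part of the original code ---
# trim_char_data() above is a project helper whose implementation isn't shown.
# A minimal sketch, assuming its job is to truncate values that would overflow
# a fixed-width char column (peewee's CharField defaults to 255 characters);
# the name and limit below are illustrative assumptions, not the real helper.
MAX_CHAR_FIELD_LENGTH = 255  # assumed column limit


def trim_char_data_sketch(value):
    # Truncate overly long strings; pass other values through unchanged.
    if isinstance(value, str):
        return value[:MAX_CHAR_FIELD_LENGTH]
    return value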
def update_model(self):
    """
    Update the XGBoost model (gbm) using the relevant data.

    This function uses the Films and Users models; please don't change them.

    :return: ROC AUC of the new model on a held-out test split
    """
    import pickle

    from sklearn.preprocessing import normalize
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    # Module-level names assumed from the surrounding project: Users, Dataset,
    # model_to_dict (playhouse.shortcuts), xgb (xgboost), Pool and
    # CatBoostClassifier (catboost), MODEL, PATH_TO_DIR, and not_test.

    # Load the list of users in batches of 100 to keep queries small.
    count = Users.select().count()
    users = []
    for i in range(0, count, 100):
        usrs = Users.select().offset(i).limit(100).execute()
        for u in usrs:
            users.append(model_to_dict(u))

    # Collect the training dataset in batches of 200, in a stable order.
    dataset_count = Dataset.select().count()
    dataset = []
    for i in range(0, dataset_count, 200):
        data = Dataset.select().order_by(
            Dataset.id).offset(i).limit(200).execute()
        for d in data:
            dataset.append(model_to_dict(d))
    dataset = self.filtr(dataset)
    dataset = [{
        "data": self.full_data(d["film"], d["user"]),
        "result": d["result"]
    } for d in dataset]

    X = [d["data"] for d in dataset]
    Y = [int(d["result"] > 0) for d in dataset]
    X = normalize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.1, random_state=42)

    # Train the new model.
    if MODEL == "gbm":
        model = xgb.XGBClassifier(
            max_depth=7,
            n_estimators=1600,
            learning_rate=0.01,
            subsample=0.3,
            # gamma=300,
            colsample_bytree=0.3).fit(X_train, y_train)
    else:
        pool = Pool(X_train, y_train)
        model = CatBoostClassifier(iterations=1600,
                                   learning_rate=0.01,
                                   depth=5,
                                   random_seed=7)
        model.fit(pool)

    @not_test
    def save():
        # Save the trained model to disk.
        pickle.dump(model, open(PATH_TO_DIR + "model", "wb"))

    # Compute ROC AUC on the held-out split. Note the argument order:
    # roc_auc_score(y_true, y_score), scored with raw probabilities rather
    # than thresholded labels.
    predictions = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, predictions)
    return auc
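# --- Hypothetical sketch, not part of the original code ---
# The @not_test decorator applied to save() above is project-specific and not
# shown here. Since save() is defined but never called explicitly, a minimal
# sketch consistent with that usage is a decorator that invokes the wrapped
# function eagerly unless a testing flag is set; the real implementation may
# well differ.
import os


def not_test_sketch(func):
    # Run side-effectful helpers immediately, but skip them under tests.
    if not os.environ.get('TESTING'):
        func()
    return func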