def main(train_model_window=None):
    ti = time.monotonic()
    # dict to save dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    partial_listens_df = get_listens_for_training_model_window(to_date, from_date, metadata,
                                                               path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'.format(
        from_date=from_date, to_date=to_date, listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = get_unique_rows_from_mapping(df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, msid_mbid_mapping_df)
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)
    save_playcounts_df(listens_df, recordings_df, users_df, metadata)

    generate_dataframe_id(metadata)
    save_dataframe_metadata_to_hdfs(metadata)

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_musicbrainz_data_itr = get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df)

    messages = prepare_messages(missing_musicbrainz_data_itr, from_date, to_date, ti)

    return messages
def __init__(self):
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    try:
        listenbrainz_spark.init_spark_session('uploader')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        sys.exit(-1)
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine. """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err), err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error('{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'.format(
                type(err).__name__))
            logging.warning('Deleting directories recursively will delete all the recommendation data.')
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error('{}: An error occurred while deleting directories recursively.\n{}\nAborting...'.format(
                type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            print('Done!')
        except HdfsError as err:
            logging.error('{}: An error occurred while creating some/all directories.\n{}\nAborting...'.format(
                type(err).__name__, str(err)))
            sys.exit(-1)
def main():
    listenbrainz_spark.init_spark_session('artist_popularity')
    mlhd_df_path = config.HDFS_CLUSTER_URI + os.path.join(MLHD_DATA_PATH, '*.avro')
    try:
        print('Loading MLHD Dataframe...')
        mlhd_df = listenbrainz_spark.sql_context.read.format('avro').load(mlhd_df_path)
        print("Loaded!")
    except AnalysisException as e:
        logger.critical("Error while reading MLHD avro files: %s", str(e))
        raise
    print("Number of rows: %d" % mlhd_df.count())

    try:
        mlhd_df.registerTempTable('mlhd')
    except AnalysisException as e:
        logger.critical("Error while registering dataframe mlhd: %s", str(e))
        raise

    for _ in range(5):
        try:
            print("Running SQL...")
            artist_popularity_df = listenbrainz_spark.sql_context.sql("""
                SELECT artist_mbid, COUNT(artist_mbid) as cnt
                  FROM mlhd
              GROUP BY artist_mbid
              ORDER BY cnt DESC
            """)
            break
        except Py4JJavaError as e:
            logger.error("error while running the query: %s", str(e))
    else:
        logger.critical("Could not run query. Exiting...")
        sys.exit(-1)

    print("Number of rows:", artist_popularity_df.count())
    artist_popularity_df.show()

    print("Saving...")
    # datetime.now is a method: it must be called before formatting the timestamp.
    file_name = 'mlhd-artist-popularity-%s.csv' % datetime.now().strftime('%Y%m%d-%H%M%S')
    csv_path = config.HDFS_CLUSTER_URI + os.path.join(MLHD_DATA_PATH, 'csv', file_name)
    for _ in range(10):
        try:
            artist_popularity_df.write.csv(csv_path)
            break
        except Exception as e:
            logger.error("Couldn't write result to CSV, trying again, error: %s", str(e))
    else:
        logger.critical("Could not write results to HDFS, exiting...")
        sys.exit(-1)
    print("Saved to %s!" % csv_path)
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
        recordings_df = utils.read_files_from_HDFS(path.RECORDINGS_DATAFRAME_PATH)
        top_artists_candidate_set = utils.read_files_from_HDFS(path.TOP_ARTIST_CANDIDATE_SET)
        similar_artists_candidate_set = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_CANDIDATE_SET)
        mapped_listens = utils.read_files_from_HDFS(path.MAPPED_LISTENS)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']
        user_names = recommendation_metadata['user_name']

    best_model_path = path.DATA_DIR + '/' + best_model_id

    current_app.logger.info('Loading model...')
    t0 = time()
    try:
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
    except Py4JJavaError as err:
        current_app.logger.error('Unable to load model "{}"\n{}\nAborting...'.format(
            best_model_id, str(err.java_exception)), exc_info=True)
        sys.exit(-1)
    time_['load_model'] = '{:.2f}'.format((time() - t0) / 60)

    # persist() is lazy: an action must follow it to actually cache the data in memory.
    recordings_df.persist()
    recordings_df.count()

    t0 = time()
    recommendations = get_recommendations(user_names, recordings_df, model, users_df,
                                          top_artists_candidate_set, similar_artists_candidate_set, mapped_listens)
    time_['total_recommendation_time'] = '{:.2f}'.format((time() - t0) / 3600)

    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    if SAVE_RECOMMENDATION_HTML:
        get_recommendation_html(recommendations, time_, best_model_id, ti)
def main():
    ti = time()
    # dict to save dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Dataframe containing all columns except artist_mbids and recording_mbid.
    to_date, from_date = get_dates_to_train_data()
    partial_listens_df = get_listens_for_training_model_window(to_date, from_date, metadata,
                                                               path.LISTENBRAINZ_DATA_DIRECTORY)

    # Dataframe containing recording msid->mbid and artist msid->mbid mappings.
    recording_artist_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)

    # Dataframe containing all fields that a listen should have, including artist_mbids and recording_mbid.
    complete_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, recording_artist_mapping_df)

    current_app.logger.info('Preparing users data and saving to HDFS...')
    t0 = time()
    users_df = get_users_dataframe(complete_listens_df, metadata)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    t0 = time()
    recordings_df = get_recordings_df(complete_listens_df, metadata)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    t0 = time()
    listens_df = get_listens_df(complete_listens_df, metadata)
    playcounts_df = get_playcounts_df(listens_df, recordings_df, users_df, metadata)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)

    total_time = '{:.2f}'.format((time() - ti) / 60)
    generate_best_model_id(metadata)
    save_dataframe_metadata_to_HDFS(metadata)
    if SAVE_DATAFRAME_HTML:
        save_dataframe_html(users_df_time, recordings_df_time, playcounts_df_time, total_time)
def main(train_model_window=None):
    ti = time.monotonic()
    # dict to save dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    to_date, from_date = get_dates_to_train_data(train_model_window)
    partial_listens_df = get_listens_for_training_model_window(to_date, from_date, metadata,
                                                               path.LISTENBRAINZ_DATA_DIRECTORY)

    # Dataframe containing recording msid->mbid and artist msid->mbid mappings.
    msid_mbid_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)

    mapped_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, msid_mbid_mapping_df)

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)
    playcounts_df = get_playcounts_df(listens_df, recordings_df, users_df, metadata)

    generate_dataframe_id(metadata)
    save_dataframe_metadata_to_hdfs(metadata)

    total_time = '{:.2f}'.format((time.monotonic() - ti) / 60)
    message = [{
        'type': 'cf_recording_dataframes',
        'dataframe_upload_time': str(datetime.utcnow()),
        'total_time': total_time,
        'from_date': str(from_date.strftime('%b %Y')),
        'to_date': str(to_date.strftime('%b %Y')),
    }]

    return message
def main(max_num_users: int):
    logger.info('Start generating similar user matrix')
    try:
        listenbrainz_spark.init_spark_session('User Similarity')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        playcounts_df = utils.read_files_from_HDFS(path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(path.USER_SIMILARITY_USERS_DATAFRAME)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    vectors_df = get_vectors_df(playcounts_df)
    similarity_matrix = Correlation.corr(vectors_df, 'vector', 'pearson').first()['pearson(vector)'].toArray()
    similar_users = threshold_similar_users(similarity_matrix, max_num_users)

    # Due to an unresolved bug in Spark (https://issues.apache.org/jira/browse/SPARK-10925), we cannot join twice
    # on the same dataframe. Hence, we create a modified dataframe with the columns renamed.
    other_users_df = users_df\
        .withColumnRenamed('user_id', 'other_user_id')\
        .withColumnRenamed('user_name', 'other_user_name')

    similar_users_df = listenbrainz_spark.session.createDataFrame(
            similar_users, ['user_id', 'other_user_id', 'similarity', 'global_similarity'])\
        .join(users_df, 'user_id', 'inner')\
        .join(other_users_df, 'other_user_id', 'inner')\
        .select('user_name', struct('other_user_name', 'similarity', 'global_similarity').alias('similar_user'))\
        .groupBy('user_name')\
        .agg(collect_list('similar_user').alias('similar_users'))

    logger.info('Finished generating similar user matrix')

    return create_messages(similar_users_df)
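# The Correlation.corr() call in main() above packs playcounts into Spark ML vectors before
# computing Pearson similarity. Below is a minimal, self-contained sketch of that pattern;
# the SparkSession and the toy vectors are assumptions for illustration only, not the
# production get_vectors_df() output. Each vector slot corresponds to one user, so corr()
# over the 'vector' column yields a user x user Pearson correlation matrix.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

spark = SparkSession.builder.appName('correlation-demo').getOrCreate()
# One row per recording; slot i of each vector is user i's playcount for that recording.
vectors_df = spark.createDataFrame(
    [(Vectors.dense([1.0, 0.0, 3.0]),),
     (Vectors.dense([2.0, 0.0, 1.0]),),
     (Vectors.dense([0.0, 5.0, 0.0]),)],
    ['vector'])
# corr() returns a one-row dataframe whose single cell holds the full matrix.
matrix = Correlation.corr(vectors_df, 'vector', 'pearson').first()['pearson(vector)'].toArray()
print(matrix)  # symmetric 3x3 matrix: entry (i, j) is the similarity of users i and j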
def main(recommendation_top_artist_limit=None, recommendation_similar_artist_limit=None, users=None):
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    try:
        recordings_df = utils.read_files_from_HDFS(path.RECORDINGS_DATAFRAME_PATH)
        top_artist_candidate_set_df = utils.read_files_from_HDFS(path.TOP_ARTIST_CANDIDATE_SET)
        similar_artist_candidate_set_df = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_CANDIDATE_SET)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Loading model...')
    model = load_model()

    # persist() is lazy: an action must follow it to actually cache the data in memory.
    recordings_df.persist()
    recordings_df.count()

    params = RecommendationParams(recordings_df, model, top_artist_candidate_set_df,
                                  similar_artist_candidate_set_df,
                                  recommendation_top_artist_limit,
                                  recommendation_similar_artist_limit)

    messages = get_recommendations_for_all(params, users)

    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    return messages
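# A short aside on the persist()/count() idiom used in main() above: persist() only marks a
# dataframe for caching; the first action after it materialises the cache, and unpersist()
# frees it. A minimal sketch, assuming an existing SparkSession named `spark`:
df = spark.range(1000)
df.persist()    # lazy: nothing is cached yet
df.count()      # first action: rows are computed and stored in memory
df.count()      # subsequent actions are served from the cache
df.unpersist()  # release the cached blocks once the dataframe is no longer needed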
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Create_Dataframe')
    except AttributeError as err:
        logging.error('Cannot initialize Spark Session: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while initializing Spark session: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    df = None
    missing_parquets = []
    for y in range(config.STARTING_YEAR, config.ENDING_YEAR + 1):
        for m in range(config.STARTING_MONTH, config.ENDING_MONTH + 1):
            try:
                month = listenbrainz_spark.sql_context.read.parquet(
                    '{}/data/listenbrainz/{}/{}.parquet'.format(config.HDFS_CLUSTER_URI, y, m))
                df = df.union(month) if df else month
            except AnalysisException as err:
                missing_parquets.append('{}-{:02d}'.format(y, m))
                logging.error('Cannot read parquet files from HDFS: {} \n {}'.format(type(err).__name__, str(err)))
                continue
            except Exception as err:
                logging.error('An error occurred while fetching "/data/listenbrainz/{}/{}.parquet": {} \n {}.'
                              ' Aborting...'.format(y, m, type(err).__name__, str(err)), exc_info=True)
                sys.exit(-1)

    if not df:
        raise SystemExit('Parquet files from {}-{:02d} to {}-{:02d} are empty'.format(
            config.STARTING_YEAR, config.STARTING_MONTH, config.ENDING_YEAR, config.ENDING_MONTH))

    logging.info('Registering Dataframe...')
    table = 'df_to_train_{}'.format(datetime.strftime(datetime.utcnow(), '%Y_%m_%d'))
    try:
        df.createOrReplaceTempView(table)
    except AnalysisException as err:
        logging.error('Cannot register dataframe: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while registering dataframe: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    t = '{:.2f}'.format(time() - ti)
    logging.info('Files fetched from HDFS and dataframe registered in {}s'.format(t))

    dest_path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz', 'recommendation-engine', 'dataframes')

    logging.info('Preparing users data and saving to HDFS...')
    try:
        t0 = time()
        users_df = prepare_user_data(table)
        users_df.write.format('parquet').save(dest_path + '/users_df.parquet', mode='overwrite')
    except QueryExecutionException as err:
        logging.error('Failed to execute users query: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error('Failed to analyse users query plan: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error('Failed to parse SQL command: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while executing users query: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info('Preparing recordings data and saving to HDFS...')
    try:
        t0 = time()
        recordings_df = prepare_recording_data(table)
        recordings_df.write.format('parquet').save(dest_path + '/recordings_df.parquet', mode='overwrite')
    except QueryExecutionException as err:
        logging.error('Failed to execute recordings query: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error('Failed to analyse recordings query plan: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error('Failed to parse SQL command: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while executing recordings query: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    try:
        t0 = time()
        listens_df = prepare_listen_data(table)
        playcounts_df = get_playcounts_data(listens_df, users_df, recordings_df)
        playcounts_df.write.format('parquet').save(dest_path + '/playcounts_df.parquet', mode='overwrite')
    except QueryExecutionException as err:
        logging.error('Failed to execute playcounts query: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error('Failed to analyse playcounts query plan: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error('Failed to parse SQL command: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred. {} \n {}. Aborting...'.format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)

    total_time = '{:.2f}'.format((time() - ti) / 60)
    lb_dump_time_window = ('{}-{:02d}'.format(config.STARTING_YEAR, config.STARTING_MONTH),
                           '{}-{:02d}'.format(config.ENDING_YEAR, config.ENDING_MONTH))
    date = datetime.utcnow().strftime('%Y-%m-%d')
    queries_html = 'Queries-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'users_df_time': users_df_time,
        'recordings_df_time': recordings_df_time,
        'playcounts_df_time': playcounts_df_time,
        'lb_dump_time_window': lb_dump_time_window,
        'missing_parquets': missing_parquets,
        'total_time': total_time,
    }
    utils.save_html(queries_html, context, 'queries.html')
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Train_Models')
    except AttributeError as err:
        logging.error('Cannot initialize Spark Session: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while initializing Spark session: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join('/', 'data', 'listenbrainz', 'recommendation-engine', 'dataframes',
                            'playcounts_df.parquet')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(config.HDFS_CLUSTER_URI + path)
    except AnalysisException as err:
        logging.error('Cannot read parquet file from HDFS: {} \n {}'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while fetching parquet: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    time_info = {}
    time_info['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_info['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # RDDs used in the iterative model training are cached to improve performance.
    # Caching large files may cause an Out of Memory exception.
    training_data.persist()
    validation_data.persist()
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    logging.info('Training models...')
    try:
        t0 = time()
        model, model_metadata, best_model_metadata = train(training_data, validation_data, num_validation,
                                                           config.RANKS, config.LAMBDAS, config.ITERATIONS)
        models_training_time = '{:.2f}'.format((time() - t0) / 3600)
    except Py4JJavaError as err:
        logging.error('Unable to train models: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        # exc_info is a logging.error keyword argument; it must not be passed to str.format.
        logging.error('An error occurred while training models: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    training_data.unpersist()
    validation_data.unpersist()

    logging.info('Saving model...')
    try:
        t0 = time()
        path = os.path.join('/', 'data', 'listenbrainz', 'recommendation-engine', 'best-model',
                            '{}'.format(best_model_metadata['model_id']))
        model.model.save(listenbrainz_spark.context, config.HDFS_CLUSTER_URI + path)
        time_info['save_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error("Unable to save model: {} \n {}. Aborting...".format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while saving model: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    date = datetime.utcnow().strftime('%Y-%m-%d')
    model_html = 'Model-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'time': time_info,
        'num_training': '{:,}'.format(num_training),
        'num_validation': '{:,}'.format(num_validation),
        'num_test': '{:,}'.format(num_test),
        'models': model_metadata,
        'best_model': best_model_metadata,
        'models_training_time': models_training_time,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
    }
    utils.save_html(model_html, context, 'model.html')

    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata['model_id']
    with open(path, 'w') as f:
        json.dump(recommendation_metadata, f)
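# preprocess_data() above is not shown in this file. A hedged sketch of what it plausibly
# does: turn playcount rows into pyspark.mllib Ratings and split them into training,
# validation, and test sets. The column names and the 3:1:1 split ratio are assumptions
# for illustration, not the production implementation.
from pyspark.mllib.recommendation import Rating

def preprocess_data_sketch(playcounts_df):
    ratings = playcounts_df.rdd.map(
        lambda row: Rating(row['user_id'], row['recording_id'], row['count']))
    # randomSplit normalises the weights, so [3, 1, 1] means a 60/20/20 split.
    training, validation, test = ratings.randomSplit([3.0, 1.0, 1.0])
    return training, validation, test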
def main(app_name):
    listenbrainz_spark.init_spark_session(app_name)
    RequestConsumer().run()
def main(app_name, archive):
    listenbrainz_spark.init_spark_session(app_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    print('Copying extracted dump to HDFS...')
    copy_to_hdfs(archive)
    print('Done!')
def main(app_name):
    listenbrainz_spark.init_spark_session(app_name)
    global rc
    rc = RequestConsumer()
    rc.run()
def calculate_dataframes(from_date, to_date, job_type, minimum_listens_threshold):
    if job_type == "recommendation_recording":
        paths = {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        }
    elif job_type == "similar_users":
        paths = {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        }
    else:
        raise SparkException("Invalid job_type parameter received for creating dataframes: " + job_type)

    # dict to save dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    complete_listens_df = get_listens_from_new_dump(from_date, to_date)
    logger.info(f'Listen count from {from_date} to {to_date}: {complete_listens_df.count()}')

    logger.info('Discarding listens without mbids...')
    partial_listens_df = complete_listens_df.where(col('recording_mbid').isNotNull())
    logger.info(f'Listen count after discarding: {partial_listens_df.count()}')

    logger.info('Thresholding listens...')
    threshold_listens_df = get_threshold_listens_df(partial_listens_df, paths["mapped_listens"],
                                                    minimum_listens_threshold)
    logger.info(f'Listen count after thresholding: {threshold_listens_df.count()}')

    logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(threshold_listens_df, metadata, paths["users"])

    logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(threshold_listens_df, metadata, paths["recordings"])

    logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(threshold_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, metadata, paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])

    return complete_listens_df
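# get_threshold_listens_df() above is not defined in this file. A hedged sketch of the
# thresholding idea: keep only the listens of users who have at least `threshold` listens
# in the window. The column names are assumptions, and the production helper also saves
# the result to the given HDFS path.
from pyspark.sql import functions as func

def get_threshold_listens_df_sketch(listens_df, threshold):
    qualifying_users_df = listens_df \
        .groupBy('user_id') \
        .agg(func.count('recording_mbid').alias('listen_count')) \
        .where(func.col('listen_count') >= threshold) \
        .select('user_id')
    # inner join drops every listen belonging to a user below the threshold
    return listens_df.join(qualifying_users_df, 'user_id', 'inner')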
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Candidate_set')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    df = get_listens_for_rec_generation_window()
    if not df:
        current_app.logger.error('Listening history of past {} days does not exist'.format(
            config.RECOMMENDATION_GENERATION_WINDOW))
        sys.exit(-1)

    try:
        utils.register_dataframe(df, 'df')
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        listens_df = sql.get_listens_for_X_days()
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        artists_relation_df = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_DATAFRAME_PATH)
        recordings_df = utils.read_files_from_HDFS(path.RECORDINGS_DATAFRAME_PATH)
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    current_app.logger.info('Registering Dataframes...')
    try:
        utils.register_dataframe(listens_df, 'listens_df')
        utils.register_dataframe(recordings_df, 'recording')
        utils.register_dataframe(users_df, 'user')
        utils.register_dataframe(artists_relation_df, 'artists_relation')
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    current_app.logger.info('Files fetched from HDFS and dataframes registered in {:.2f}s'.format(time() - ti))

    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path) as f:
        recommendation_metadata = json.load(f)
        user_names = recommendation_metadata['user_name']

    user_data = defaultdict(dict)
    similar_artists_candidate_set_df = None
    top_artists_candidate_set_df = None
    for user_name in user_names:
        ts = time()
        try:
            user_id = get_user_id(user_name)
        except TypeError as err:
            current_app.logger.error('{}: Invalid user name. User "{}" does not exist.'.format(
                type(err).__name__, user_name))
            continue
        except SQLException as err:
            current_app.logger.error('User id for "{}" cannot be retrieved\n{}'.format(
                user_name, str(err)), exc_info=True)
            continue

        try:
            top_artists_df = sql.get_top_artists(user_name)
            top_artists_df.take(1)[0]
        except IndexError as err:
            current_app.logger.error('{}: {}\nNo top artists found, i.e. "{}" is either a new user or has an'
                                     ' empty listening history. Candidate sets cannot be generated'.format(
                                         type(err).__name__, str(err), user_name))
            continue
        except SQLException as err:
            current_app.logger.error('Top artists cannot be retrieved for "{}"\n{}'.format(
                user_name, str(err)), exc_info=True)
            continue

        try:
            similar_artists_df = get_similar_artists(top_artists_df, user_name)
        except IndexError as err:
            current_app.logger.error('{}\nGenerating recommendations for next user'.format(err))
            continue
        except SQLException as err:
            current_app.logger.error('Candidate sets not generated for "{}"\n{}'.format(
                user_name, str(err)), exc_info=True)
            continue

        try:
            utils.register_dataframe(similar_artists_df, 'similar_artist')
            utils.register_dataframe(top_artists_df, 'top_artist')
        except ViewNotRegisteredException as err:
            current_app.logger.error(str(err), exc_info=True)
            continue

        try:
            top_artists_recording_ids_df = get_top_artists_recording_ids(similar_artists_df, user_name, user_id)
        except SQLException as err:
            current_app.logger.error('Candidate sets could not be generated for "{}"\n{}'.format(
                user_name, str(err)), exc_info=True)
            continue
        top_artists_candidate_set_df = top_artists_candidate_set_df.union(top_artists_recording_ids_df) \
            if top_artists_candidate_set_df else top_artists_recording_ids_df

        try:
            similar_artists_recording_ids_df = get_similar_artists_recording_ids(
                similar_artists_df, top_artists_df, user_name, user_id)
        except IndexError as err:
            current_app.logger.error('{}\nGenerating recommendations for next user'.format(err))
            continue
        except SQLException as err:
            current_app.logger.error('Candidate sets could not be generated for "{}"\n{}'.format(
                user_name, str(err)), exc_info=True)
            continue
        similar_artists_candidate_set_df = similar_artists_candidate_set_df.union(similar_artists_recording_ids_df) \
            if similar_artists_candidate_set_df else similar_artists_recording_ids_df

        if SAVE_CANDIDATE_HTML:
            user_data[user_name]['artists'] = get_candidate_html_data(similar_artists_df, user_name)
            user_data[user_name]['time'] = '{:.2f}'.format(time() - ts)
        current_app.logger.info('candidate_set generated for "{}"'.format(user_name))

    try:
        save_candidate_sets(top_artists_candidate_set_df, similar_artists_candidate_set_df)
    except Py4JJavaError as err:
        current_app.logger.error('{}\nAborting...'.format(str(err.java_exception)), exc_info=True)
        sys.exit(-1)

    if SAVE_CANDIDATE_HTML:
        try:
            save_candidate_html(user_data)
        except SQLException as err:
            current_app.logger.error('Could not save candidate HTML\n{}'.format(str(err)), exc_info=True)
            sys.exit(-1)
def main(recommendation_top_artist_limit=None, recommendation_similar_artist_limit=None, users=None):
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        recordings_df = utils.read_files_from_HDFS(path.RECOMMENDATION_RECORDINGS_DATAFRAME)
        top_artist_candidate_set_df = utils.read_files_from_HDFS(path.RECOMMENDATION_RECORDING_TOP_ARTIST_CANDIDATE_SET)
        similar_artist_candidate_set_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDING_SIMILAR_ARTIST_CANDIDATE_SET)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    logger.info('Loading model...')
    model = load_model()

    # persist() is lazy: an action must follow it to actually cache the data in memory.
    recordings_df.persist()
    recordings_df.count()

    params = RecommendationParams(recordings_df, model, top_artist_candidate_set_df,
                                  similar_artist_candidate_set_df,
                                  recommendation_top_artist_limit,
                                  recommendation_similar_artist_limit)

    try:
        # timestamp when the script was invoked
        ts_initial = time.monotonic()
        users_df = get_user_name_and_user_id(params, users)
        # Some users are excluded from the top_artist_candidate_set because of the limited data
        # in the mapping. Therefore, active_user_count may or may not be equal to the number of
        # users active in the last week. Ideally, top_artist_candidate_set should give the
        # active user count.
        active_user_count = users_df.count()
        users_df.persist()
        logger.info('Took {:.2f}sec to get active user count'.format(time.monotonic() - ts_initial))
    except EmptyDataframeExcpetion as err:
        logger.error(str(err), exc_info=True)
        raise

    logger.info('Generating recommendations...')
    ts = time.monotonic()
    top_artist_rec_df, similar_artist_rec_df = get_recommendations_for_all(params, users)
    logger.info('Recommendations generated!')
    logger.info('Took {:.2f}sec to generate recommendations for all active users'.format(time.monotonic() - ts))

    ts = time.monotonic()
    top_artist_rec_user_count = get_user_count(top_artist_rec_df)
    similar_artist_rec_user_count = get_user_count(similar_artist_rec_df)
    logger.info('Took {:.2f}sec to get top artist and similar artist user count'.format(time.monotonic() - ts))

    ts = time.monotonic()
    check_for_ratings_beyond_range(top_artist_rec_df, similar_artist_rec_df)
    top_artist_rec_scaled_df = scale_rating(top_artist_rec_df)
    similar_artist_rec_scaled_df = scale_rating(similar_artist_rec_df)
    logger.info('Took {:.2f}sec to scale the ratings'.format(time.monotonic() - ts))

    ts = time.monotonic()
    top_artist_rec_mbid_df = get_recording_mbids(params, top_artist_rec_scaled_df, users_df)
    similar_artist_rec_mbid_df = get_recording_mbids(params, similar_artist_rec_scaled_df, users_df)
    logger.info('Took {:.2f}sec to get mbids corresponding to recording ids'.format(time.monotonic() - ts))

    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    total_time = time.monotonic() - ts_initial
    logger.info('Total time: {:.2f}sec'.format(total_time))

    result = create_messages(top_artist_rec_mbid_df, similar_artist_rec_mbid_df, active_user_count, total_time,
                             top_artist_rec_user_count, similar_artist_rec_user_count)

    users_df.unpersist()

    return result
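# scale_rating() and check_for_ratings_beyond_range() above are not defined in this file.
# A hedged sketch of the intent: raw ALS scores land roughly in the [-1, 1] band, and the
# scaling step maps them into [0, 1] for presentation. The exact transform here is an
# assumption for illustration only.
from pyspark.sql import functions as func

def scale_rating_sketch(df):
    # linear map from [-1, 1] to [0, 1], clamped to guard against outliers
    scaled = (func.col('rating') + 1.0) / 2.0
    return df.withColumn('rating', func.least(func.lit(1.0), func.greatest(func.lit(0.0), scaled)))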
def main(recommendation_generation_window=None, top_artist_limit=None, similar_artist_limit=None,
         users=None, html_flag=False):
    time_initial = time.monotonic()
    try:
        listenbrainz_spark.init_spark_session('Candidate_set')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        mapped_listens_df = utils.read_files_from_HDFS(path.RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        recordings_df = utils.read_files_from_HDFS(path.RECOMMENDATION_RECORDINGS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(path.RECOMMENDATION_RECORDING_USERS_DATAFRAME)
        artist_relation_df = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_DATAFRAME_PATH)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    from_date, to_date = get_dates_to_generate_candidate_sets(mapped_listens_df, recommendation_generation_window)

    logger.info('Fetching listens to get top artists...')
    mapped_listens_subset = get_listens_to_fetch_top_artists(mapped_listens_df, from_date, to_date)

    logger.info('Fetching top artists...')
    top_artist_df = get_top_artists(mapped_listens_subset, top_artist_limit, users)

    logger.info('Preparing top artists candidate set...')
    top_artist_candidate_set_df, top_artist_candidate_set_df_html = get_top_artist_candidate_set(
        top_artist_df, recordings_df, users_df, mapped_listens_subset)

    logger.info('Fetching similar artists...')
    similar_artist_df, similar_artist_df_html = get_similar_artists(top_artist_df, artist_relation_df,
                                                                    similar_artist_limit)

    logger.info('Preparing similar artists candidate set...')
    similar_artist_candidate_set_df, similar_artist_candidate_set_df_html = get_similar_artist_candidate_set(
        similar_artist_df, recordings_df, users_df, mapped_listens_subset)

    logger.info('Saving candidate sets...')
    save_candidate_sets(top_artist_candidate_set_df, similar_artist_candidate_set_df)
    logger.info('Done!')

    # time taken to generate candidate sets
    total_time = '{:.2f}'.format((time.monotonic() - time_initial) / 60)
    if html_flag:
        user_data = get_candidate_html_data(similar_artist_candidate_set_df_html,
                                            top_artist_candidate_set_df_html,
                                            top_artist_df, similar_artist_df_html)
        logger.info('Saving HTML...')
        save_candidate_html(user_data, total_time, from_date, to_date)
        logger.info('Done!')

    message = [{
        'type': 'cf_recommendations_recording_candidate_sets',
        'candidate_sets_upload_time': str(datetime.utcnow()),
        'total_time': total_time,
        'from_date': str(from_date),
        'to_date': str(to_date)
    }]

    return message
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI + path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    time_['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # RDDs used in the iterative model training are cached to improve performance.
    # Caching large files may cause an Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    # An action must be called for persist to take effect.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    current_app.logger.info('Training models...')
    t0 = time()
    model, model_metadata, best_model_metadata = train(training_data, validation_data, num_validation,
                                                       config.RANKS, config.LAMBDAS, config.ITERATIONS)
    models_training_time = '{:.2f}'.format((time() - t0) / 3600)

    try:
        best_model_test_rmse = compute_rmse(model.model, test_data, num_test)
    except Py4JJavaError as err:
        current_app.logger.error('Root mean squared error for best model for test data not computed\n{}\n'
                                 'Aborting...'.format(str(err.java_exception)), exc_info=True)
        sys.exit(-1)

    # Cached data must be cleared to avoid OOM.
    training_data.unpersist()
    validation_data.unpersist()

    current_app.logger.info('Saving model...')
    t0 = time()
    model_save_path = os.path.join(path.DATA_DIR, best_model_metadata['model_id'])
    save_model(model_save_path, best_model_metadata['model_id'], model)
    time_['save_model'] = '{:.2f}'.format((time() - t0) / 60)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    # Delete the checkpoint dir as saved lineages would eat up space; we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    if SAVE_TRAINING_HTML:
        save_training_html(time_, num_training, num_validation, num_test, model_metadata,
                           best_model_metadata, ti, models_training_time)

    # Save best model id to a JSON file
    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata['model_id']
    with open(metadata_file_path, 'w') as f:
        json.dump(recommendation_metadata, f)
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Candidate_set')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        mapped_df = utils.read_files_from_HDFS(path.MAPPED_LISTENS)
        recordings_df = utils.read_files_from_HDFS(path.RECORDINGS_DATAFRAME_PATH)
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
        artists_relation_df = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_DATAFRAME_PATH)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    listens_df = get_listens_for_rec_generation_window(mapped_df)

    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path) as f:
        recommendation_metadata = json.load(f)
        user_names = recommendation_metadata['user_name']

    user_data = defaultdict(dict)
    similar_artists_candidate_set_df = None
    top_artists_candidate_set_df = None
    for user_name in user_names:
        ts = time()
        try:
            user_id = get_user_id(users_df, user_name)
        except IndexError:
            current_app.logger.error('"{}" is a new or invalid user'.format(user_name))
            continue

        top_artists_df = get_top_artists(listens_df, user_name)
        top_artists_recording_ids_df = get_top_artists_recording_ids(top_artists_df, recordings_df, user_id)
        top_artists_candidate_set_df = top_artists_candidate_set_df.union(top_artists_recording_ids_df) \
            if top_artists_candidate_set_df else top_artists_recording_ids_df

        try:
            similar_artists_df = get_similar_artists(top_artists_df, artists_relation_df, user_name)
        except IndexError:
            continue
        similar_artists_recording_ids_df = get_similar_artists_recording_ids(similar_artists_df, recordings_df,
                                                                             user_id)
        similar_artists_candidate_set_df = similar_artists_candidate_set_df.union(similar_artists_recording_ids_df) \
            if similar_artists_candidate_set_df else similar_artists_recording_ids_df

        if SAVE_CANDIDATE_HTML:
            user_data[user_name]['artists'] = get_candidate_html_data(similar_artists_df, user_name)
            user_data[user_name]['time'] = '{:.2f}'.format(time() - ts)
        current_app.logger.info('candidate_set generated for "{}"'.format(user_name))

    try:
        save_candidate_sets(top_artists_candidate_set_df, similar_artists_candidate_set_df)
    except Py4JJavaError as err:
        current_app.logger.error('{}\nAborting...'.format(str(err.java_exception)), exc_info=True)
        sys.exit(-1)

    if SAVE_CANDIDATE_HTML:
        try:
            save_candidate_html(user_data)
        except SQLException as err:
            current_app.logger.error('Could not save candidate HTML\n{}'.format(str(err)), exc_info=True)
            sys.exit(-1)
def __init__(self):
    try:
        listenbrainz_spark.init_spark_session('uploader')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        sys.exit(-1)
def main(ranks=None, lambdas=None, iterations=None, alpha=None):
    # every model param must be supplied; abort if any is missing
    if ranks is None:
        current_app.logger.critical('model param "ranks" missing')
        raise

    if lambdas is None:
        current_app.logger.critical('model param "lambdas" missing')
        raise

    if iterations is None:
        current_app.logger.critical('model param "iterations" missing')
        raise

    if alpha is None:
        current_app.logger.critical('model param "alpha" missing')
        raise

    ti = time.monotonic()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI + path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
        dataframe_metadata_df = utils.read_files_from_HDFS(path.DATAFRAME_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error('{}\nConsider running create_dataframes.py'.format(str(err)), exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise
    time_['load_playcounts'] = '{:.2f}'.format((time.monotonic() - ti) / 60)

    t0 = time.monotonic()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    # An action must be called for persist to take effect.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    t0 = time.monotonic()
    best_model, model_metadata = get_best_model(training_data, validation_data, num_validation, ranks,
                                                lambdas, iterations, alpha)
    models_training_time = '{:.2f}'.format((time.monotonic() - t0) / 3600)

    best_model_metadata = get_best_model_metadata(best_model)
    current_app.logger.info("Calculating test RMSE for best model with model id: {}".format(best_model.model_id))
    best_model_metadata['test_rmse'] = compute_rmse(best_model.model, test_data, num_test, best_model.model_id)
    current_app.logger.info("Test RMSE calculated!")

    best_model_metadata['training_data_count'] = num_training
    best_model_metadata['validation_data_count'] = num_validation
    best_model_metadata['test_data_count'] = num_test
    best_model_metadata['dataframe_id'] = get_latest_dataframe_id(dataframe_metadata_df)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    t0 = time.monotonic()
    save_model(best_model.model_id, best_model.model)
    time_['save_model'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    save_model_metadata_to_hdfs(best_model_metadata)

    # Delete the checkpoint dir as saved lineages would eat up space; we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    if SAVE_TRAINING_HTML:
        current_app.logger.info('Saving HTML...')
        save_training_html(time_, num_training, num_validation, num_test, model_metadata,
                           best_model_metadata, ti, models_training_time)
        current_app.logger.info('Done!')

    message = [{
        'type': 'cf_recording_model',
        'model_upload_time': str(datetime.utcnow()),
        'total_time': '{:.2f}'.format(time.monotonic() - ti),
    }]

    return message
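# get_best_model() and compute_rmse() above live elsewhere in the codebase. A hedged sketch
# of the standard pyspark.mllib pattern they are expected to follow: grid-search
# ALS.trainImplicit over ranks/lambdas/iterations with the given alpha, scoring each
# candidate by validation RMSE. The exact production logic (and its model/metadata
# wrappers) is an assumption for illustration.
from math import sqrt
from operator import add
from pyspark.mllib.recommendation import ALS

def compute_rmse_sketch(model, data, n):
    """Root mean squared error of the model's predictions against held-out ratings."""
    predictions = model.predictAll(data.map(lambda x: (x.user, x.product)))
    predictions_and_ratings = predictions.map(lambda x: ((x.user, x.product), x.rating)) \
        .join(data.map(lambda x: ((x.user, x.product), x.rating))) \
        .values()
    return sqrt(predictions_and_ratings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))

def get_best_model_sketch(training, validation, num_validation, ranks, lambdas, iterations, alpha):
    best_model, best_rmse = None, float('inf')
    for rank in ranks:
        for lmbda in lambdas:
            for num_iterations in iterations:
                model = ALS.trainImplicit(training, rank, iterations=num_iterations,
                                          lambda_=lmbda, alpha=alpha)
                rmse = compute_rmse_sketch(model, validation, num_validation)
                if rmse < best_rmse:
                    best_model, best_rmse = model, rmse
    return best_model, best_rmse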
def main():
    ti = time()
    # dict to save dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    df = get_listens_for_training_model_window(metadata)
    if not df:
        current_app.logger.error('Parquet files containing listening history of past {} days missing'
                                 ' from HDFS'.format(config.TRAIN_MODEL_WINDOW))
        sys.exit(-1)

    current_app.logger.info('Registering Dataframe...')
    table = 'df_to_train_{}'.format(datetime.strftime(datetime.utcnow(), '%Y_%m_%d'))
    try:
        utils.register_dataframe(df, table)
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    current_app.logger.info('Files fetched from HDFS and dataframe registered in {:.2f}s'.format(time() - ti))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    t0 = time()
    try:
        users_df = sql.prepare_user_data(table)
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['users_count'] = users_df.count()
    try:
        utils.save_parquet(users_df, path.USERS_DATAFRAME_PATH)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    t0 = time()
    try:
        recordings_df = sql.prepare_recording_data(table)
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['recordings_count'] = recordings_df.count()
    try:
        utils.save_parquet(recordings_df, path.RECORDINGS_DATAFRAME_PATH)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    t0 = time()
    try:
        listens_df = sql.prepare_listen_data(table)
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['listens_count'] = listens_df.count()
    try:
        utils.register_dataframe(listens_df, 'listen')
        utils.register_dataframe(users_df, 'user')
        utils.register_dataframe(recordings_df, 'recording')
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    try:
        playcounts_df = sql.get_playcounts_data()
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['playcounts_count'] = playcounts_df.count()
    try:
        utils.save_parquet(playcounts_df, path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)

    total_time = '{:.2f}'.format((time() - ti) / 60)
    generate_best_model_id(metadata)
    save_dataframe_metadata_to_HDFS(metadata)
    if SAVE_DATAFRAME_HTML:
        save_dataframe_html(users_df_time, recordings_df_time, playcounts_df_time, total_time)
def main(train_model_window, job_type, minimum_listens_threshold=0):
    if job_type == "recommendation_recording":
        paths = {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        }
    elif job_type == "similar_users":
        paths = {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        }
    else:
        raise SparkException("Invalid job_type parameter received for creating dataframes: " + job_type)

    ti = time.monotonic()
    # dict to save dataframe metadata which will later be merged into the model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)

    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    partial_listens_df = get_listens_for_training_model_window(to_date, from_date,
                                                               path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'.format(
        from_date=from_date, to_date=to_date, listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = mapping_utils.get_unique_rows_from_mapping(df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, msid_mbid_mapping_df,
                                                              paths["mapped_listens"])
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata, paths["users"])

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata, paths["recordings"])

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, minimum_listens_threshold, metadata,
                       paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_musicbrainz_data_itr = get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df)

    messages = prepare_messages(missing_musicbrainz_data_itr, from_date, to_date, ti)

    return messages
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except AttributeError as err:
        logging.error('Cannot initialize Spark Session: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while initializing Spark session: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz', 'recommendation-engine', 'dataframes')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(path + '/playcounts_df.parquet')
        users_df = listenbrainz_spark.sql_context.read.parquet(path + '/users_df.parquet')
        recordings_df = listenbrainz_spark.sql_context.read.parquet(path + '/recordings_df.parquet')
    except AnalysisException as err:
        logging.error('Cannot read parquet files from HDFS: {} \n {}'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while fetching parquets: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    time_info = defaultdict(dict)
    time_info['dataframes'] = '{:.2f}'.format((time() - ti) / 60)

    try:
        users_df.createOrReplaceTempView('user')
        playcounts_df.createOrReplaceTempView('playcount')
    except AnalysisException as err:
        logging.error('Cannot register dataframes: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while registering dataframes: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    t0 = time()
    all_recordings = recordings_df.select('recording_id')
    all_recordings.persist()
    all_recordings_count = '{:,}'.format(all_recordings.count())
    time_info['all_recordings'] = '{:.2f}'.format((time() - t0) / 60)

    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']
    best_model_path = os.path.join('/', 'data', 'listenbrainz', 'recommendation-engine', 'best-model',
                                   '{}'.format(best_model_id))

    logging.info('Loading model...')
    try:
        t0 = time()
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
        time_info['load_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error('Unable to load model: {} \n {}. Aborting...'.format(type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred while loading model: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    ts = time()
    with open(path) as f:
        recommendation_metadata = json.load(f)

    recommendations = defaultdict(dict)
    for user_name in recommendation_metadata['user_name']:
        try:
            t0 = time()
            user_recommendations = recommend_user(user_name, model, all_recordings, recordings_df)
            user_recommendations['total-time'] = '{:.2f}'.format((time() - t0) / 60)
            logging.info('Recommendations for "{}" generated'.format(user_name))
            recommendations[user_name] = user_recommendations
        except TypeError as err:
            logging.error('{}: Invalid user name. User "{}" does not exist.'.format(type(err).__name__, user_name))
        except Exception as err:
            logging.error('Recommendations for "{}" not generated.\n{}'.format(user_name, str(err)), exc_info=True)
    time_info['total_recommendation_time'] = '{:.2f}'.format((time() - ts) / 3600)

    all_recordings.unpersist()

    date = datetime.utcnow().strftime('%Y-%m-%d')
    recommendation_html = 'Recommendation-{}-{}.html'.format(uuid.uuid4(), date)
    column = ('Track Name', 'Recording MSID', 'Artist Name', 'Artist MSID', 'Release Name', 'Release MSID')
    context = {
        'recommendations': recommendations,
        'column': column,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
        'time': time_info,
        'best_model': best_model_id,
        'all_recordings_count': all_recordings_count,
    }
    utils.save_html(recommendation_html, context, 'recommend.html')