Example #1
def save_training_html(time_, num_training, num_validation, num_test,
                       model_metadata, best_model_metadata, ti,
                       models_training_time):
    """ Prepare and save taraining HTML.

        Args:
            time_ (dict): Dictionary containing execution time information.
            num_training (int): Number of elements/rows in training_data.
            num_validation (int): Number of elements/rows in validation_data.
            num_test (int): Number of elements/rows in test_data.
            model_metadata (dict): Models information such as model id, error etc.
            best_model_metadata (dict): Best Model information such as model id, error etc.
            ti (float): Value of the monotonic clock when the script was run.
            models_training_time (str): Time taken to train all the models.
    """
    date = datetime.utcnow().strftime('%Y-%m-%d')
    model_html = 'Model-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'time': time_,
        'num_training': '{:,}'.format(num_training),
        'num_validation': '{:,}'.format(num_validation),
        'num_test': '{:,}'.format(num_test),
        'models': model_metadata,
        'best_model': best_model_metadata,
        'models_training_time': models_training_time,
        'total_time': '{:.2f}'.format((time.monotonic() - ti) / 3600)
    }
    save_html(model_html, context, 'model.html')
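The save_html helper called throughout these examples is not shown. Below is a minimal sketch of what it might look like, assuming Jinja2 templates in a local templates directory and an outputs directory for the rendered reports; everything beyond the call signature is an assumption.

import os
from jinja2 import Environment, FileSystemLoader

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def save_html(filename, context, template_name):
    # Render the named Jinja2 template with the given context and write
    # the result to outputs/<filename>. Template and output locations
    # are assumptions, not the project's actual layout.
    env = Environment(loader=FileSystemLoader(os.path.join(BASE_DIR, 'templates')))
    rendered = env.get_template(template_name).render(context)
    os.makedirs(os.path.join(BASE_DIR, 'outputs'), exist_ok=True)
    with open(os.path.join(BASE_DIR, 'outputs', filename), 'w') as f:
        f.write(rendered)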
Example #2
def save_candidate_html(user_data, total_time, from_date, to_date):
    """ Save user data to an HTML file.

        Args:
            user_data (dict): Top and similar artists associated with users.
            total_time (str): Time taken to generate the candidate sets.
            from_date (str): Start date of the data window.
            to_date (str): End date of the data window.
    """
    date = datetime.utcnow().strftime('%Y-%m-%d')
    candidate_html = 'Candidate-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'user_data': user_data,
        'total_time': total_time,
        'from_date': from_date,
        'to_date': to_date
    }
    save_html(candidate_html, context, 'candidate.html')
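A hypothetical invocation with illustrative values only; the shape of user_data follows the depiction in Example #4's docstring.

# Illustrative data; artist names and dates are placeholders.
user_data = {
    'rob': {
        'artists': {
            'Portishead': ['Massive Attack', 'Tricky'],
        },
        'time': '0.42',
    },
}
save_candidate_html(user_data, total_time='12.50',
                    from_date='2019-01-01', to_date='2019-01-07')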
Example #3
def save_dataframe_html(users_df_time, recordings_df_time, playcounts_df_time,
                        total_time):
    """ Prepare and save dataframe HTML.

        Args:
            users_df_time (str): Time taken to prepare and save users dataframe.
            recordings_df_time (str): Time taken to prepare and save recordings dataframe.
            playcounts_df_time (str): Time taken to prepare and save playcounts dataframe.
            total_time (str): Time taken to execute the script.
    """
    date = datetime.utcnow().strftime('%Y-%m-%d')
    queries_html = 'Queries-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'users_df_time': users_df_time,
        'recordings_df_time': recordings_df_time,
        'playcounts_df_time': playcounts_df_time,
        'total_time': total_time
    }
    save_html(queries_html, context, 'queries.html')
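A hypothetical call; the timing strings assume the '{:.2f}'.format((time() - t0) / 60) pattern used in Example #6.

save_dataframe_html(users_df_time='1.25', recordings_df_time='3.40',
                    playcounts_df_time='7.82', total_time='12.47')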
Example #4
def save_candidate_html(user_data):
    """ Save user data to an HTML file.

        Args:
            user_data (dict): Dictionary can be depicted as:
                {
                    'user_name 1': {
                        'artists': {
                            'top_artists 1': ['similar_artist 1', 'similar_artist 2' ... 'similar_artist x'],
                            ...
                            'top_artists y': ['similar_artist 1', 'similar_artist 2' ... 'similar_artist x'],
                        },
                        'time': 'xxx'
                    },
                }
    """
    date = datetime.utcnow().strftime('%Y-%m-%d')
    candidate_html = 'Candidate-{}-{}.html'.format(uuid.uuid4(), date)
    context = {'user_data': user_data}
    save_html(candidate_html, context, 'candidate.html')
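The nested mapping described in the docstring can be assembled incrementally; a small sketch with illustrative artist names, using collections.defaultdict.

from collections import defaultdict

# Build the nested user_data structure described in the docstring above.
user_data = defaultdict(lambda: {'artists': {}, 'time': None})
user_data['rob']['artists']['Portishead'] = ['Massive Attack', 'Tricky']
user_data['rob']['time'] = '0.42'

save_candidate_html(dict(user_data))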
Example #5
def get_recommendation_html(recommendations, time_, best_model_id, ti):
    """ Prepare and save recommendation HTML.

        Args:
            recommendations (dict): Dictionary can be depicted as:
                {
                    'user_name 1': {
                        'time': 'xx.xx',
                        'top_artists_recordings': [
                            ('xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx'),
                            ...
                            ('xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx')
                        ],
                        'similar_artists_recordings': [
                            ('xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx'),
                            ...
                            ('xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx')
                        ]
                    }
                }
            time_ (dict): Dictionary containing execution time information, can be depicted as:
                {
                    'load_model': '3.09',
                    ...
                }
            best_model_id (str): Id of the model used for generating recommendations.
            ti (float): Seconds since epoch when the script was run.
    """
    date = datetime.utcnow().strftime('%Y-%m-%d')
    recommendation_html = 'Recommendation-{}-{}.html'.format(
        uuid.uuid4(), date)
    column = ('Track Name', 'Recording MSID', 'Artist Name', 'Artist MSID',
              'Release Name', 'Release MSID')
    context = {
        'recommendations': recommendations,
        'column': column,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
        'time': time_,
        'best_model': best_model_id,
    }
    save_html(recommendation_html, context, 'recommend.html')
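A hypothetical call with one user and six-field rows matching the column tuple above; the MSIDs and the model id are placeholders, and ti is the epoch timestamp taken at script start.

from time import time

ti = time()
recommendations = {
    'rob': {
        'time': '0.42',
        'top_artists_recordings': [
            ('Glory Box', 'recording-msid', 'Portishead', 'artist-msid',
             'Dummy', 'release-msid'),
        ],
        'similar_artists_recordings': [],
    },
}
get_recommendation_html(recommendations, {'load_model': '3.09'},
                        'model-id-placeholder', ti)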
Example #6
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Create_Dataframe')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    df = None
    missing_parquets = []
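    # NOTE: this nested loop applies the same month range to every year, so a
    # window that crosses a year boundary (e.g. 2018-11 to 2019-02) yields no
    # months at all; see the iterator sketch after this example.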
    for y in range(config.STARTING_YEAR, config.ENDING_YEAR + 1):
        for m in range(config.STARTING_MONTH, config.ENDING_MONTH + 1):
            try:
                month = listenbrainz_spark.sql_context.read.parquet(
                    '{}/data/listenbrainz/{}/{}.parquet'.format(
                        config.HDFS_CLUSTER_URI, y, m))
                df = df.union(month) if df is not None else month
            except AnalysisException as err:
                missing_parquets.append('{}-{:02d}'.format(y, m))
                logging.error(
                    'Cannot read parquet files from HDFS: {} \n {}'.format(
                        type(err).__name__, str(err)))
                continue
            except Exception as err:
                logging.error(
                    'An error occurred while fetching "/data/listenbrainz/{}/{}.parquet": {} \n {}. Aborting...'
                    .format(y, m, type(err).__name__, str(err)),
                    exc_info=True)
                sys.exit(-1)

    if df is None:
        raise SystemExit('Parquet files from {}-{:02d} to {}-{:02d} are empty'.format(
            config.STARTING_YEAR, config.STARTING_MONTH,
            config.ENDING_YEAR, config.ENDING_MONTH))

    logging.info('Registering Dataframe...')
    table = 'df_to_train_{}'.format(
        datetime.strftime(datetime.utcnow(), '%Y_%m_%d'))
    try:
        df.createOrReplaceTempView(table)
    except AnalysisException as err:
        logging.error(
            'Cannot register dataframe: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while registering dataframe: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    t = '{:.2f}'.format(time() - ti)
    logging.info(
        'Files fetched from HDFS and dataframe registered in {}s'.format(t))

    dest_path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz',
                             'recommendation-engine', 'dataframes')

    logging.info('Preparing users data and saving to HDFS...')
    try:
        t0 = time()
        users_df = prepare_user_data(table)
        users_df.write.format('parquet').save(dest_path + '/users_df.parquet',
                                              mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute users query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse users query plan: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            "Failed to parse SQL command: {} \n {}. Aborting...".format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing users query: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info('Preparing recordings data and saving to HDFS...')
    try:
        t0 = time()
        recordings_df = prepare_recording_data(table)
        recordings_df.write.format('parquet').save(dest_path +
                                                   '/recordings_df.parquet',
                                                   mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute recordings query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse recordings query plan: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            'Failed to parse SQL command: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing recordings query: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    try:
        t0 = time()
        listens_df = prepare_listen_data(table)
        playcounts_df = get_playcounts_data(listens_df, users_df,
                                            recordings_df)
        playcounts_df.write.format('parquet').save(dest_path +
                                                   '/playcounts_df.parquet',
                                                   mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute playcounts query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse playcounts query plan: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            'Failed to parse SQL command: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing playcounts query: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)

    total_time = '{:.2f}'.format((time() - ti) / 60)
    lb_dump_time_window = (
        '{}-{:02d}'.format(config.STARTING_YEAR, config.STARTING_MONTH),
        '{}-{:02d}'.format(config.ENDING_YEAR, config.ENDING_MONTH))
    date = datetime.utcnow().strftime('%Y-%m-%d')
    queries_html = 'Queries-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'users_df_time': users_df_time,
        'recordings_df_time': recordings_df_time,
        'playcounts_df_time': playcounts_df_time,
        'lb_dump_time_window': lb_dump_time_window,
        'missing_parquets': missing_parquets,
        'total_time': total_time
    }
    utils.save_html(queries_html, context, 'queries.html')
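As noted in the comment inside the loop, the nested year/month iteration above applies the same month range to every year, so a window such as 2018-11 through 2019-02 produces no months at all. A sketch of a generator that walks an arbitrary year-month window instead; iter_months is a hypothetical helper, not part of the original script.

def iter_months(start_year, start_month, end_year, end_month):
    # Yield (year, month) pairs from start to end inclusive, correctly
    # crossing year boundaries.
    y, m = start_year, start_month
    while (y, m) <= (end_year, end_month):
        yield y, m
        m += 1
        if m > 12:
            y, m = y + 1, 1

# list(iter_months(2018, 11, 2019, 2))
# -> [(2018, 11), (2018, 12), (2019, 1), (2019, 2)]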
Example #7
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Train_Models')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join('/', 'data', 'listenbrainz',
                            'recommendation-engine', 'dataframes',
                            'playcounts_df.parquet')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(
            config.HDFS_CLUSTER_URI + path)
    except AnalysisException as err:
        logging.error('Cannot read parquet file from HDFS: {} \n {}'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while fetching parquet: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    time_info = {}
    time_info['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_info['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # RDDs used in the iterative model training process are cached to improve performance.
    # Caching very large RDDs may cause an out-of-memory exception.
    training_data.persist()
    validation_data.persist()
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()
    logging.info('Training models...')

    try:
        t0 = time()
        model, model_metadata, best_model_metadata = train(
            training_data, validation_data, num_validation, config.RANKS,
            config.LAMBDAS, config.ITERATIONS)
        models_training_time = '{:.2f}'.format((time() - t0) / 3600)
    except Py4JJavaError as err:
        logging.error('Unable to train models: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while training models: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    training_data.unpersist()
    validation_data.unpersist()

    logging.info('Saving model...')
    try:
        t0 = time()
        path = os.path.join('/', 'data', 'listenbrainz',
                            'recommendation-engine', 'best-model',
                            '{}'.format(best_model_metadata['model_id']))
        model.model.save(listenbrainz_spark.context,
                         config.HDFS_CLUSTER_URI + path)
        time_info['save_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error("Unable to save model: {} \n {}. Aborting...".format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while saving model: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    date = datetime.utcnow().strftime('%Y-%m-%d')
    model_html = 'Model-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'time': time_info,
        'num_training': '{:,}'.format(num_training),
        'num_validation': '{:,}'.format(num_validation),
        'num_test': '{:,}'.format(num_test),
        'models': model_metadata,
        'best_model': best_model_metadata,
        'models_training_time': models_training_time,
        'total_time': '{:.2f}'.format((time() - ti) / 3600)
    }

    utils.save_html(model_html, context, 'model.html')
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata[
            'model_id']

    with open(path, 'w') as f:
        json.dump(recommendation_metadata, f)
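Examples #7 and #8 both read recommendation-metadata.json from the script's directory. A plausible minimal shape, inferred only from the keys the two scripts access (best_model_id and user_name); the values below are placeholders.

import json

metadata = {
    'best_model_id': 'replace-with-model-id',  # overwritten by Example #7
    'user_name': ['rob', 'alice'],             # iterated over in Example #8
}
with open('recommendation-metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)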
Example #8
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz',
                            'recommendation-engine', 'dataframes')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/playcounts_df.parquet')
        users_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/users_df.parquet')
        recordings_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/recordings_df.parquet')
    except AnalysisException as err:
        logging.error('Cannot read parquet files from HDFS: {} \n {}'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while fetching parquets: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    time_info = {}
    time_info['dataframes'] = '{:.2f}'.format((time() - ti) / 60)
    try:
        users_df.createOrReplaceTempView('user')
        playcounts_df.createOrReplaceTempView('playcount')
    except AnalysisException as err:
        logging.error(
            'Cannot register dataframes: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while registering dataframes: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    t0 = time()
    all_recordings = recordings_df.select('recording_id')
    all_recordings.persist()
    all_recordings_count = '{:,}'.format(all_recordings.count())
    time_info['all_recordings'] = '{:.2f}'.format((time() - t0) / 60)

    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']

    best_model_path = os.path.join('/', 'data', 'listenbrainz',
                                   'recommendation-engine', 'best-model',
                                   '{}'.format(best_model_id))

    logging.info('Loading model...')
    try:
        t0 = time()
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
        time_info['load_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error('Unable to load model: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while loading model: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    ts = time()
    with open(path) as f:
        recommendation_metadata = json.load(f)
        recommendations = defaultdict(dict)
        for user_name in recommendation_metadata['user_name']:
            try:
                t0 = time()
                user_recommendations = recommend_user(user_name, model,
                                                      all_recordings,
                                                      recordings_df)
                user_recommendations['total-time'] = '{:.2f}'.format(
                    (time() - t0) / 60)
                logging.info(
                    'Recommendations for "{}" generated'.format(user_name))
                recommendations[user_name] = user_recommendations
            except TypeError as err:
                logging.error(
                    '{}: Invalid user name. User "{}" does not exist.'.format(
                        type(err).__name__, user_name))
            except Exception as err:
                logging.error(
                    'Recommendations for "{}" not generated.\n{}'.format(
                        user_name, str(err)),
                    exc_info=True)
    time_info['total_recommendation_time'] = '{:.2f}'.format(
        (time() - ts) / 3600)

    all_recordings.unpersist()

    date = datetime.utcnow().strftime('%Y-%m-%d')
    recommendation_html = 'Recommendation-{}-{}.html'.format(
        uuid.uuid4(), date)
    column = ('Track Name', 'Recording MSID', 'Artist Name', 'Artist MSID',
              'Release Name', 'Release MSID')
    context = {
        'recommendations': recommendations,
        'column': column,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
        'time': time_info,
        'best_model': best_model_id,
        'all_recordings_count': all_recordings_count,
    }
    utils.save_html(recommendation_html, context, 'recommend.html')
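The load_model helper is not shown. Since Example #7 saves the trained model with model.model.save(listenbrainz_spark.context, path), it is plausibly a Spark MLlib ALS MatrixFactorizationModel; a sketch under that assumption.

from pyspark.mllib.recommendation import MatrixFactorizationModel

def load_model(path):
    # Assumption: the model was saved via MatrixFactorizationModel.save()
    # and listenbrainz_spark.context is the active SparkContext.
    return MatrixFactorizationModel.load(listenbrainz_spark.context, path)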