Example #1
    def process_json_listens(self, filename, data_dir, tmp_hdfs_path, append,
                             schema):
        """ Process a file containing listens from the ListenBrainz dump and add listens to
            appropriate dataframes.

            Args:
                filename (str): Name of the JSON file to process.
                data_dir (str): HDFS directory to save the listens to as parquet.
                tmp_hdfs_path (str): HDFS path where the listens JSON has been uploaded.
                append (bool): If True, append to the existing parquet file instead of overwriting it.
                schema: Spark schema of the listens.
        """
        start_time = time.monotonic()
        df = utils.read_json(tmp_hdfs_path, schema=schema)

        # Invalid listens are dumped to 'invalid.json'; everything else is
        # laid out as <year>/<month>.json, so mirror that layout in the
        # parquet destination path.
        if filename.split('/')[-1] == 'invalid.json':
            dest_path = os.path.join(data_dir, 'invalid.parquet')
        else:
            year = filename.split('/')[-2]
            # Strip the trailing '.json' to get the month.
            month = filename.split('/')[-1][:-5]
            dest_path = os.path.join(data_dir, year,
                                     '{}.parquet'.format(month))

        logger.info("Uploading to {}...".format(dest_path))
        # Append to an existing parquet file only when asked to and when the
        # destination already exists; otherwise (over)write it.
        if append and utils.path_exists(dest_path):
            utils.save_parquet(df, dest_path, mode="append")
        else:
            utils.save_parquet(df, dest_path, mode="overwrite")
        logger.info(
            "File processed in {:.2f} seconds!".format(time.monotonic() -
                                                       start_time))
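
All three examples lean on a small utils module for HDFS I/O that the dump does not show. Below is a minimal PySpark sketch of what those helpers could look like; the module-level session and the exact signatures are assumptions for illustration, not the project's actual implementation.

from pyspark.sql import SparkSession

# Assumed module-level SparkSession; the real project wires this up elsewhere.
session = SparkSession.builder.appName("listens_loader").getOrCreate()

def read_json(hdfs_path, schema=None):
    # Read newline-delimited JSON into a dataframe, enforcing the given
    # schema rather than letting Spark infer one.
    return session.read.json(hdfs_path, schema=schema)

def save_parquet(df, path, mode="overwrite"):
    # Write the dataframe to HDFS as parquet; mode="append" adds rows to
    # an existing file, mode="overwrite" replaces it.
    df.write.format("parquet").save(path, mode=mode)

def path_exists(path):
    # Ask the Hadoop FileSystem (through the JVM gateway) whether the
    # path already exists in HDFS.
    hadoop = session._jvm.org.apache.hadoop.fs
    fs = hadoop.FileSystem.get(session._jsc.hadoopConfiguration())
    return fs.exists(hadoop.Path(path))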
Example #2
    def process_json_listens(self, filename, data_dir, tmp_hdfs_path, schema):
        """ Process a file containing listens from the ListenBrainz dump and add listens to
            appropriate dataframes.

            Args:
                filename (str): Name of the JSON file to process.
                data_dir (str): HDFS directory to save the listens to as parquet.
                tmp_hdfs_path (str): HDFS path where the listens JSON has been uploaded.
                schema: Spark schema of the listens.
        """
        start_time = time.time()
        df = utils.read_json(tmp_hdfs_path, schema=schema)
        current_app.logger.info("Processing {} listens...".format(df.count()))

        if filename.split('/')[-1] == 'invalid.json':
            dest_path = os.path.join(data_dir, 'invalid.parquet')
        else:
            year = filename.split('/')[-2]
            month = filename.split('/')[-1][:-5]
            dest_path = os.path.join(data_dir, year,
                                     '{}.parquet'.format(month))

        current_app.logger.info("Uploading to {}...".format(dest_path))
        utils.save_parquet(df, dest_path)
        current_app.logger.info(
            "File processed in {:.2f} seconds!".format(time.time() -
                                                       start_time))
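
The year/month bookkeeping above is easiest to follow with a concrete path. A quick illustration using a made-up filename and data directory (the real dump layout may differ):

import os

filename = 'listens/2019/01.json'   # hypothetical dump file path
data_dir = '/data/listenbrainz'     # hypothetical HDFS data directory

year = filename.split('/')[-2]          # '2019'
month = filename.split('/')[-1][:-5]    # '01' ('.json' stripped)
print(os.path.join(data_dir, year, '{}.parquet'.format(month)))
# -> /data/listenbrainz/2019/01.parquet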
Example #3
    # '_' and '__' are placeholders for positional arguments this handler
    # receives but does not use.
    def process_json(self, _, dest_path, tmp_hdfs_path, __, schema):
        """ Read JSON from HDFS as a dataframe and upload to
            HDFS as a parquet.

            Args:
                dest_path (str): HDFS path to upload the JSON to as parquet.
                tmp_hdfs_path (str): HDFS path where the JSON has been uploaded.
                schema: Spark schema of the JSON data.
        """
        start_time = time.monotonic()
        df = utils.read_json(tmp_hdfs_path, schema=schema)
        logger.info("Processing {} rows...".format(df.count()))

        logger.info("Uploading to {}...".format(dest_path))
        utils.save_parquet(df, dest_path)
        logger.info("File processed in {:.2f} seconds!".format(time.monotonic() - start_time))