def update_status_in_hbase(
        df: DataFrame, database_name: str, rowkey: str,
        offsetfile: str, timestamp: int):
    """Update the status column in HBase

    Parameters
    ----------
    df: DataFrame
        A Spark DataFrame created after reading the database (HBase)
    database_name: str
        Name of the database
    rowkey: str
        Name of the rowkey in the HBase catalog
    offsetfile: str
        Path of the offset file for distribution
    timestamp: int
        Timestamp up to which the science db has been scanned and distributed
    """
    df = df.select(rowkey, "status")
    df = df.withColumn("status", lit("distributed"))

    update_catalog = construct_hbase_catalog_from_flatten_schema(
        df.schema, database_name, rowkey)

    df.write\
        .option("catalog", update_catalog)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # write offset (timestamp) to file
    with open(offsetfile, 'w') as f:
        string = "distributed till, {}".format(timestamp)
        f.write(string)
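# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): how a downstream consumer
# could read back the offset file written by `update_status_in_hbase` above.
# `read_distribution_offset` is a hypothetical helper; the one-line file
# format "distributed till, <timestamp>" is the only assumption.
def read_distribution_offset(offsetfile: str) -> int:
    """Return the last distributed timestamp stored in the offset file."""
    with open(offsetfile, 'r') as f:
        # content looks like: "distributed till, 1621900000"
        content = f.read().strip()
    return int(content.split(',')[-1].strip())
# Example:
#   update_status_in_hbase(df, 'test_db', 'objectId', '/tmp/offset.txt', 1621900000)
#   read_distribution_offset('/tmp/offset.txt')  # -> 1621900000
# ---------------------------------------------------------------------------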
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="statistics_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Statistics for {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    df_raw = spark.read.format('parquet').load(input_raw)
    df_sci = spark.read.format('parquet').load(input_science)

    df_sci = df_sci.cache()

    # Number of alerts
    n_raw_alert = df_raw.count()
    n_sci_alert = df_sci.count()

    out_dic = {}
    out_dic['raw'] = n_raw_alert
    out_dic['sci'] = n_sci_alert

    # Number of alerts with a match in SIMBAD
    n_simbad = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'] != 'Unknown')\
        .count()

    out_dic['simbad_tot'] = n_simbad

    # Alerts with a close-by candidate host galaxy
    list_simbad_galaxies = [
        "galaxy",
        "Galaxy",
        "EmG",
        "Seyfert",
        "Seyfert_1",
        "Seyfert_2",
        "BlueCompG",
        "StarburstG",
        "LSB_G",
        "HII_G",
        "High_z_G",
        "GinPair",
        "GinGroup",
        "BClG",
        "GinCl",
        "PartofG",
    ]

    n_simbad_gal = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'].isin(list_simbad_galaxies))\
        .count()

    out_dic['simbad_gal'] = n_simbad_gal

    df_class = df_sci.withColumn(
        'class',
        extract_fink_classification(
            df_sci['cdsxmatch'],
            df_sci['roid'],
            df_sci['mulens'],
            df_sci['snn_snia_vs_nonia'],
            df_sci['snn_sn_vs_all'],
            df_sci['rf_snia_vs_nonia'],
            df_sci['candidate.ndethist'],
            df_sci['candidate.drb'],
            df_sci['candidate.classtar'],
            df_sci['candidate.jd'],
            df_sci['candidate.jdstarthist'],
            df_sci['rf_kn_vs_nonkn'],
            df_sci['tracklet']
        )
    )

    out_class = df_class.groupBy('class').count().collect()
    out_class_ = [o.asDict() for o in out_class]
    out_class_ = [list(o.values()) for o in out_class_]
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # Number of fields
    n_field = df_sci.select('candidate.field').distinct().count()

    out_dic['fields'] = n_field

    # Number of measurements per band
    n_g = df_sci.select('candidate.fid').filter('fid == 1').count()
    n_r = df_sci.select('candidate.fid').filter('fid == 2').count()

    out_dic['n_g'] = n_g
    out_dic['n_r'] = n_r

    # Number of exposures
    n_exp = df_sci.select('candidate.jd').distinct().count()

    out_dic['exposures'] = n_exp

    out_dic['night'] = 'ztf_{}'.format(args.night)

    # make a Spark DataFrame
    pdf = pd.DataFrame([out_dic])
    df_hbase = spark.createDataFrame(pdf)

    # rowkey is the night YYYYMMDD
    index_row_key_name = 'night'

    # Columns to use
    cols_basic = [
        'raw',
        'sci',
        'night',
        'n_g',
        'n_r',
        'exposures',
        'fields'
    ]

    cols_class_ = np.transpose(out_class_)[0]
    cols_class = np.concatenate((cols_class_, ['simbad_tot', 'simbad_gal']))

    # column families
    cf = {i: 'basic' for i in df_hbase.select(*cols_basic).columns}
    cf.update({i: 'class' for i in df_hbase.select(*cols_class).columns})

    # construct the HBase catalog for the index table
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema,
        'statistics_class',
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_hbase.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - in-place replacement
    schema_row_key_name = 'schema_version'
    df_hbase = df_hbase.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_hbase_schema = construct_schema_row(
        df_hbase,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the HBase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_hbase_schema.schema,
        'statistics_class',
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_hbase_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
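# ---------------------------------------------------------------------------
# Illustrative sketch (toy data, hypothetical helper): how the rows returned
# by `groupBy('class').count().collect()` are flattened into the statistics
# dictionary above, and how np.transpose recovers the class names used for
# the 'class' column family. Class names and counts below are made up.
def _example_flatten_class_counts():
    import numpy as np

    # Each collected Row, once passed through asDict()/values(), becomes a
    # [class_name, count] pair
    out_class_ = [['SN candidate', 12], ['Solar System MPC', 3], ['Unknown', 250]]

    out_dic = {'raw': 300, 'sci': 265, 'night': 'ztf_20211103'}
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # First row of the transposed array = class names (numpy casts the counts
    # to strings too, which is harmless since only the names are kept)
    cols_class_ = np.transpose(out_class_)[0]
    return out_dic, list(cols_class_)
# ---------------------------------------------------------------------------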
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist',
        'tooflag', 'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps',
        'diffmaglim', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos', 'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with a likely counterpart in the MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table,
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'

        # explode the history arrays
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits (no detection, i.e. null magpsf)
        df_index = df_ex.filter(df_ex['magpsf'].isNull())
        # drop columns that are entirely null for upper limits
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table,
        # but we keep only valid measurements from the history.
        index_row_key_name = 'objectId_jd'

        # explode the history arrays
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=float) * u.degree,
                dec=np.array(dec, dtype=float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=float) * u.degree,
                dec=np.array(dec2, dtype=float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')
        # trigger the cache - note the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the HBase catalog for the index table
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - in-place replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )
    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the HBase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
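# ---------------------------------------------------------------------------
# Standalone sketch of the positional crossmatch logic used in the 'tns'
# branch above (toy coordinates, hypothetical example; astropy and numpy are
# the only dependencies). It shows why matches are kept only below 1.5 arcsec.
def _example_tns_crossmatch():
    import numpy as np
    import astropy.units as u
    from astropy.coordinates import SkyCoord

    # ZTF alerts and TNS entries as (ra, dec) in degrees
    catalog_ztf = SkyCoord(
        ra=np.array([10.0, 25.0]) * u.degree,
        dec=np.array([-5.0, 12.0]) * u.degree)
    catalog_tns = SkyCoord(
        ra=np.array([10.0002, 180.0]) * u.degree,
        dec=np.array([-5.0001, 45.0]) * u.degree)

    # For each ZTF alert, find the closest TNS entry
    idx, d2d, _ = catalog_ztf.match_to_catalog_sky(catalog_tns)

    # Keep only matches closer than 1.5 arcsecond, as in the pandas_udf above
    sep_constraint = d2d.degree < 1.5 / 3600
    return idx, sep_constraint  # -> array([0, 0]), array([ True, False])
# ---------------------------------------------------------------------------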
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="science_archival_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, row_key_name = attach_rowkey(df)

    # construct the HBase catalog
    hbcatalog = construct_hbase_catalog_from_flatten_schema(
        df.schema,
        args.science_db_name,
        rowkeyname=row_key_name,
        cf=cf)

    # Save the catalog on disk (local)
    with open(args.science_db_catalog, 'w') as json_file:
        json.dump(hbcatalog, json_file)

    if args.save_science_db_catalog_only:
        # Print for visual inspection
        print(hbcatalog)
    else:
        # Push the data using the shc connector
        df.write\
            .options(catalog=hbcatalog, newtable=50)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

        # Construct the schema row - in-place replacement
        schema_row_key_name = 'schema_version'
        df = df.withColumnRenamed(row_key_name, schema_row_key_name)
        df_schema = construct_schema_row(
            df,
            rowkeyname=schema_row_key_name,
            version='schema_{}_{}'.format(fbvsn, fsvsn))

        # construct the HBase catalog for the schema
        hbcatalog_schema = construct_hbase_catalog_from_flatten_schema(
            df_schema.schema,
            args.science_db_name,
            rowkeyname=schema_row_key_name,
            cf=cf)

        # Save the catalog on disk (local)
        catname = args.science_db_catalog.replace('.json', '_schema_row.json')
        with open(catname, 'w') as json_file:
            json.dump(hbcatalog_schema, json_file)

        # Push the data using the shc connector
        df_schema.write\
            .options(catalog=hbcatalog_schema, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()
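# ---------------------------------------------------------------------------
# Minimal local illustration (toy data; assumes the rowkey produced by
# `attach_rowkey` is the concatenation objectId_jd) of how such a rowkey can
# be built with concat_ws. Requires only a local PySpark installation.
def _example_attach_rowkey():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import concat_ws

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    df = spark.createDataFrame(
        [('ZTF21abcdefg', 2459520.75), ('ZTF21hijklmn', 2459520.80)],
        ['objectId', 'jd']
    )
    # e.g. 'ZTF21abcdefg_2459520.75', unique per (object, exposure)
    return df.withColumn('objectId_jd', concat_ws('_', 'objectId', 'jd'))
# ---------------------------------------------------------------------------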
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latestfirst = False is
    # required to create a new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every `tinterval` seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete")\
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='{} seconds'.format(args.tinterval))\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
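# ---------------------------------------------------------------------------
# Toy, self-contained illustration of the foreachBatch pattern used above:
# Structured Streaming hands each micro-batch DataFrame plus an epoch id to a
# plain Python function. The built-in "rate" source replaces Kafka/HBase here;
# the names and the 10-second run time are arbitrary.
def _example_foreachbatch():
    from pyspark.sql import SparkSession, DataFrame

    spark = SparkSession.builder.master('local[2]').getOrCreate()

    def write_batch(df: DataFrame, epochid: int):
        # In the real service, this is where df.write.options(catalog=...) goes
        print('epoch {}: {} rows'.format(epochid, df.count()))

    query = spark.readStream.format('rate')\
        .option('rowsPerSecond', 5).load()\
        .writeStream\
        .outputMode('append')\
        .foreachBatch(write_batch)\
        .start()

    query.awaitTermination(10)  # let it run for ~10 seconds
    query.stop()
# ---------------------------------------------------------------------------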
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(
        name="buildSciDB", shuffle_partitions=2, log_level="ERROR")

    # FIXME!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply filters and keep only good alerts
    df_filt = df.withColumn(
        "toKeep",
        keep_alert_based_on(
            col("decoded.candidate.nbad"),
            col("decoded.candidate.rb"),
            col("decoded.candidate.magdiff")
        )
    ).filter("toKeep == true")

    # for good alerts, perform a cross-match with SIMBAD,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_filt.withColumn(
        "simbadType",
        cross_match_alerts_per_batch(
            col("decoded.objectId"),
            col("decoded.candidate.ra"),
            col("decoded.candidate.dec")
        )
    ).selectExpr(
        "decoded.*", "cast(timestamp as string) as timestamp", "simbadType")

    df_hbase = flattenstruct(df_type, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every `tinterval` seconds
    df_group = df_type.groupBy("simbadType").count()
    groupquery = df_group\
        .writeStream\
        .outputMode("complete")\
        .foreachBatch(write_to_csv)\
        .trigger(processingTime='{} seconds'.format(args.tinterval))\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        groupquery.stop()
        print("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
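# ---------------------------------------------------------------------------
# Standalone sketch of the quality-cut pattern above: a boolean column is
# produced by a UDF and then used to filter the DataFrame. The cuts below are
# toy values for illustration only; the real thresholds live in
# keep_alert_based_on.
def _example_quality_cut():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import BooleanType

    spark = SparkSession.builder.master('local[1]').getOrCreate()

    @udf(BooleanType())
    def keep_alert_toy(nbad, rb, magdiff):
        # keep alerts with no bad pixels, a decent real-bogus score,
        # and a small magnitude difference
        return (nbad == 0) and (rb >= 0.55) and (abs(magdiff) <= 0.1)

    df = spark.createDataFrame(
        [(0, 0.9, 0.05), (1, 0.9, 0.05), (0, 0.3, 0.0)],
        ['nbad', 'rb', 'magdiff']
    )
    return df.withColumn(
        'toKeep', keep_alert_toy(col('nbad'), col('rb'), col('magdiff'))
    ).filter('toKeep == true')
# ---------------------------------------------------------------------------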