es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(
        c_options)

    # Setup SparkContext
    sc = SparkContext(appName="getimages-" + ingestion_id + dev_release_suffix)
    conf = SparkConf()
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    # Setup HBase managers
    join_columns_list = [':'.join(x) for x in fields_list]
    get_create_table(c_options.tab_sha1_infos_name, c_options)
    hbase_fullhost = c_options.hbase_host + ':' + str(c_options.hbase_port)
    hbase_man_sha1infos_join = HbaseManager(sc,
                                            conf,
                                            hbase_fullhost,
                                            c_options.tab_sha1_infos_name,
                                            columns_list=join_columns_list)
    hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost,
                                           c_options.tab_sha1_infos_name)
    get_create_table(c_options.tab_update_name, c_options)
    hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost,
                                        c_options.tab_update_name)

    # Setup ES manager
    es_man = ES(sc, conf, c_options.es_index, c_options.es_domain,
                c_options.es_host, c_options.es_port, c_options.es_user,
                c_options.es_pass)
    es_man.set_output_json()
    es_man.set_read_metadata()
# Example 2
        print inst
        parser.print_help()
    
    es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)


    # Setup SparkContext    
    sc = SparkContext(appName="extract-features-"+ingestion_id+job_suffix)
    sc.addPyFile('hdfs://memex/user/skaraman/extract-features/network.py')
    sc.addPyFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.py')
    sc.addFile('hdfs://memex/user/skaraman/extract-features/imagenet_mean.npy')
    sc.addFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.npy')
    conf = SparkConf()
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
    
    # Setup HBase managers
    # just to be sure we will be able to write out to the table
    get_create_table(c_options.tab_sha1_infos_name, c_options)
    get_create_table(c_options.tab_update_name, c_options)
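    # get_create_table is defined elsewhere in this script; the sketch below is a
    # minimal, hypothetical equivalent using happybase (the helper name, the single
    # 'info' column family and connecting without an explicit Thrift port are
    # assumptions, not the actual implementation):
    def ensure_table_exists(tab_name, options, families=None):
        # create the table if it does not exist yet, so later writes cannot fail
        import happybase
        families = families or {'info': dict()}
        conn = happybase.Connection(options.hbase_host)
        if tab_name not in conn.tables():
            conn.create_table(tab_name, families)
        conn.close()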
    # hbase managers
    hbase_fullhost = c_options.hbase_host+':'+str(c_options.hbase_port)
    hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_sha1_infos_name)
    hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_update_name)
    
    # Run extraction
    print "[START] Starting extracting features for ingestion {}".format(ingestion_id)
    run_extraction(hbase_man_sha1infos_out, hbase_man_update_out, ingestion_id, c_options)
    print "[DONE] Extracted features for ingestion {} in {}s.".format(ingestion_id, time.time() - start_time)


def check_updates(hbase_man_in):
    in_rdd = hbase_man_in.read_hbase_table()
    nb_updates = in_rdd.count()
    updates_notprocessed_rdd = in_rdd.filter(update_not_processed)
    nb_updates_notprocessed = updates_notprocessed_rdd.count()
    print('We have {} updates not processed out of {}.'.format(
        nb_updates_notprocessed, nb_updates))
    print("[check_updates] DONE.")


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    #job_conf = json.load(open("job_conf.json","rt"))
    job_conf = json.load(open("job_conf_dev.json", "rt"))
    print job_conf
    tab_updates_name = job_conf["tab_updates_name"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='check_updates_' + tab_updates_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    # read rows starting from 'index_update_' in 'tab_updates_name'
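    # ('~' is 0x7E, which sorts after the letters, digits and '_' used in these
    # row keys, so the start/end pair below covers every key with that prefix)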
    hbase_man_in = HbaseManager(sc,
                                conf,
                                hbase_host,
                                tab_updates_name,
                                row_start='index_update_',
                                row_end='index_update_~')
    check_updates(hbase_man_in)
    print query
    es_rdd = es_man.es2rdd(query)
    images_hb_rdd = es_rdd.flatMap(lambda x: create_images_tuple(x))
    hbase_man.rdd2hbase(images_hb_rdd)


if __name__ == '__main__':
    # Read job_conf
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    # Set parameters job_conf
    tab_name = job_conf["tab_name"]
    hbase_host = job_conf["hbase_host"]
    new_crawler = job_conf["new_crawler"]
    es_index = job_conf["es_index"]
    es_domain = job_conf["es_domain"]
    es_host = job_conf["es_host"]
    es_port = job_conf["es_port"]
    es_user = job_conf["es_user"]
    es_pass = job_conf["es_pass"]
    # Update fields based on new_crawler value
    set_fields_filter(new_crawler)
    # Start job
    sc = SparkContext(appName=tab_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    es_man = ES(sc, conf, es_index, es_domain, es_host, es_port, es_user,
                es_pass)
    hbase_man = HbaseManager(sc, conf, hbase_host, tab_name)
    move_data(es_man, hbase_man)
    return kv


def count_dups(hbase_man_timestamp, outfilename):
    in_rdd = hbase_man_timestamp.read_hbase_table()
    count_rdd = in_rdd.map(lambda x: split_ts_cdrid_rowkey(x))
    # sum counts per key, sort by count in descending order, then format each
    # entry as "rowkey:count"
    summary_rdd = (count_rdd.reduceByKey(lambda x, y: x + y)
                   .map(lambda (x, y): (y, x))
                   .sortByKey(False, 1)
                   .map(lambda (x, y): (y, x))
                   .map(lambda x: x[0] + ":" + str(x[1])))
    print summary_rdd.count()
    summary_rdd.saveAsTextFile(outfilename)


if __name__ == '__main__':
    # Read job_conf
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf

    # Set parameters job_conf
    tab_timestamp_name = job_conf["tab_timestamp_name"]
    hbase_host = job_conf["hbase_host"]
    outfilename = job_conf["outfilename"]

    # Start job
    sc = SparkContext(appName=tab_timestamp_name + "_count_dups")
    conf = SparkConf()
    hbase_man_timestamp = HbaseManager(sc, conf, hbase_host,
                                       tab_timestamp_name)
    count_dups(hbase_man_timestamp, outfilename)
# Example 6
    # escorts_images_sha1_infos_from_ts_subsampled_newformat => ht_images_infos_merged_subsampled
    # discard ad:*. ext:sbcmdline (at least for real transform). just do not put them in mappings
    # mappings should be an array of arrays like:
    # ["ext:dlib*", "data:dlib*"]
    # ["ext:sbpycaffe*", "data:sbpycaffe*"]
    # ["info:s3_url", "data:s3_url"]
    # Could be a parameter in conf
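    # A minimal sketch of how such mappings could be expressed and applied; the
    # names below (EXAMPLE_MAPPINGS, apply_mappings) are illustrative assumptions,
    # not part of the actual transform job:
    EXAMPLE_MAPPINGS = [["ext:dlib*", "data:dlib*"],
                        ["ext:sbpycaffe*", "data:sbpycaffe*"],
                        ["info:s3_url", "data:s3_url"]]

    def apply_mappings(columns, mappings):
        # 'columns' maps "family:qualifier" -> value; columns matching a source
        # pattern are renamed to the target pattern, anything else is dropped
        # (which is how ad:* and ext:sbcmdline would get discarded)
        import fnmatch
        out = {}
        for col, val in columns.items():
            for src, dst in mappings:
                if fnmatch.fnmatch(col, src):
                    out[dst.rstrip('*') + col[len(src.rstrip('*')):]] = val
                    break
        return out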
    HAPPYBASE_HOST = '10.108.16.137'

    # TODO: should we also transform update table?

    # Try to create "tab_name_out"
    HBASE_TIMEOUT = None
    NB_THREADS = 1
    POOL = happybase.ConnectionPool(size=NB_THREADS,
                                    host=HAPPYBASE_HOST,
                                    timeout=HBASE_TIMEOUT)
    with POOL.connection() as CONN:
        get_create_table(TAB_NAME_OUT, CONN, TAB_OUT_FAMILIES)

    # Setup spark job
    SC = SparkContext(appName='transform_' + TAB_NAME_IN + '_to_' +
                      TAB_NAME_OUT)
    SC.setLogLevel("ERROR")
    CONF = SparkConf()
    HBASE_MAN_IN = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_IN)
    HBASE_MAN_OUT = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_OUT)
    transform_table()

    print("Transformation completed.")
# Example 7
            max_images_reduce)
    return c


def check_cdrids(hbase_man_in):
    in_rdd = hbase_man_in.read_hbase_table()
    nb_cdrids = in_rdd.count()
    print('We have {} images.'.format(nb_cdrids))
    cdrids_wsha1valid_rdd = in_rdd.filter(check_wsha1valid_cdrid)
    nb_cdrids_wsha1valid = cdrids_wsha1valid_rdd.count()
    print('We have {} valid images with SHA1 out of {} total images.'.format(
        nb_cdrids_wsha1valid, nb_cdrids))
    sha1_rdd = cdrids_wsha1valid_rdd.flatMap(
        cdrid_key_to_sha1_key).reduceByKey(reduce_sha1_infos_discarding)
    nb_sha1valid = sha1_rdd.count()
    print('We have {} valid unique SHA1 images.'.format(nb_sha1valid))
    print("[check_cdrids] DONE.")


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_cdrid_name = job_conf["tab_cdrid_name"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='check_cdrids_' + tab_cdrid_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_cdrid_name)
    check_cdrids(hbase_man_in)
# Example 8
    return out


def fill_missing_sha1(sc, hbase_man, hbase_man_missing):
    in_rdd = hbase_man.read_hbase_table()
    out_rdd = in_rdd.flatMap(lambda x: has_sha1(x))
    hbase_man_missing.rdd2hbase(out_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_name = job_conf["tab_name"]
    tab_missing_name = job_conf["tab_missing_name"]
    hbase_host = job_conf["hbase_host"]
    in_columns_list = [
        "info:obj_stored_url", "info:crawl_data.image_id", "info:sha1"
    ]
    sc = SparkContext(appName='missing-sha1_' + tab_name + '_to_' +
                      tab_missing_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man = HbaseManager(sc,
                             conf,
                             hbase_host,
                             tab_name,
                             columns_list=in_columns_list)
    hbase_man_missing = HbaseManager(sc, conf, hbase_host, tab_missing_name)
    fill_missing_sha1(sc, hbase_man, hbase_man_missing)
    return tup_list


def fill_sim(sc, hbase_man_in, hbase_man_out):
    in_rdd = hbase_man_in.read_hbase_table()
    sim_images_hb_rdd = in_rdd.flatMap(lambda x: create_sim_images_tuple(x))
    hbase_man_out.rdd2hbase(sim_images_hb_rdd)
    print "[fill_sim] Done."


if __name__ == '__main__':
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_name_in = job_conf["tab_name_in"]
    tab_name_out = job_conf["tab_name_out"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName=tab_name_in + '_to_' + tab_name_out)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    in_columns_list = [
        "meta:sha1", "meta:columbia_near_dups_sha1",
        "meta:columbia_near_dups_sha1_dist"
    ]
    hbase_man_in = HbaseManager(sc,
                                conf,
                                hbase_host,
                                tab_name_in,
                                columns_list=in_columns_list)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_name_out)
    fill_sim(sc, hbase_man_in, hbase_man_out)
# Example 10
    return [(key, [key, "info", "precomp_sim", "True"])]


def mark_precomp_sim(hbase_man_in, hbase_man_out):
    in_rdd = hbase_man_in.read_hbase_table()
    existing_sims = in_rdd.keys()
    existing_sims_count = existing_sims.count()
    print("existing_sims count: {}".format(existing_sims_count))
    sample_existing_sims = existing_sims.first()
    print("existing_sims first: {}".format(sample_existing_sims))
    out_rdd = existing_sims.flatMap(prepare_mark_precomp)
    sample_out_rdd = out_rdd.take(5)
    print("out_rdd sample: {}".format(sample_out_rdd))
    hbase_man_out.rdd2hbase(out_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_sim_name = job_conf["tab_sim"]
    tab_sha1_infos_name = job_conf["tab_sha1_infos"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='mark-precomp-sim_from_' + tab_sim_name + '_to_' +
                      tab_sha1_infos_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_sim_name)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_sha1_infos_name)
    mark_precomp_sim(hbase_man_in, hbase_man_out)
# Example 11
    # build a JSON string mapping each "family:qualifier" to its list of values for this sha1
    out = [(x[0],
            '{' + ','.join(['"' + field[0] + ':' + field[1] + '":["' +
                            '","'.join(x[1][field[0] + ":" + field[1]]) + '"]'
                            for field in fields_list]) + '}')]
    # print out when there is more than one cdr id for a sha1
    #if len(x[1][fields_list[1][0]+":"+fields_list[1][1]])>1:
    #    print out
    return out

def fill_sha1_infos(sc, hbase_man_in, outfile_name):
    in_rdd = hbase_man_in.read_hbase_table()
    tmp_rdd = in_rdd.flatMap(lambda x: to_sha1_key(x)).reduceByKey(reduce_sha1_infos)
    # array not compatible with SequenceFile output
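    # (split_sha1_kv_json, defined earlier in the original script, presumably
    # flattens each sha1's dict of value lists into a single JSON string so that
    # every value written to the SequenceFile is a plain string)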
    out_rdd = tmp_rdd.flatMap(lambda x: split_sha1_kv_json(x))
    out_rdd.saveAsSequenceFile(outfile_name)

if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json","rt"))
    print job_conf
    tab_cdrid_name = job_conf["tab_cdrid_name"]
    hbase_host = job_conf["hbase_host"]
    outfile_name = job_conf["outfile_name_hdfs"]
    sc = SparkContext(appName='sha1_infos_from_'+tab_cdrid_name+'_to_hdfs_'+outfile_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    # Get only these columns from table tab_cdrid_name
    in_columns_list = ["info:sha1", "info:obj_stored_url", "info:obj_parent"]
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_cdrid_name, columns_list=in_columns_list)
    fill_sha1_infos(sc, hbase_man_in, outfile_name)
# Example 12
import sys
print(sys.version)

import json
from pyspark import SparkContext, SparkConf
from hbase_manager import HbaseManager

fields_list = [("info","all_cdr_ids"), ("info","s3_url"), ("info","all_parent_ids"), ("info","image_discarded"), ("info","cu_feat_id")]
join_columns_list = [':'.join(x) for x in fields_list]

## MAIN
if __name__ == '__main__':
    
    # Read job_conf
    job_conf = json.load(open("job_conf.json","rt"))
    print job_conf
    sc = SparkContext(appName="test_read_hbase")
    conf = SparkConf()
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    # HBase Conf
    hbase_host = job_conf["hbase_host"]
    tab_name = job_conf["tab_name"]
    hbase_man = HbaseManager(sc, conf, hbase_host, tab_name, columns_list=join_columns_list)

    # Run test
    in_rdd = hbase_man.read_hbase_table()
    print in_rdd.first()
# Example 13
    nb_images_to_dl = images_to_dl_rdd.count()
    # use float division so the partition count rounds up correctly in Python 2
    nb_partitions = int(np.ceil(nb_images_to_dl / float(nb_images_by_partition)))
    print(
        'We have {} images, we want a maximum of {} images by partition. So we will partition in {} partitions.'
        .format(nb_images_to_dl, nb_images_by_partition, nb_partitions))
    out_rdd = images_to_dl_rdd.partitionBy(nb_partitions).flatMap(
        lambda x: download_image(x))
    hbase_man_out.rdd2hbase(out_rdd)
    nb_images_dled = out_rdd.count()
    print('We have downloaded {} images.'.format(nb_images_dled))
    print("[fill_binary_image] DONE.")


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_sha1_name = job_conf["tab_sha1_name"]
    hbase_host = job_conf["hbase_host"]
    nb_images_by_partition = job_conf["nb_images_by_partition"]
    sc = SparkContext(appName='dl_images_' + tab_sha1_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man_in = HbaseManager(sc,
                                conf,
                                hbase_host,
                                tab_sha1_name,
                                columns=["info:image", "info:s3_url"])
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_sha1_name)
    fill_binary_image(hbase_man_in, hbase_man_out, nb_images_by_partition)
from hbase_manager import HbaseManager

fields_list = [("info", "all_cdr_ids"), ("info", "s3_url"),
               ("info", "all_parent_ids"), ("info", "image_discarded"),
               ("info", "cu_feat_id")]
join_columns_list = [':'.join(x) for x in fields_list]

## MAIN
if __name__ == '__main__':

    # Read job_conf
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    sc = SparkContext(appName="test_read_hbase")
    conf = SparkConf()
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    # HBase Conf
    hbase_host = job_conf["hbase_host"]
    tab_name = job_conf["tab_name"]
    hbase_man = HbaseManager(sc,
                             conf,
                             hbase_host,
                             tab_name,
                             columns_list=join_columns_list)

    # Run test
    in_rdd = hbase_man.read_hbase_table()
    print in_rdd.first()
    out_rdd = in_rdd.map(check_processed)
    print("out_rdd.first(): {}".format(out_rdd.first()))
    count_rdd = out_rdd.reduce(lambda x, y: (0, x[1] + y[1], x[2] + y[2]))
    print("count_rdd: {}".format(count_rdd))
    missing_rdd = out_rdd.filter(lambda x: x[1] == 0)
    missing_rdd.saveAsTextFile(OUT_PATH)
    print("missing_rdd count: {}".format(missing_rdd.count()))


if __name__ == '__main__':
    from hbase_manager import HbaseManager

    # Read conf
    JOB_CONF = json.load(open("job_conf_count_extr.json", "rt"))
    print(JOB_CONF)
    TAB_NAME_CHECK = JOB_CONF["tab_name_check"]
    HBASE_HOST_SPARK = JOB_CONF["hbase_host"]
    EXTR_TYPE = JOB_CONF["extr_type"]
    EXTR_COL = JOB_CONF["extr_column"]
    OUT_PATH = JOB_CONF["out_path"]

    # Setup spark job
    SC = SparkContext(appName='count_extractions_' + EXTR_TYPE + '_in_' +
                      TAB_NAME_CHECK)
    SC.setLogLevel("ERROR")
    CONF = SparkConf()
    HBASE_MAN_IN = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_CHECK)
    check_table()

    print("Check completed.")
    json_x = [json.loads(x) for x in data[1].split("\n")]
    tup_list = [(doc_id, [doc_id, "info", "insert_ts", str(max_ts - int(ts))])]
    # do we want to keep info:doc_id ?
    for x in json_x:
        tup_list.append(
            (doc_id, [doc_id, x["columnFamily"], x["qualifier"], x["value"]]))
    return tup_list


def fill_cdr_ids_infos(hbase_man_in, hbase_man_out):
    in_rdd = hbase_man_in.read_hbase_table()
    cdr_ids_infos_rdd = in_rdd.flatMap(lambda x: ts_to_cdr_id(x))
    hbase_man_out.rdd2hbase(cdr_ids_infos_rdd)
    print "[fill_cdr_ids_infos] Done."


if __name__ == '__main__':
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_name_in = job_conf["tab_name_in"]
    tab_name_out = job_conf["tab_name_out"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName=tab_name_in + '_to_' + tab_name_out)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    #in_columns_list = ["meta:sha1", "meta:columbia_near_dups_sha1", "meta:columbia_near_dups_sha1_dist"]
    #hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_name_in, columns_list=in_columns_list)
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_name_in)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_name_out)
    fill_cdr_ids_infos(hbase_man_in, hbase_man_out)
# Example 17
    hbase_host = job_conf["hbase_host"]
    # row key range required by the hbase_man_in call below
    min_htid = 143409237
    max_htid = 153934312
    nb_partitions = job_conf["nb_partitions"]
    #row_start = '52010000'
    #row_stop = '52020000'
    sc = SparkContext(appName='fix_corrupted_links_' + tab_images_name)
    sc.setLogLevel("ERROR")
    c_rdd = sc.textFile(
        'hdfs://memex/user/skaraman/fix_corrupted_links/corrupted_slice_33514258_33515000.csv'
    ).map(lambda x: (x.strip()[1:-1], x.strip()[1:-1])).partitionBy(
        nb_partitions)
    #ct_rdd = sc.textFile('hdfs://memex/user/skaraman/fix_corrupted_links/corrupted_targets.csv').map(lambda x: (x.strip()[1:-1],x.strip()[1:-1])).partitionBy(nb_partitions)
    #cs_rdd = sc.textFile('hdfs://memex/user/skaraman/fix_corrupted_links/corrupted_sources.csv').map(lambda x: (x.strip()[1:-1],x.strip()[1:-1])).partitionBy(nb_partitions)
    conf = SparkConf()
    in_columns_list = [
        "meta:sha1", "meta:columbia_near_dups_sha1",
        "meta:columbia_near_dups_sha1_dist"
    ]
    hbase_man_in = HbaseManager(sc,
                                conf,
                                hbase_host,
                                tab_images_name,
                                columns_list=in_columns_list,
                                row_start=min_htid,
                                row_stop=max_htid)
    #hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_images_name, columns_list=in_columns_list)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_images_name)
    fix_corrupted_links(sc, hbase_man_in, hbase_man_out, c_rdd, c_rdd,
                        nb_partitions)
    out_rdd = tmp_rdd.flatMap(
        lambda x: split_sha1_kv_filter_max_images_discarded(x))
    hbase_man_out.rdd2hbase(out_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_cdrid_name = job_conf["tab_cdrid_name"]
    hbase_host = job_conf["hbase_host"]
    tab_sha1_infos_name = job_conf["tab_sha1_infos_name"]
    max_images = job_conf["max_images"]
    nb_partitions = job_conf["nb_partitions"]
    sc = SparkContext(appName='sha1_infos_from_' + tab_cdrid_name + '_in_' +
                      tab_sha1_infos_name + '_filter_gt_' + str(max_images))
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    # Get only these columns from table tab_cdrid_name
    in_columns_list = ["info:sha1", "info:obj_stored_url", "info:obj_parent"]
    hbase_man_in = HbaseManager(sc,
                                conf,
                                hbase_host,
                                tab_cdrid_name,
                                columns_list=in_columns_list)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_sha1_infos_name)
    fill_sha1_infos(sc, hbase_man_in, hbase_man_out, nb_partitions)
# Example 19
    iterator = out_rdd.toLocalIterator()
    batch_update = []
    for x in iterator:
        batch_update.append(x)
        if len(batch_update) == batch_update_size:
            batch_rdd = build_batch_rdd(batch_update)
            hbase_man_out.rdd2hbase(batch_rdd)
            batch_update = []
    # push any remaining rows as a last, smaller batch
    if batch_update:
        batch_rdd = build_batch_rdd(batch_update)
        hbase_man_out.rdd2hbase(batch_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json","rt"))
    print job_conf
    tab_sha1_infos_name = job_conf["tab_sha1_infos"]
    tab_updates_name = job_conf["tab_updates"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='create_first_updates_from_'+tab_sha1_infos_name+'_pushed_to_'+tab_updates_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    #columns = ["info:hash256_cu", "info:s3_url", "info:featnorm_cu", "info:image_discarded", "info:cu_feat_id"]
    # anyway features have been computed but not indexed?
    columns = ["info:s3_url", "info:image_discarded", "info:cu_feat_id"]
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_sha1_infos_name, columns_list=columns)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_updates_name)
    create_first_updates(hbase_man_in, hbase_man_out)