es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)

# Setup SparkContext
sc = SparkContext(appName="getimages-" + ingestion_id + dev_release_suffix)
conf = SparkConf()
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

# Setup HBase managers
join_columns_list = [':'.join(x) for x in fields_list]
get_create_table(c_options.tab_sha1_infos_name, c_options)
hbase_fullhost = c_options.hbase_host + ':' + str(c_options.hbase_port)
hbase_man_sha1infos_join = HbaseManager(sc, conf, hbase_fullhost,
                                        c_options.tab_sha1_infos_name,
                                        columns_list=join_columns_list)
hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost,
                                       c_options.tab_sha1_infos_name)
get_create_table(c_options.tab_update_name, c_options)
hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost,
                                    c_options.tab_update_name)

# Setup ES manager
es_man = ES(sc, conf, c_options.es_index, c_options.es_domain, c_options.es_host,
            c_options.es_port, c_options.es_user, c_options.es_pass)
es_man.set_output_json()
es_man.set_read_metadata()
print inst
parser.print_help()

es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)

# Setup SparkContext
sc = SparkContext(appName="extract-features-" + ingestion_id + job_suffix)
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/network.py')
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.py')
sc.addFile('hdfs://memex/user/skaraman/extract-features/imagenet_mean.npy')
sc.addFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.npy')
conf = SparkConf()
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

# Setup HBase managers
# just to be sure we will be able to write out to the tables
get_create_table(c_options.tab_sha1_infos_name, c_options)
get_create_table(c_options.tab_update_name, c_options)
# hbase managers
hbase_fullhost = c_options.hbase_host + ':' + str(c_options.hbase_port)
hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost,
                                       c_options.tab_sha1_infos_name)
hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost,
                                    c_options.tab_update_name)

# Run extraction
print "[START] Starting feature extraction for ingestion {}".format(ingestion_id)
run_extraction(hbase_man_sha1infos_out, hbase_man_update_out, ingestion_id, c_options)
print "[DONE] Extracted features for ingestion {} in {}s.".format(ingestion_id, time.time() - start_time)
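# On the executors, files shipped with sc.addFile above are typically resolved
# through SparkFiles; this is a minimal sketch for illustration only, since the
# extraction code that actually consumes these files is not part of this excerpt.
from pyspark import SparkFiles

imagenet_mean_local_path = SparkFiles.get('imagenet_mean.npy')
sentibank_weights_local_path = SparkFiles.get('tfdeepsentibank.npy')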
def check_updates(hbase_man_in):
    in_rdd = hbase_man_in.read_hbase_table()
    nb_updates = in_rdd.count()
    updates_notprocessed_rdd = in_rdd.filter(update_not_processed)
    nb_updates_notprocessed = updates_notprocessed_rdd.count()
    print('We have {} updates not processed out of {}.'.format(nb_updates_notprocessed, nb_updates))
    print("[check_updates] DONE.")


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    #job_conf = json.load(open("job_conf.json", "rt"))
    job_conf = json.load(open("job_conf_dev.json", "rt"))
    print job_conf
    tab_updates_name = job_conf["tab_updates_name"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='check_updates_' + tab_updates_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    # read rows starting from 'index_update_' in 'tab_updates_name'
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_updates_name,
                                row_start='index_update_', row_end='index_update_~')
    check_updates(hbase_man_in)
    print query
    es_rdd = es_man.es2rdd(query)
    images_hb_rdd = es_rdd.flatMap(lambda x: create_images_tuple(x))
    hbase_man.rdd2hbase(images_hb_rdd)


if __name__ == '__main__':
    # Read job_conf
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    # Set parameters from job_conf
    tab_name = job_conf["tab_name"]
    hbase_host = job_conf["hbase_host"]
    new_crawler = job_conf["new_crawler"]
    es_index = job_conf["es_index"]
    es_domain = job_conf["es_domain"]
    es_host = job_conf["es_host"]
    es_port = job_conf["es_port"]
    es_user = job_conf["es_user"]
    es_pass = job_conf["es_pass"]
    # Update fields based on new_crawler value
    set_fields_filter(new_crawler)
    # Start job
    sc = SparkContext(appName=tab_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    es_man = ES(sc, conf, es_index, es_domain, es_host, es_port, es_user, es_pass)
    hbase_man = HbaseManager(sc, conf, hbase_host, tab_name)
    move_data(es_man, hbase_man)
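# An illustrative job_conf.json for this ES-to-HBase job, shown here as a
# Python dict. The keys are exactly the ones read above, but every value is a
# placeholder assumption, not a real cluster setting.
example_job_conf = {
    "tab_name": "images_cdrid_infos_test",  # hypothetical HBase table name
    "hbase_host": "10.0.0.1",               # hypothetical HBase host
    "new_crawler": True,
    "es_index": "memex-domains",            # hypothetical ES index
    "es_domain": "escorts",
    "es_host": "es.example.org",
    "es_port": 9200,
    "es_user": "es_readonly",
    "es_pass": "changeme",
}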
    return kv


def count_dups(hbase_man_timestamp, outfilename):
    in_rdd = hbase_man_timestamp.read_hbase_table()
    count_rdd = in_rdd.map(lambda x: split_ts_cdrid_rowkey(x))
    summary_rdd = count_rdd.reduceByKey(lambda x, y: x + y)\
                           .map(lambda (x, y): (y, x))\
                           .sortByKey(0, 1)\
                           .map(lambda (x, y): (y, x))\
                           .map(lambda x: x[0] + ":" + str(x[1]))
    print summary_rdd.count()
    summary_rdd.saveAsTextFile(outfilename)


if __name__ == '__main__':
    # Read job_conf
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    # Set parameters from job_conf
    tab_timestamp_name = job_conf["tab_timestamp_name"]
    hbase_host = job_conf["hbase_host"]
    outfilename = job_conf["outfilename"]
    # Start job
    sc = SparkContext(appName=tab_timestamp_name + "_count_dups")
    conf = SparkConf()
    hbase_man_timestamp = HbaseManager(sc, conf, hbase_host, tab_timestamp_name)
    count_dups(hbase_man_timestamp, outfilename)
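# A minimal, Python 3 compatible sketch of the "sort keys by descending count"
# chain used in count_dups above (the tuple-unpacking lambdas in the original
# only parse under Python 2). Names and data here are illustrative only.
def sort_by_count_desc(pair_rdd):
    # (key, count) -> (count, key) -> sort descending -> "key:count" strings
    return pair_rdd.map(lambda kv: (kv[1], kv[0])) \
                   .sortByKey(ascending=False, numPartitions=1) \
                   .map(lambda ck: ck[1] + ":" + str(ck[0]))

# e.g. sc.parallelize([("a", 3), ("b", 1), ("c", 7)]) -> ['c:7', 'a:3', 'b:1']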
# escorts_images_sha1_infos_from_ts_subsampled_newformat => ht_images_infos_merged_subsampled
# discard ad:*, ext:sbcmdline (at least for the real transform): just do not put them in mappings.
# mappings should be an array of arrays like:
# ["ext:dlib*", "data:dlib*"]
# ["ext:sbpycaffe*", "data:sbpycaffe*"]
# ["info:s3_url", "data:s3_url"]

# Could be a parameter in conf
HAPPYBASE_HOST = '10.108.16.137'

# TODO: should we also transform the update table?

# Try to create "tab_name_out"
HBASE_TIMEOUT = None
NB_THREADS = 1
POOL = happybase.ConnectionPool(size=NB_THREADS, host=HAPPYBASE_HOST, timeout=HBASE_TIMEOUT)
with POOL.connection() as CONN:
    get_create_table(TAB_NAME_OUT, CONN, TAB_OUT_FAMILIES)

# Setup spark job
SC = SparkContext(appName='transform_' + TAB_NAME_IN + '_to_' + TAB_NAME_OUT)
SC.setLogLevel("ERROR")
CONF = SparkConf()
HBASE_MAN_IN = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_IN)
HBASE_MAN_OUT = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_OUT)
transform_table()
print("Transformation completed.")
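# A hypothetical 'mappings' value matching the comment at the top of the
# transform script above (old column pattern -> new column pattern); the conf
# key it would be read from is an assumption, not shown in this excerpt.
EXAMPLE_MAPPINGS = [
    ["ext:dlib*", "data:dlib*"],
    ["ext:sbpycaffe*", "data:sbpycaffe*"],
    ["info:s3_url", "data:s3_url"],
]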
                          max_images_reduce)
    return c


def check_cdrids(hbase_man_in):
    in_rdd = hbase_man_in.read_hbase_table()
    nb_cdrids = in_rdd.count()
    print('We have {} images.'.format(nb_cdrids))
    cdrids_wsha1valid_rdd = in_rdd.filter(check_wsha1valid_cdrid)
    nb_cdrids_wsha1valid = cdrids_wsha1valid_rdd.count()
    print('We have {} valid images with SHA1 out of {} total images.'.format(nb_cdrids_wsha1valid, nb_cdrids))
    sha1_rdd = cdrids_wsha1valid_rdd.flatMap(cdrid_key_to_sha1_key).reduceByKey(reduce_sha1_infos_discarding)
    nb_sha1valid = sha1_rdd.count()
    print('We have {} valid unique SHA1 images.'.format(nb_sha1valid))
    print("[check_cdrids] DONE.")


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_cdrid_name = job_conf["tab_cdrid_name"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='check_cdrids_' + tab_cdrid_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_cdrid_name)
    check_cdrids(hbase_man_in)
    return out


def fill_missing_sha1(sc, hbase_man, hbase_man_missing):
    in_rdd = hbase_man.read_hbase_table()
    out_rdd = in_rdd.flatMap(lambda x: has_sha1(x))
    hbase_man_missing.rdd2hbase(out_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_name = job_conf["tab_name"]
    tab_missing_name = job_conf["tab_missing_name"]
    hbase_host = job_conf["hbase_host"]
    in_columns_list = ["info:obj_stored_url", "info:crawl_data.image_id", "info:sha1"]
    sc = SparkContext(appName='missing-sha1_' + tab_name + '_to_' + tab_missing_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man = HbaseManager(sc, conf, hbase_host, tab_name, columns_list=in_columns_list)
    hbase_man_missing = HbaseManager(sc, conf, hbase_host, tab_missing_name)
    fill_missing_sha1(sc, hbase_man, hbase_man_missing)
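# A minimal sketch of what a has_sha1 flatMap could look like, assuming HBase
# rows arrive as (row_key, newline-separated JSON cells) as in the other jobs
# of this excerpt; the row format and the copy-to-missing-table behaviour are
# assumptions, not the project's actual helper.
import json

def has_sha1(row):
    key, data = row
    cells = [json.loads(c) for c in data.split("\n")]
    if any(c["columnFamily"] == "info" and c["qualifier"] == "sha1" for c in cells):
        return []  # SHA1 already present: nothing to write to the missing table
    # copy the row's cells to the missing-SHA1 table so the image can be reprocessed
    return [(key, [key, c["columnFamily"], c["qualifier"], c["value"]]) for c in cells]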
    return tup_list


def fill_sim(sc, hbase_man_in, hbase_man_out):
    in_rdd = hbase_man_in.read_hbase_table()
    sim_images_hb_rdd = in_rdd.flatMap(lambda x: create_sim_images_tuple(x))
    hbase_man_out.rdd2hbase(sim_images_hb_rdd)
    print "[fill_sim] Done."


if __name__ == '__main__':
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_name_in = job_conf["tab_name_in"]
    tab_name_out = job_conf["tab_name_out"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName=tab_name_in + '_to_' + tab_name_out)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    in_columns_list = ["meta:sha1", "meta:columbia_near_dups_sha1", "meta:columbia_near_dups_sha1_dist"]
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_name_in, columns_list=in_columns_list)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_name_out)
    fill_sim(sc, hbase_man_in, hbase_man_out)
    return [(key, [key, "info", "precomp_sim", "True"])]


def mark_precomp_sim(hbase_man_in, hbase_man_out):
    in_rdd = hbase_man_in.read_hbase_table()
    existing_sims = in_rdd.keys()
    existing_sims_count = existing_sims.count()
    print("existing_sims count: {}".format(existing_sims_count))
    sample_existing_sims = existing_sims.first()
    print("existing_sims first: {}".format(sample_existing_sims))
    out_rdd = existing_sims.flatMap(prepare_mark_precomp)
    sample_out_rdd = out_rdd.take(5)
    print("out_rdd sample: {}".format(sample_out_rdd))
    hbase_man_out.rdd2hbase(out_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_sim_name = job_conf["tab_sim"]
    tab_sha1_infos_name = job_conf["tab_sha1_infos"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='mark-precomp-sim_from_' + tab_sim_name + '_to_' + tab_sha1_infos_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_sim_name)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_sha1_infos_name)
    mark_precomp_sim(hbase_man_in, hbase_man_out)
    out = [(x[0], '{' + ','.join(['"' + field[0] + ':' + field[1] + '":["' +
                                  '","'.join(x[1][field[0] + ":" + field[1]]) + '"]'
                                  for field in fields_list]) + '}')]
    # print out when there is more than one cdr id for a sha1
    #if len(x[1][fields_list[1][0] + ":" + fields_list[1][1]]) > 1:
    #    print out
    return out


def fill_sha1_infos(sc, hbase_man_in, outfile_name):
    in_rdd = hbase_man_in.read_hbase_table()
    tmp_rdd = in_rdd.flatMap(lambda x: to_sha1_key(x)).reduceByKey(reduce_sha1_infos)
    # array not compatible with SequenceFile output
    out_rdd = tmp_rdd.flatMap(lambda x: split_sha1_kv_json(x))
    out_rdd.saveAsSequenceFile(outfile_name)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_cdrid_name = job_conf["tab_cdrid_name"]
    hbase_host = job_conf["hbase_host"]
    outfile_name = job_conf["outfile_name_hdfs"]
    sc = SparkContext(appName='sha1_infos_from_' + tab_cdrid_name + '_to_hdfs_' + outfile_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    # Get only these columns from table tab_cdrid_name:
    #   "info:sha1", "info:obj_stored_url", "info:obj_parent"
    in_columns_list = ["info:sha1", "info:obj_stored_url", "info:obj_parent"]
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_cdrid_name, columns_list=in_columns_list)
    fill_sha1_infos(sc, hbase_man_in, outfile_name)
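# A small worked example of the JSON string built by the 'out' expression at
# the top of the script above, with made-up values; the actual fields_list of
# that script is not shown in this excerpt, so two plausible fields are used.
fields_list = [("info", "all_cdr_ids"), ("info", "s3_url")]
x = ("000a1b2c3d4e5f",  # hypothetical SHA1 row key
     {"info:all_cdr_ids": ["cdr1", "cdr2"],
      "info:s3_url": ["https://s3.amazonaws.com/bucket/img.jpg"]})
out_value = '{' + ','.join(['"' + field[0] + ':' + field[1] + '":["' +
                            '","'.join(x[1][field[0] + ":" + field[1]]) + '"]'
                            for field in fields_list]) + '}'
# out_value == '{"info:all_cdr_ids":["cdr1","cdr2"],"info:s3_url":["https://s3.amazonaws.com/bucket/img.jpg"]}'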
import sys
print(sys.version)
import json
from pyspark import SparkContext, SparkConf
from hbase_manager import HbaseManager

fields_list = [("info", "all_cdr_ids"), ("info", "s3_url"), ("info", "all_parent_ids"),
               ("info", "image_discarded"), ("info", "cu_feat_id")]
join_columns_list = [':'.join(x) for x in fields_list]

## MAIN
if __name__ == '__main__':
    # Read job_conf
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    sc = SparkContext(appName="test_read_hbase")
    conf = SparkConf()
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
    # HBase Conf
    hbase_host = job_conf["hbase_host"]
    tab_name = job_conf["tab_name"]
    hbase_man = HbaseManager(sc, conf, hbase_host, tab_name, columns_list=join_columns_list)
    # Run test
    in_rdd = hbase_man.read_hbase_table()
    print in_rdd.first()
    nb_images_to_dl = images_to_dl_rdd.count()
    # float() cast so the ceil is computed on a float ratio, not a truncated integer division
    nb_partitions = int(np.ceil(float(nb_images_to_dl) / nb_images_by_partition))
    print('We have {} images and want at most {} images per partition, so we will use {} partitions.'.format(nb_images_to_dl, nb_images_by_partition, nb_partitions))
    out_rdd = images_to_dl_rdd.partitionBy(nb_partitions).flatMap(lambda x: download_image(x))
    hbase_man_out.rdd2hbase(out_rdd)
    nb_images_dled = out_rdd.count()
    print('We have downloaded {} images.'.format(nb_images_dled))
    print("[fill_binary_image] DONE.")


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_sha1_name = job_conf["tab_sha1_name"]
    hbase_host = job_conf["hbase_host"]
    nb_images_by_partition = job_conf["nb_images_by_partition"]
    sc = SparkContext(appName='dl_images_' + tab_sha1_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_sha1_name,
                                columns_list=["info:image", "info:s3_url"])
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_sha1_name)
    fill_binary_image(hbase_man_in, hbase_man_out, nb_images_by_partition)
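# Quick check of the partition arithmetic above, with illustrative numbers:
# 2500 images at a maximum of 1000 images per partition needs
# ceil(2500 / 1000) = 3 partitions. The float() cast matters under Python 2,
# where integer division would otherwise truncate to 2 before the ceil.
import numpy as np
assert int(np.ceil(float(2500) / 1000)) == 3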
    out_rdd = in_rdd.map(check_processed)
    print("out_rdd.first(): {}".format(out_rdd.first()))
    count_rdd = out_rdd.reduce(lambda x, y: (0, x[1] + y[1], x[2] + y[2]))
    print("count_rdd: {}".format(count_rdd))
    missing_rdd = out_rdd.filter(lambda x: x[1] == 0)
    missing_rdd.saveAsTextFile(OUT_PATH)
    print("missing_rdd count: {}".format(missing_rdd.count()))


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    # Read conf
    JOB_CONF = json.load(open("job_conf_count_extr.json", "rt"))
    print(JOB_CONF)
    TAB_NAME_CHECK = JOB_CONF["tab_name_check"]
    HBASE_HOST_SPARK = JOB_CONF["hbase_host"]
    EXTR_TYPE = JOB_CONF["extr_type"]
    EXTR_COL = JOB_CONF["extr_column"]
    OUT_PATH = JOB_CONF["out_path"]
    # Setup spark job
    SC = SparkContext(appName='count_extractions_' + EXTR_TYPE + '_in_' + TAB_NAME_CHECK)
    SC.setLogLevel("ERROR")
    CONF = SparkConf()
    HBASE_MAN_IN = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_CHECK)
    check_table()
    print("Check completed.")
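# A minimal sketch of a check_processed compatible with the reduce and filter
# above: it emits (row_key, 1 if the extraction column is present else 0, 1),
# so the reduce yields (_, nb_processed, nb_total) and the filter keeps rows
# whose extraction is missing. The (key, newline-separated JSON cells) row
# format is an assumption based on the other jobs in this excerpt; EXTR_COL is
# the extraction column read from the job conf above.
import json

def check_processed(row):
    key, data = row
    cells = [json.loads(c) for c in data.split("\n")]
    has_extr = any(c["columnFamily"] + ":" + c["qualifier"] == EXTR_COL for c in cells)
    return (key, 1 if has_extr else 0, 1)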
    json_x = [json.loads(x) for x in data[1].split("\n")]
    tup_list = [(doc_id, [doc_id, "info", "insert_ts", str(max_ts - int(ts))])]
    # do we want to keep info:doc_id?
    for x in json_x:
        tup_list.append((doc_id, [doc_id, x["columnFamily"], x["qualifier"], x["value"]]))
    return tup_list


def fill_cdr_ids_infos(hbase_man_in, hbase_man_out):
    in_rdd = hbase_man_in.read_hbase_table()
    cdr_ids_infos_rdd = in_rdd.flatMap(lambda x: ts_to_cdr_id(x))
    hbase_man_out.rdd2hbase(cdr_ids_infos_rdd)
    print "[fill_cdr_ids_infos] Done."


if __name__ == '__main__':
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_name_in = job_conf["tab_name_in"]
    tab_name_out = job_conf["tab_name_out"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName=tab_name_in + '_to_' + tab_name_out)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    #in_columns_list = ["meta:sha1", "meta:columbia_near_dups_sha1", "meta:columbia_near_dups_sha1_dist"]
    #hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_name_in, columns_list=in_columns_list)
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_name_in)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_name_out)
    fill_cdr_ids_infos(hbase_man_in, hbase_man_out)
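# Small illustration of the str(max_ts - int(ts)) value written above as
# info:insert_ts: subtracting from a fixed max_ts makes more recent documents
# produce smaller values, so they sort first lexicographically in HBase.
# max_ts here is an arbitrary example value, not the project's actual constant.
max_ts = 9999999999999
older, newer = 1467000000000, 1468000000000
assert str(max_ts - newer) < str(max_ts - older)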
    hbase_host = job_conf["hbase_host"]
    #min_htid = 143409237
    #max_htid = 153934312
    nb_partitions = job_conf["nb_partitions"]
    #row_start = '52010000'
    #row_stop = '52020000'
    sc = SparkContext(appName='fix_corrupted_links_' + tab_images_name)
    sc.setLogLevel("ERROR")
    c_rdd = sc.textFile('hdfs://memex/user/skaraman/fix_corrupted_links/corrupted_slice_33514258_33515000.csv')\
              .map(lambda x: (x.strip()[1:-1], x.strip()[1:-1]))\
              .partitionBy(nb_partitions)
    #ct_rdd = sc.textFile('hdfs://memex/user/skaraman/fix_corrupted_links/corrupted_targets.csv').map(lambda x: (x.strip()[1:-1], x.strip()[1:-1])).partitionBy(nb_partitions)
    #cs_rdd = sc.textFile('hdfs://memex/user/skaraman/fix_corrupted_links/corrupted_sources.csv').map(lambda x: (x.strip()[1:-1], x.strip()[1:-1])).partitionBy(nb_partitions)
    conf = SparkConf()
    in_columns_list = ["meta:sha1", "meta:columbia_near_dups_sha1", "meta:columbia_near_dups_sha1_dist"]
    # min_htid and max_htid must be defined earlier in the script for the
    # row_start/row_stop below (the commented values above are an example)
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_images_name,
                                columns_list=in_columns_list,
                                row_start=min_htid, row_stop=max_htid)
    #hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_images_name, columns_list=in_columns_list)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_images_name)
    fix_corrupted_links(sc, hbase_man_in, hbase_man_out, c_rdd, c_rdd, nb_partitions)
    out_rdd = tmp_rdd.flatMap(lambda x: split_sha1_kv_filter_max_images_discarded(x))
    hbase_man_out.rdd2hbase(out_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_cdrid_name = job_conf["tab_cdrid_name"]
    hbase_host = job_conf["hbase_host"]
    tab_sha1_infos_name = job_conf["tab_sha1_infos_name"]
    max_images = job_conf["max_images"]
    nb_partitions = job_conf["nb_partitions"]
    sc = SparkContext(appName='sha1_infos_from_' + tab_cdrid_name + '_in_' + tab_sha1_infos_name + '_filter_gt_' + str(max_images))
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    # Get only these columns from table tab_cdrid_name:
    #   "info:sha1", "info:obj_stored_url", "info:obj_parent"
    in_columns_list = ["info:sha1", "info:obj_stored_url", "info:obj_parent"]
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_cdrid_name, columns_list=in_columns_list)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_sha1_infos_name)
    fill_sha1_infos(sc, hbase_man_in, hbase_man_out, nb_partitions)
    iterator = out_rdd.toLocalIterator()
    batch_update = []
    for x in iterator:
        batch_update.append(x)
        if len(batch_update) == batch_update_size:
            batch_rdd = build_batch_rdd(batch_update)
            hbase_man_out.rdd2hbase(batch_rdd)
            batch_update = []
    # last batch
    if batch_update:
        batch_rdd = build_batch_rdd(batch_update)
        hbase_man_out.rdd2hbase(batch_rdd)


if __name__ == '__main__':
    from hbase_manager import HbaseManager
    job_conf = json.load(open("job_conf.json", "rt"))
    print job_conf
    tab_sha1_infos_name = job_conf["tab_sha1_infos"]
    tab_updates_name = job_conf["tab_updates"]
    hbase_host = job_conf["hbase_host"]
    sc = SparkContext(appName='create_first_updates_from_' + tab_sha1_infos_name + '_pushed_to_' + tab_updates_name)
    sc.setLogLevel("ERROR")
    conf = SparkConf()
    #columns = ["info:hash256_cu", "info:s3_url", "info:featnorm_cu", "info:image_discarded", "info:cu_feat_id"]
    # anyway features have been computed but not indexed?
    columns = ["info:s3_url", "info:image_discarded", "info:cu_feat_id"]
    hbase_man_in = HbaseManager(sc, conf, hbase_host, tab_sha1_infos_name, columns_list=columns)
    hbase_man_out = HbaseManager(sc, conf, hbase_host, tab_updates_name)
    create_first_updates(hbase_man_in, hbase_man_out)
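# A minimal sketch of what build_batch_rdd could be, assuming it simply turns
# the accumulated list of (row_key, cell) tuples back into an RDD so that
# rdd2hbase can push the batch; the project's real helper may differ.
def build_batch_rdd(batch_update):
    return sc.parallelize(batch_update)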