def setup():
    delegate.fakeload()

    from infogami.utils import types
    types.register_type('/i18n(/.*)?/strings.[^/]*', '/type/i18n')

    for site in db.get_all_sites():
        load_strings(site)
def spark_split_jobs():
    # Map each state to its list of site ids, and keep a reverse
    # index from site_id back to the full site record.
    state = {}
    site = {}
    db = get_all_sites()
    print("TOTAL SITES %u" % len(db))
    for site_info in db:
        # Accumulate site ids by state
        if site_info['state'] not in state:
            state[site_info['state']] = [site_info['site_id']]
        else:
            state[site_info['state']].append(site_info['site_id'])
        # Reverse index by site_id
        if site_info['site_id'] not in site:
            site[site_info['site_id']] = site_info

    '''
    # Print states and sites
    min_sites = 9999999
    max_sites = 0
    min_state = ""
    max_state = ""
    for s in state.keys():
        num_sites = len(state[s])
        print("STATE: %s SITES: %u" % (s, len(state[s])))
        if s is None:
            continue
        if num_sites < min_sites:
            min_sites = num_sites
            min_state = s
        if num_sites > max_sites:
            max_sites = num_sites
            max_state = s
    print("TOTAL STATES => %u" % (len(state.keys())))
    print("MAX SITES STATE %s SITES %u" % (max_state, max_sites))
    print("MIN SITES STATE %s SITES %u" % (min_state, min_sites))
    '''

    site_list_of_lists = []
    for s in state:
        site_list_of_lists.append(state[s])

    # Distribute the states evenly in the list
    zipped_list = evenly_spaced(site_list_of_lists)

    # Break the list into chunks of SITES_PER_JOB sites
    run_id = 1
    for i in range(0, len(zipped_list), SITES_PER_JOB):
        split_list = zipped_list[i:i + SITES_PER_JOB]
        #print_state_count(split_list, site)
        # Move parquet files to the RUNx folder
        move_parquet(split_list, run_id)
        # Run the spark job on the RUNx folder files
        #spark_run_split_job(run_id)
        run_id = run_id + 1
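# `evenly_spaced` is not defined in this file. Below is a minimal sketch
# of one plausible implementation, assuming the intent is a round-robin
# interleave of the per-state site lists so that any contiguous slice of
# the result draws sites from many different states. The body is an
# assumption, not the original code.
from itertools import chain, zip_longest

def evenly_spaced(list_of_lists):
    # Take one element from each list in turn; exhausted lists are
    # padded with a sentinel that is filtered back out.
    _PAD = object()
    interleaved = chain.from_iterable(zip_longest(*list_of_lists, fillvalue=_PAD))
    return [x for x in interleaved if x is not _PAD]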
def load_all():
    def load_macros(site):
        for m in db.get_all_macros(site):
            _load_macro(m, lazy=True)

    def load_templates(site):
        for t in db.get_all_templates(site):
            _load_template(t, lazy=True)

    for site in db.get_all_sites():
        context.site = site
        load_macros(site)
        load_templates(site)
import threading
import time

def spark_split_jobs():
    # Map each state to its list of site ids, and keep a reverse
    # index from site_id back to the full site record.
    state = {}
    site = {}
    db = get_all_sites()
    print("TOTAL SITES %u" % len(db))
    for site_info in db:
        # Accumulate site ids by state
        if site_info['state'] not in state:
            state[site_info['state']] = [site_info['site_id']]
        else:
            state[site_info['state']].append(site_info['site_id'])
        # Reverse index by site_id
        if site_info['site_id'] not in site:
            site[site_info['site_id']] = site_info

    site_list_of_lists = []
    for s in state:
        site_list_of_lists.append(state[s])

    # Distribute the states evenly in the list
    zipped_list = evenly_spaced(site_list_of_lists)

    # Copy the second batch (run 2) synchronously first
    run_id = 2
    split_list = zipped_list[5000:10000]
    move_parquet(split_list, run_id)
    run_id = run_id + 1

    # Move the remaining chunks in parallel, one thread per run
    threads = []
    for i in range(0, len(zipped_list), SITES_PER_JOB):
        # Skip the first chunk (only i == 0 can be < 2)
        if i < 2:
            continue
        split_list = zipped_list[i:i + SITES_PER_JOB]
        #print_state_count(split_list, site)
        # Move parquet files to the RUNx folder
        t = threading.Thread(target=move_parquet, args=(split_list, run_id))
        t.start()
        time.sleep(1)
        threads.append(t)
        #move_parquet(split_list, run_id)
        # Run the spark job on the RUNx folder files
        run_id = run_id + 1

    for t in threads:
        t.join()
    print("ALL FILES COPIED!")
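# `move_parquet`, `get_all_sites`, and SITES_PER_JOB are also defined
# elsewhere. Below is a hypothetical sketch of `move_parquet`, assuming
# one parquet file per site named <site_id>.parquet under SRC_DIR and a
# per-run RUN<run_id> destination folder; all paths and names here are
# illustrative assumptions, not taken from the original source.
import os
import shutil

SRC_DIR = "/data/parquet"       # assumed location of per-site parquet files
DEST_ROOT = "/data/spark_runs"  # assumed root for per-run input folders

def move_parquet(site_ids, run_id):
    dest = os.path.join(DEST_ROOT, "RUN%d" % run_id)
    os.makedirs(dest, exist_ok=True)
    for site_id in site_ids:
        src = os.path.join(SRC_DIR, "%s.parquet" % site_id)
        if os.path.exists(src):
            shutil.move(src, os.path.join(dest, os.path.basename(src)))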