def control(batch_enhanced, s):
    """Merge the new enhanced batch into the full S3 store, then rebuild the
    filtered label dict.

    NOTE(review): the label dict is built from the PRE-merge store
    (batch_enhanced_full), not the merged result — confirm this is intended.
    """
    batch_enhanced_full = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        seed=[],
        directory=s["s3dir"])
    print("Length of enhanced pre batch: " + str(len(batch_enhanced_full)))
    merged = add_batch(batch_enhanced_full, batch_enhanced)
    utils.write_to_s3(
        merged,
        utils.file_name(sufix="_batch_enhanced_full"),
        directory=s["s3dir"])
    return make_filtered_label_dict(batch_enhanced_full)
def save(self):
    """Dump this config's attributes to <model_dir>/<config-name>_config.yml.

    Data-file paths are converted to absolute paths before dumping so the
    saved config is usable from any working directory; an unset
    pretrain_data is removed from the dump entirely.
    """
    hparams_file = os.path.join(
        self.model_dir, "{}_config.yml".format(file_name(self.config)))
    print_out(" saving config to %s" % hparams_file)
    to_dump_dict = dict(self.__dict__)
    # These four keys share the same "absolutize if set" treatment; the
    # original repeated the if/abspath stanza once per key.
    for key in ('train_data', 'test_data', 'dev_data', 'vocab_file'):
        if to_dump_dict[key]:
            to_dump_dict[key] = os.path.abspath(to_dump_dict[key])
    if to_dump_dict['pretrain_data']:
        to_dump_dict['pretrain_data'] = os.path.abspath(
            to_dump_dict['pretrain_data'])
    else:
        # Unlike the keys above, an unset pretrain_data is dropped, not kept.
        to_dump_dict.pop('pretrain_data')
    with codecs.getwriter("utf-8")(open(hparams_file, "wb")) as f:
        yaml.dump(to_dump_dict, f, default_flow_style=False)
def change_png2eps(root='./result/figures'):
    """Re-save every image listed under *root* as a 500-dpi EPS file named
    test_<stem>.eps in the same directory.

    `file_name(root)[0]` presumably returns the directory's file names —
    TODO confirm against the project helper.
    """
    files = file_name(root)[0]
    for index, name in enumerate(files):
        img = plt.imread('{}/{}'.format(root, name))
        # rsplit on the LAST dot strips only the extension. The original
        # `name.find('.', 0)` cut at the first dot (mangling "a.b.png" to
        # "a") and returned -1 for extension-less names, silently chopping
        # the final character via name[:-1].
        f_name = name.rsplit('.', 1)[0]
        plt.imsave('{}/test_{}.eps'.format(root, f_name), img,
                   format='eps', dpi=500)
def control(filtered_label_dict, s):
    """Enrich the filtered label dict with user info and persist it to S3."""
    users, label_dict = update_user_info(filtered_label_dict, s)
    label_dict = update_dict_with_user_info(label_dict, users)
    utils.write_to_s3(
        label_dict,
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=s["s3dir"])
    return label_dict
def get_library_list():
    """Get library actionlists

    Lazily loads every *.phatch file under PHATCH_ACTIONLISTS_PATH once and
    caches the result in the module-level __LIBRARY_ACTIONLISTS dict,
    keyed by file name.
    """
    global __LIBRARY_ACTIONLISTS
    if __LIBRARY_ACTIONLISTS:
        return __LIBRARY_ACTIONLISTS
    pattern = os.path.join(config.PHATCH_ACTIONLISTS_PATH, '*.phatch')
    actionlists = {}
    for actionlist_file in glob.glob(pattern):
        # The original `open(path).read()` leaked one file handle per
        # actionlist; the with-block closes it deterministically.
        with open(actionlist_file) as f:
            source = f.read()
        actionlists[utils.file_name(actionlist_file)] = eval_safe(source)
    __LIBRARY_ACTIONLISTS = actionlists
    return __LIBRARY_ACTIONLISTS
def control(batch_enhanced, s):
    """Load the accumulated enhanced batch from S3 and build the filtered
    label dict from it.

    The batch_enhanced argument is currently unused here (merging into the
    store is handled elsewhere); the signature matches the other pipeline
    control() entry points.
    """
    batch_enhanced_full = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        seed=[],
        directory=s["s3dir"])
    print("Length of enhanced pre batch: " + str(len(batch_enhanced_full)))
    return make_filtered_label_dict(batch_enhanced_full)
def get_library_list():
    """Get library actionlists

    Loads every *.phatch actionlist once, caching the name -> parsed-content
    mapping in __LIBRARY_ACTIONLISTS.
    """
    global __LIBRARY_ACTIONLISTS
    if __LIBRARY_ACTIONLISTS:
        return __LIBRARY_ACTIONLISTS
    pattern = os.path.join(config.PHATCH_ACTIONLISTS_PATH, '*.phatch')
    library = {}
    for actionlist_file in glob.glob(pattern):
        # open().read() without a close leaked a handle per actionlist;
        # read inside a with-block instead.
        with open(actionlist_file) as handle:
            text = handle.read()
        library[utils.file_name(actionlist_file)] = eval_safe(text)
    __LIBRARY_ACTIONLISTS = library
    return __LIBRARY_ACTIONLISTS
def get_actions():
    """Get phatch actions

    Returns a dictionary that maps action names to Action objects,
    importing each actions.<name> module on first call and caching the
    result in __ACTIONS. Defaults from get_defaults() are applied on top.
    """
    global __ACTIONS
    if __ACTIONS:
        return __ACTIONS
    action_names = [
        utils.file_name(action_file)
        for action_file in glob.glob(
            os.path.join(config.PHATCH_ACTIONS_PATH, '*.py'))]
    default_values = get_defaults()
    __ACTIONS = dict(
        (name,
         __import__('actions.%s' % name, name,
                    fromlist=['actions']).Action())
        for name in action_names if name != '__init__')
    # .items() behaves the same on Python 2 and 3; .iteritems() is
    # Python-2-only and raises AttributeError on 3.
    for name, fields in default_values.items():
        set_action_fields(__ACTIONS[name], fields)
    return __ACTIONS
def execute_actionlists(input, actionlists=None, options=''):
    """Execute a list of actionlists on input path.

    If no actionlist was given all actionlists will be executed.
    Returns the names of the actionlists that failed.
    (`input` shadows the builtin, but renaming it would break callers.)
    """
    errors = []
    if not actionlists:
        actionlists = dict(
            (utils.file_name(path),
             os.path.join(config.OUT_ACTIONLISTS_PATH, path))
            for path in os.listdir(config.OUT_ACTIONLISTS_PATH))
    total = len(actionlists)
    for i, name in enumerate(sorted(actionlists)):
        # \r keeps the progress report on a single console line.
        sys.stdout.write('\rRunning %s/%s %s'
                         % (i + 1, total, name[:50].ljust(50)))
        sys.stdout.flush()
        if not execute_actionlist(input, actionlists[name], options):
            errors.append(name)
    # The original ended with a bare ``print`` — a Python-2 statement that
    # is a silent no-op expression under Python 3. Emit the terminating
    # newline explicitly so the progress line is closed on both.
    sys.stdout.write('\n')
    return errors
def assemble_segements(file_path):
    """Concatenate numbered segment files (<file_path>.0, .1, ...) back into
    file_path, deleting each segment after it is appended.
    """
    utils.log("assembling segements ... ")
    file_list = os.listdir(utils.dir_name(file_path))
    # Count how many segments exist. re.escape the base name and the dot:
    # the original pattern left both unescaped, so "." matched ANY character
    # and metacharacters in the file name could distort the count.
    seg_pattern = re.escape(utils.file_name(file_path)) + r"\.\d+"
    num_file = 0
    for fn in file_list:
        if re.findall(seg_pattern, fn):
            num_file = num_file + 1
    # with-blocks close the handles even if a read/write raises; the
    # original left every segment file (and fp on error) unclosed.
    with open(file_path, 'wb') as fp:
        for i in range(num_file):
            fn = file_path + '.' + str(i)
            with open(fn, 'rb') as f:
                fp.write(f.read())
            os.remove(fn)
    utils.log("finished assembling segements")
def get_actions():
    """Get phatch actions

    Returns a dictionary that maps action names to Action objects.
    The mapping is built once from actions/*.py and cached in __ACTIONS;
    defaults from get_defaults() are then applied.
    """
    global __ACTIONS
    if __ACTIONS:
        return __ACTIONS
    action_names = [
        utils.file_name(action_file)
        for action_file in glob.glob(
            os.path.join(config.PHATCH_ACTIONS_PATH, '*.py'))]
    default_values = get_defaults()
    __ACTIONS = dict(
        (name,
         __import__('actions.%s' % name, name,
                    fromlist=['actions']).Action())
        for name in action_names if name != '__init__')
    # iteritems() exists only on Python 2; items() works on both.
    for name, fields in default_values.items():
        set_action_fields(__ACTIONS[name], fields)
    return __ACTIONS
def execute_actionlists(input, actionlists=None, options=''):
    """Execute a list of actionlists on input path.

    If no actionlist was given all actionlists will be executed.
    Returns the names of the actionlists that failed.
    """
    errors = []
    if not actionlists:
        actionlists = dict(
            (utils.file_name(path),
             os.path.join(config.OUT_ACTIONLISTS_PATH, path))
            for path in os.listdir(config.OUT_ACTIONLISTS_PATH))
    total = len(actionlists)
    for i, name in enumerate(sorted(actionlists)):
        sys.stdout.write(
            '\rRunning %s/%s %s' % (i + 1, total, name[:50].ljust(50)))
        sys.stdout.flush()
        if not execute_actionlist(input, actionlists[name], options):
            errors.append(name)
    # Bare ``print`` is Python-2-only (a no-op expression on Python 3);
    # write the closing newline for the \r progress line explicitly.
    sys.stdout.write('\n')
    return errors
# all_users_lookup = [u for u in all_users if u not in user_data] ## Cache code## user_data, all_users_lookup = utils.get_from_cache_m( all_users, "user_data") utils.log(len(all_users), "Number total users in set: ") utils.log(len(all_users_lookup), "Number users needing lookup: ") if len(all_users_lookup) > 0: user_chunks = make_user_chunks(all_users_lookup, 100) for this_lookup in user_chunks: user_dict_lookup = do_user_lookup(this_lookup, s) user_data.update(user_dict_lookup) return user_data, filtered_label_dict def control(filtered_label_dict, s): user_data, filtered_label_dict = update_user_info(filtered_label_dict, s) filtered_label_dict = update_dict_with_user_info(filtered_label_dict, user_data) utils.write_to_s3( filtered_label_dict, utils.file_name(prefix='_batch_filt_label_dict_enhanced_'), directory=s["s3dir"]) return filtered_label_dict if __name__ == "__main__": sd = utils.getDefaultSettings() filtered_label_dict = utils.read_from_s3( utils.file_name(prefix='_batch_filt_label_dict_enhanced_'), directory=sd["s3dir"]) control(filtered_label_dict, sd)
dt[key]["statuses"].append(value) else: dt[key] = {"statuses":[value]} return None def make_filtered_label_dict(batch_enhanced, label_dict, threshhold = 1): processed_list = set() for status in batch_enhanced: norm_labels = status["satellite_enhanced"]["combined_labels"] if len(norm_labels) > 0: # for label in norm_labels[0]: # Experimental: Choose first one uate_dict_nodup(label_dict, norm_labels[0], status) for l, v in label_dict.items(): label_dict[l]["count"] = len(v) for s in label_dict[l]["statuses"]: processed_list.add(int(s["id"])) return label_dict, list(processed_list) def control(batch_enhanced,s): filtered_label_dict = utils.read_from_s3(utils.file_name( prefix = "_batch_filt_label_dict_enhanced_fld"), directory=s["s3dir"]) filtered_label_dict, processed_list = make_filtered_label_dict(batch_enhanced, filtered_label_dict) utils.write_to_s3(filtered_label_dict, utils.file_name( prefix = "_batch_filt_label_dict_enhanced_fld"), directory=s["s3dir"]) utils.write_to_s3(processed_list, utils.file_name( sufix = "_processed_list_fld"), directory=s["s3dir"]) return filtered_label_dict if __name__ == "__main__": sd = utils.getDefaultSettings() batch_enhanced = utils.read_from_s3(utils.file_name( sufix = "_batch_enhanced_d"), directory=sd["s3dir"]) # statuses_enhanced = utils.read_from_s3(utils.file_name( sufix = "_batch_enhanced_full"), seed=[], directory=sd["s3dir"]) control(batch_enhanced, sd)
max_embed = this_embed #for links, add to "labels" so we can do the short url processing after #for refs, create entry "labels_proc so we preserve the original ref and can store the quoted status. root["satellite_enhanced"]["labels"]["quoted_labels_links_deep"] = max_url root["satellite_enhanced"]["labels"]["quoted_labels_twrefs_deep"] = max_embed if max_embed != None and max_embed != []: root["satellite_enhanced"]["labels"]["quoted_labels_twrefs_deep_status"] = qt else: existing_cnt += 1 utils.log(existing_cnt, "Existing Count: ") return statuses_enhanced def control(batch_enhanced, s): utils.log("", "Starting deep trace") # batch_enhanced = utils.read_from_s3(utils.file_name( sufix = "_enhanced"), directory="data-aws/gen_two/") batch_enhanced = trace_links_down(batch_enhanced, s) # utils.write_to_s3( # batch_enhanced, # utils.file_name(batch_enhanced, sufix="_batch_enhanced_c"), # directory=s["s3dir"]) return batch_enhanced if __name__ == "__main__": sd = utils.getDefaultSettings() batch_enhanced = utils.read_from_s3( utils.file_name(sufix="_batch_enhanced"), directory=sd["s3dir"]) control(batch_enhanced, sd)
def filter_on_day(stat):
    """Return True when the status was created on the current local day key."""
    current_day_key = utils.file_date()
    dd = utils.make_local(stat["created_at"])
    key = str(dd.day) + "-" + str(dd.month) + "-" + str(dd.year)
    return key == current_day_key


def enhance(batch_enhanced):
    """Attach a `satellite_enhanced` record to each status that yields at
    least one combined label, and return only those statuses."""
    filtered_enhanced_batch = []
    utils.log(len(batch_enhanced), "Number batch statuses: ")
    for stat in batch_enhanced:
        enhanced = {
            "created_at_tz": add_time_zone_date(stat),
            "labels": get_combined_labels(stat),
        }
        # Keep the status only if any label bucket is non-empty.
        total_refs = sum(len(val) for val in enhanced["labels"].values())
        if total_refs > 0:
            stat["satellite_enhanced"] = enhanced
            filtered_enhanced_batch.append(stat)
    utils.log(len(filtered_enhanced_batch), "Number enhanced batch statuses: ")
    return filtered_enhanced_batch


def control(date_filtered_batch, s):
    """Pipeline entry point: enhance the date-filtered batch."""
    return enhance(date_filtered_batch)


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    date_filtered_batch = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        directory=sd["s3dir"])
    control(date_filtered_batch, sd)
def control(batch_enhanced, s):
    """Fold a new enhanced batch into the persisted filtered-label dict on
    S3, persist the updated dict, and record the processed status ids."""
    filtered_label_dict = utils.read_from_s3(
        utils.file_name(prefix="_batch_filt_label_dict_enhanced_fld"),
        directory=s["s3dir"])
    filtered_label_dict, processed_list = make_filtered_label_dict(
        batch_enhanced, filtered_label_dict)
    utils.write_to_s3(
        filtered_label_dict,
        utils.file_name(prefix="_batch_filt_label_dict_enhanced_fld"),
        directory=s["s3dir"])
    utils.write_to_s3(
        processed_list,
        utils.file_name(sufix="_processed_list_fld"),
        directory=s["s3dir"])
    return filtered_label_dict