def open_desc(tfile):
    """Collect every description record stored in the database at *tfile*.

    The database is opened with ``utils.open_cdb`` and its keys are listed
    with ``utils.get_all_keys``.  For each key whose name ends in ``"desc"``
    the stored value is decoded as UTF-8 and parsed as JSON.

    Returns:
        A list of ``(parsed_json, category)`` tuples, where ``category`` is
        the value stored under the ``"description"`` key and is the same
        object in every tuple.
    """
    database = utils.open_cdb(tfile)
    all_keys = utils.get_all_keys(database)
    # NOTE(review): this lookup uses a str key while every other lookup in
    # this function uses a UTF-8 encoded bytes key -- confirm it is intended.
    category = database["description"]
    return [
        (json.loads(str(database[bytes(name, 'utf-8')], 'utf-8')), category)
        for name in all_keys
        if name.endswith("desc")
    ]
def score_cdb(tfile, cats, keywords, cl):
    """Score every description record in the database at *tfile*.

    For each key ending in ``"desc"`` the stored value is decoded as UTF-8
    once and then scored twice: ``check_categories`` receives the parsed JSON
    and ``check_keywords`` receives the raw decoded text.  The two results
    are added (the category result is converted with ``int`` first).

    Args:
        tfile: Database location, passed straight to ``utils.open_cdb``.
        cats: Forwarded to ``check_categories`` as its second argument.
        keywords: Forwarded to ``check_keywords`` as its first argument.
        cl: Forwarded to ``check_categories`` as its first argument.

    Returns:
        A dict mapping the value stored under the matching ``...url`` key
        (the ``desc`` suffix replaced by ``url``) to the combined score.
    """
    cdb = utils.open_cdb(tfile)
    score = {}
    for key in utils.get_all_keys(cdb):
        if not key.endswith("desc"):
            continue
        # Fetch and decode the record once; both scorers use the same text.
        desc_text = str(cdb[bytes(key, 'utf-8')], 'utf-8')
        cati = check_categories(cl, cats, json.loads(desc_text))
        keywi = check_keywords(keywords, desc_text)
        # "<prefix>desc" -> "<prefix>url": strip the 4-character suffix.
        url = cdb[bytes(key[:-4] + "url", 'utf-8')]
        score[url] = int(cati) + keywi
        print("VAL:", cati, keywi)
    return score
def _collect_text_pages(cdb, neo_dict):
    """Add every textual page found under an ``http...`` key of *cdb* to *neo_dict*."""
    # One detector is enough for the whole database; building it per key
    # repeats the same setup work for no benefit.
    detector = magic.Magic(mime=True)
    for key in utils.get_all_keys(cdb):
        if not key.startswith("http"):
            continue
        raw = cdb[bytes(key, 'utf-8')]
        scan = detector.from_buffer(raw)
        print(scan)
        if not scan.startswith('text'):
            continue
        print(key)
        # Decode once and reuse the text for both cleaning and title lookup.
        html = str(raw, 'utf-8')
        page = clean_html(html)
        if len(page) > 18:
            title = find_title(html)
            neo_dict[title] = page
            neo_dict["url-" + title] = key


def agg_text(places):
    """Aggregate cleaned page text from several databases.

    Each entry of *places* is opened with ``utils.open_cdb``.  Values stored
    under keys starting with ``"http"`` whose detected MIME type starts with
    ``"text"`` are decoded as UTF-8 and passed through ``clean_html``; results
    longer than 18 characters are kept.

    Processing is best-effort: any ordinary error while handling a database
    abandons the rest of that database (entries already collected from it are
    kept), is reported on stdout, and the next database is tried.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.

    Args:
        places: Iterable of database locations.

    Returns:
        A dict with two entries per kept page: ``title -> cleaned text`` and
        ``"url-" + title -> key``, where ``title`` comes from ``find_title``.
    """
    neo_dict = {}
    for ffile in places:
        print(ffile)
        try:
            cdb = utils.open_cdb(ffile)
            try:
                _collect_text_pages(cdb, neo_dict)
            finally:
                # Release the handle even when a page fails half-way through.
                cdb.close()
        except Exception as exc:
            print("skipping", ffile, "-", repr(exc))
    return neo_dict
def process_event(event):
    """Randomly inject a fault into *event* and return the result.

    With a probability of ``config.config['fault_injection_rate_in_percent']``
    percent, one key returned by ``utils.get_all_keys(event)`` is chosen at
    random and either removed (``delete_keys_from_dict``) or altered
    (``modify_value_in_dict``); the kind of fault is chosen at random too.
    Otherwise, or when the event offers no keys, the event is returned as
    received.

    Args:
        event: The event to process.

    Returns:
        The value returned by the chosen fault helper, or *event* itself when
        no fault was injected.
    """
    logger.debug('processing event ')
    logger.debug(event)

    fault_rate = int(config.config['fault_injection_rate_in_percent'])
    # Draw from 1..100 (100 equally likely values) so that a rate of 0 never
    # injects and a rate of 100 always does.  Drawing from 0..100 would give
    # 101 values and inject roughly 1% of the time even at a rate of 0.
    inject_fault = random.randint(1, 100) <= fault_rate

    if not inject_fault:
        logger.info('did not modify event')
    else:
        possible_fields_for_modification = utils.get_all_keys(event)
        if len(possible_fields_for_modification) == 0:
            # Nothing to drop or change; picking a random key would raise.
            logger.info('did not modify event: no keys available')
        else:
            # Fault injection types: drop_key_value, change_value.
            selected_injection_type = random.choice(
                ['drop_key_value', 'change_value'])
            logger.debug('selected injection type: %s', selected_injection_type)
            logger.debug('possible keys for modification: ')
            logger.debug(possible_fields_for_modification)

            key_value_to_modify = random.choice(
                possible_fields_for_modification)
            if selected_injection_type == 'drop_key_value':
                event = delete_keys_from_dict(event, [key_value_to_modify])
            elif selected_injection_type == 'change_value':
                event = modify_value_in_dict(event, [key_value_to_modify])
            # Lazy %-formatting also copes with keys that are not strings.
            logger.info('run %s on %s',
                        selected_injection_type, key_value_to_modify)

    logger.debug('remaining event:')
    logger.debug(event)
    return event
import numpy as np

from utils import get_all_keys, get_existing_keys

existing_keys = get_existing_keys('keys_for_test')
existing_train_keys = get_existing_keys('keys_for_train')
all_test_keys = get_all_keys('/media/natasha/Data/Landmark Kaggle/test.csv')
print('all_test_keys ', len(all_test_keys))
print('existing_keys ', len(existing_keys))
# Wait for the user to press Enter before loading the neighbour matrix.
input()

neighbors = np.load('100_nearest_neighbors_resnet.npy')
print('neighbors', neighbors)


def get_neighbors(neighbors, existing_train_keys, existing_test_keys, test_key):
    """Return the train keys listed as neighbours of *test_key*.

    The row of *neighbors* is selected by the position of the first entry of
    *existing_test_keys* equal to ``str(test_key)``; that row is then used to
    index *existing_train_keys*.

    Assumes *existing_train_keys* and *existing_test_keys* are NumPy arrays
    (element-wise ``==`` and array indexing are used) -- TODO confirm against
    ``get_existing_keys``.

    Raises:
        IndexError: If *test_key* does not occur in *existing_test_keys*.
    """
    index = np.where(existing_test_keys == str(test_key))[0][0]
    neighbors_indices = np.array(neighbors[index])
    return existing_train_keys[neighbors_indices]


def get_dummy_neighbors(existing_train_keys):
    """Return a list of 100 train keys drawn uniformly at random (with repeats)."""
    # ``randint`` excludes the upper bound, so no ``- 1`` is needed; it
    # replaces the deprecated ``np.random.random_integers``.
    indices = np.random.randint(0, existing_train_keys.shape[0], size=100)
    return [existing_train_keys[i] for i in indices]
schema = json.load(f) except Exception as e: print("Error could not load :", schema_file, "\n", e) exit(-1) for config_file in dirs: params_in_file = set() parameters_not_in_schema = set() # print("Opening: ", config_file) with open(config_file, 'r+') as f: try: config = json.load(f) except Exception as e: print("Error could not load :", config_file, "\n", e) continue utils.get_all_keys(config, params_in_file) for param in params_in_file: if not utils.find_in_dict(schema, param): #print ("Could not find in schema: ", param) if "." not in param: parameters_not_in_schema.add(param) print() print(config_file, ":") #print("parameters in config file: ", params_in_file ) print( "parameters not in schema: ", parameters_not_in_schema.difference( known_params_not_in_schema))