def process_blocklists(db_file):
    """ prompt for and process blocklists """
    source = inquirer.ask_blocklist()

    import_list = []

    if source in blockLists:
        url_source = blockLists[source]
        resp = requests.get(url_source["url"])
        import_list = utils.process_lines(resp.text, url_source["comment"])

    if source == constants.FILE:
        fname = inquirer.ask_import_file()
        # read the file contents and make sure the handle is closed
        with open(fname) as import_file:
            import_list = utils.process_lines(import_file.read(), f"File: {fname}")

    if source == constants.PASTE:
        import_list = inquirer.ask_paste()
        import_list = utils.process_lines(import_list, "Pasted content")

    if len(import_list) == 0:
        utils.die("No valid urls found, try again")

    if not inquirer.confirm(f"Add {len(import_list)} block lists to {db_file}?"):
        utils.warn("Nothing changed. Bye!")
        sys.exit(0)

    conn = sqlite3.connect(db_file)
    sqldb = conn.cursor()
    added = 0
    exists = 0
    for item in import_list:
        sqldb.execute("SELECT COUNT(*) FROM adlist WHERE address = ?", (item["url"],))
        cnt = sqldb.fetchone()
        if cnt[0] > 0:
            exists += 1
        else:
            added += 1
            vals = (item["url"], item["comment"])
            sqldb.execute(
                "INSERT OR IGNORE INTO adlist (address, comment) VALUES (?,?)", vals
            )
            conn.commit()

    sqldb.close()
    conn.close()

    utils.success(f"{added} block lists added! {exists} already existed.")
def test_process_lines_full_url(self):
    comment = "MyComment"
    new_list = utils.process_lines(
        """
http://google.com
invalid
http://github.com
""",
        comment,
        True,
    )
    assert len(new_list) == 2
    assert new_list[1]["url"] == "http://github.com"
    assert new_list[1]["comment"] == comment
def test_process_lines_any(self):
    comment = "MyComment"
    new_list = utils.process_lines(
        """
github
github.com
http://github.com
http://github.com/test
http://github.com/test?f08s
""",
        comment,
        True,
    )
    assert len(new_list) == 3
    # assert new_list[1]["url"] == "http://github.com"
    assert new_list[1]["comment"] == comment
def process_whitelists(db_file):
    """ prompt for and process whitelists """
    source = inquirer.ask_whitelist()

    import_list = []

    if source in whiteLists:
        url_source = whiteLists[source]
        resp = requests.get(url_source['url'])
        import_list = utils.process_lines(resp.text, url_source['comment'], False)
        # This breaks if we add a new whitelist setup
        if source != ANUDEEP_WHITELIST:
            resp = requests.get(ANUDEEP_WHITELIST)
            import_list += utils.process_lines(resp.text, url_source['comment'], False)

    if source == constants.FILE:
        fname = inquirer.ask_import_file()
        # read the file contents and make sure the handle is closed
        with open(fname) as import_file:
            import_list = utils.process_lines(import_file.read(), f'File: {fname}', False)

    if source == constants.PASTE:
        import_list = inquirer.ask_paste()
        import_list = utils.process_lines(import_list, 'Pasted content', utils.validate_host)

    if len(import_list) == 0:
        utils.die('No valid urls found, try again')

    if not inquirer.confirm(f'Add {len(import_list)} white lists to {db_file}?'):
        utils.warn('Nothing changed. Bye!')
        sys.exit(0)

    conn = sqlite3.connect(db_file)
    sqldb = conn.cursor()
    added = 0
    exists = 0
    for item in import_list:
        sqldb.execute("SELECT COUNT(*) FROM domainlist WHERE domain = ?", (item['url'],))
        cnt = sqldb.fetchone()
        if cnt[0] > 0:
            exists += 1
        else:
            # 0 = exact whitelist
            # 2 = regex whitelist
            domain_type = 0
            if item['type'] == constants.REGEX:
                domain_type = 2
            vals = (item['url'], domain_type, item['comment'])
            sqldb.execute(
                'INSERT OR IGNORE INTO domainlist (domain, type, comment) VALUES (?,?,?)',
                vals)
            conn.commit()
            added += 1

    sqldb.close()
    conn.close()

    utils.success(f'{added} whitelists added! {exists} already existed.')
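# Sketch, not part of the tool: a minimal throwaway database for exercising
# process_blocklists() / process_whitelists() above. The column set is an
# assumption inferred from the SQL in those two functions, not Pi-hole's full
# gravity.db schema.
import sqlite3

def make_test_db(path="test_gravity.db"):
    """Create the two tables the functions above read and write."""
    conn = sqlite3.connect(path)
    conn.execute("CREATE TABLE IF NOT EXISTS adlist "
                 "(id INTEGER PRIMARY KEY, address TEXT UNIQUE, comment TEXT)")
    conn.execute("CREATE TABLE IF NOT EXISTS domainlist "
                 "(id INTEGER PRIMARY KEY, domain TEXT, "
                 "type INTEGER NOT NULL DEFAULT 0, comment TEXT)")
    conn.commit()
    conn.close()
    return path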
def test_process_lines_empty(self):
    new_list = utils.process_lines("", "", True)
    assert len(new_list) == 0
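# Sketch only: the tests above pin down the contract of utils.process_lines
# (text, comment, full_url) -> list of {"url", "comment"} dicts, keeping a
# line only when it parses as a full URL if full_url is True. The real
# implementation lives in utils.py (and also attaches a 'type' field used by
# process_whitelists); this minimal version is an assumption that merely
# satisfies the tests shown here.
from urllib.parse import urlparse

def process_lines_sketch(text, comment, full_url):
    result = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        parsed = urlparse(line)
        if full_url and not (parsed.scheme and parsed.netloc):
            continue  # e.g. "invalid" or "github.com" are rejected
        result.append({"url": line, "comment": comment})
    return result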
def train(model, name):
    history_score = []
    start_time = time.time()
    print 'epochs\tloss\ttrain-auc\teval-auc\ttime'
    sys.stdout.flush()
    for i in range(num_round):
        fetches = [model.optimizer, model.loss]
        if batch_size > 0:
            ls = []
            f = open(train_file, 'r')
            while True:
                lines_gen = list(islice(f, batch_size * bb))
                if not lines_gen:
                    break
                for ib in range(bb):
                    X_i, y_i = utils.slice(
                        utils.process_lines(lines_gen[batch_size * ib:batch_size * (ib + 1)]),
                        0, -1)
                    _, l = model.run(fetches, X_i, y_i)
                    ls.append(l)
        elif batch_size == -1:
            pass
            """
            X_i, y_i = utils.slice(train_data)
            _, l = model.run(fetches, X_i, y_i)
            ls = [l]
            """
        lst_train_pred = []
        lst_test_pred = []
        if batch_size > 0:
            f = open(train_file, 'r')
            while True:
                lines_gen = list(islice(f, batch_size * bb))
                if not lines_gen:
                    break
                for ib in range(bb):
                    X_i, y_i = utils.slice(
                        utils.process_lines(lines_gen[batch_size * ib:batch_size * (ib + 1)]),
                        0, -1)
                    _train_preds = model.run(model.y_prob, X_i)
                    lst_train_pred.append(_train_preds)
            """
            for j in range(train_size / batch_size + 1):
                X_i, y_i = utils.slice(train_data, j * batch_size, batch_size)
                #X_i = utils.libsvm_2_coo(X_i, (len(X_i), input_dim)).tocsr()
                _train_preds = model.run(model.y_prob, X_i)
                lst_train_pred.append(_train_preds)
            """
            f = open(test_file, 'r')
            while True:
                lines_gen = list(islice(f, batch_size * bb))
                if not lines_gen:
                    break
                for ib in range(bb):
                    X_i, y_i = utils.slice(
                        utils.process_lines(lines_gen[batch_size * ib:batch_size * (ib + 1)]),
                        0, -1)
                    _test_preds = model.run(model.y_prob, X_i)
                    lst_test_pred.append(_test_preds)
            """
            for j in range(test_size / batch_size + 1):
                X_i, y_i = utils.slice(test_data, j * batch_size, batch_size)
                #X_i = utils.libsvm_2_coo(X_i, (len(X_i), input_dim)).tocsr()
                _test_preds = model.run(model.y_prob, X_i)
                lst_test_pred.append(_test_preds)
            """
        train_preds = np.concatenate(lst_train_pred)
        test_preds = np.concatenate(lst_test_pred)
        train_score = roc_auc_score(train_label, train_preds)
        test_score = roc_auc_score(test_label, test_preds)
        print '%d\t%f\t%f\t%f\t%f\t%s' % (
            i, np.mean(ls), train_score, test_score,
            time.time() - start_time, strftime("%Y-%m-%d %H:%M:%S", gmtime()))
        path_model = 'model/' + str(name) + '_epoch_' + str(i)
        path_label_score = 'model/label_score_' + str(name) + '_epoch_' + str(i)
        #model.dump(path_model)
        d_label_score = {}
        d_label_score['label'] = test_label
        d_label_score['score'] = test_preds
        #pkl.dump(d_label_score, open(path_label_score, 'wb'))
        sys.stdout.flush()
        history_score.append(test_score)
        if i > min_round and i > early_stop_round:
            #if np.argmax(history_score) == i - early_stop_round and history_score[-1] - history_score[
            #        -1 * early_stop_round] < 1e-5:
            i_max = np.argmax(history_score)
            if i - i_max >= early_stop_round:
                print 'early stop\nbest iteration:\n[%d]\teval-auc: %f' % (
                    np.argmax(history_score), np.max(history_score))
                sys.stdout.flush()
                break
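# Sketch: the early-stop rule used in the train() variants here, factored out
# for clarity. Training stops once the best validation score is at least
# early_stop_round rounds old. The parameter names mirror the globals used in
# train(); this helper itself is not part of the original code.
import numpy as np

def should_early_stop(history_score, i, min_round, early_stop_round):
    # never stop before the warm-up thresholds
    if i <= min_round or i <= early_stop_round:
        return False
    i_max = int(np.argmax(history_score))  # round with the best eval score so far
    return i - i_max >= early_stop_round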
def train(model, name, in_memory=True, flag_MTL=False):
    #builder = tf.saved_model.builder.SavedModelBuilder('model')
    global batch_size, time_run, time_read, time_process
    history_score = []
    best_score = -1
    best_epoch = -1
    start_time = time.time()
    print 'epochs\tloss\ttrain-auc\teval-auc\ttime'
    sys.stdout.flush()
    if in_memory:
        train_data = utils.read_data(path_train, INPUT_DIM)
        validation_data = utils.read_data(path_validation, INPUT_DIM)
        test_data = utils.read_data(path_test, INPUT_DIM)
        model_name = name.split('_')[0]
        if model_name in set(['lr', 'fm']):
            train_data_tmp = utils.split_data(train_data, FIELD_OFFSETS)
            validation_data_tmp = utils.split_data(validation_data, FIELD_OFFSETS)
            test_data_tmp = utils.split_data(test_data, FIELD_OFFSETS)
        else:
            train_data = utils.split_data(train_data, FIELD_OFFSETS)
            validation_data = utils.split_data(validation_data, FIELD_OFFSETS)
            test_data = utils.split_data(test_data, FIELD_OFFSETS)
    for i in range(num_round):
        fetches = [model.optimizer, model.loss]
        if batch_size > 0:
            ls = []
            if in_memory:
                for j in range(train_size / batch_size + 1):
                    X_i, y_i = utils.slice(train_data, j * batch_size, batch_size)
                    _, l = model.run(fetches, X_i, y_i)
                    ls.append(l)
            else:
                f = open(path_train, 'r')
                lst_lines = []
                for line in f:
                    if len(lst_lines) < batch_size:
                        lst_lines.append(line)
                    else:
                        # type of X_i, X_i[0], X_i[0][0] is list, tuple and np.ndarray respectively.
                        X_i, y_i = utils.slice(
                            utils.process_lines(lst_lines, name, INPUT_DIM, FIELD_OFFSETS), 0, -1)
                        _, l = model.run(fetches, X_i, y_i)
                        ls.append(l)
                        lst_lines = [line]
                f.close()
                if len(lst_lines) > 0:
                    X_i, y_i = utils.slice(
                        utils.process_lines(lst_lines, name, INPUT_DIM, FIELD_OFFSETS), 0, -1)
                    _, l = model.run(fetches, X_i, y_i)
                    ls.append(l)
        elif batch_size == -1:
            pass
        model.dump('model/' + name + '_epoch_' + str(i))
        if in_memory:
            lst_train_preds = []
            lst_validation_preds = []
            lst_test_preds = []
            for j in range(train_size / batch_size + 1):
                X_i, y_i = utils.slice(train_data, j * batch_size, batch_size)
                p = model.run(model.y_prob, X_i, y_i)
                lst_train_preds.append(p)
            for j in range(validation_size / batch_size + 1):
                X_i, y_i = utils.slice(validation_data, j * batch_size, batch_size)
                p = model.run(model.y_prob, X_i, y_i)
                lst_validation_preds.append(p)
            for j in range(test_size / batch_size + 1):
                X_i, y_i = utils.slice(test_data, j * batch_size, batch_size)
                p = model.run(model.y_prob, X_i, y_i)
                lst_test_preds.append(p)
            train_preds = np.concatenate(lst_train_preds)
            validation_preds = np.concatenate(lst_validation_preds)
            test_preds = np.concatenate(lst_test_preds)
            #train_preds = model.run(model.y_prob, utils.slice(train_data)[0])
            #test_preds = model.run(model.y_prob, utils.slice(test_data)[0])
            train_score = roc_auc_score(train_data[1], train_preds)
            validation_score = roc_auc_score(validation_data[1], validation_preds)
            test_score = roc_auc_score(test_data[1], test_preds)
            train_score_sum = 0
            train_score_weight = 0
            validation_score_sum = 0
            validation_score_weight = 0
            test_score_sum = 0
            test_score_weight = 0
            #print '[%d]\tloss:%f\ttrain-auc: %f\teval-auc: %f' % (i, np.mean(ls), train_score, test_score)
            print '%d\t%f\t%f\t%f\t%f\t%f\t%s' % (
                i, np.mean(ls), train_score, validation_score, test_score,
                time.time() - start_time, strftime("%Y-%m-%d %H:%M:%S", gmtime()))
            if flag_MTL:
                d_index_task_label_pred_train = {}
                d_index_task_label_pred_validation = {}
                d_index_task_label_pred_test = {}
                if model_name in set(['lr', 'fm']):
                    index_task_train = train_data_tmp[0][-1].indices
                    index_task_validation = validation_data_tmp[0][-1].indices
                    index_task_test = test_data_tmp[0][-1].indices
                else:
                    index_task_train = train_data[0][-1].indices
                    index_task_validation = validation_data[0][-1].indices
                    index_task_test = test_data[0][-1].indices
                for index_tmp in range(len(index_task_train)):
                    index_task = index_task_train[index_tmp]
                    d_index_task_label_pred_train.setdefault(index_task, [[], []])
                    d_index_task_label_pred_train[index_task][0].append(train_data[1][index_tmp])
                    d_index_task_label_pred_train[index_task][1].append(train_preds[index_tmp])
                for index_task in sorted(list(set(index_task_train))):
                    auc = roc_auc_score(d_index_task_label_pred_train[index_task][0],
                                        d_index_task_label_pred_train[index_task][1])
                    num_samples = len(d_index_task_label_pred_train[index_task][0])
                    train_score_sum += auc * num_samples
                    train_score_weight += num_samples
                    print 'train, index_type: %d, number of samples: %d, AUC: %f' % (
                        index_task, num_samples, auc)
                for index_tmp in range(len(index_task_validation)):
                    index_task = index_task_validation[index_tmp]
                    d_index_task_label_pred_validation.setdefault(index_task, [[], []])
                    d_index_task_label_pred_validation[index_task][0].append(validation_data[1][index_tmp])
                    d_index_task_label_pred_validation[index_task][1].append(validation_preds[index_tmp])
                for index_task in sorted(list(set(index_task_validation))):
                    auc = roc_auc_score(d_index_task_label_pred_validation[index_task][0],
                                        d_index_task_label_pred_validation[index_task][1])
                    num_samples = len(d_index_task_label_pred_validation[index_task][0])
                    validation_score_sum += auc * num_samples
                    validation_score_weight += num_samples
                    print 'validation, index_type: %d, number of samples: %d, AUC: %f' % (
                        index_task, num_samples, auc)
                for index_tmp in range(len(index_task_test)):
                    index_task = index_task_test[index_tmp]
                    d_index_task_label_pred_test.setdefault(index_task, [[], []])
                    d_index_task_label_pred_test[index_task][0].append(test_data[1][index_tmp])
                    d_index_task_label_pred_test[index_task][1].append(test_preds[index_tmp])
                for index_task in sorted(list(set(index_task_test))):
                    auc = roc_auc_score(d_index_task_label_pred_test[index_task][0],
                                        d_index_task_label_pred_test[index_task][1])
                    num_samples = len(d_index_task_label_pred_test[index_task][0])
                    test_score_sum += auc * num_samples
                    test_score_weight += num_samples
                    print 'test, index_type: %d, number of samples: %d, AUC: %f' % (
                        index_task, num_samples, auc)
                weighted_train_score = train_score_sum / train_score_weight
                print 'weighted_train_score', weighted_train_score
                weighted_validation_score = validation_score_sum / validation_score_weight
                print 'weighted_validation_score', weighted_validation_score
                weighted_test_score = test_score_sum / test_score_weight
                print 'weighted_test_score', weighted_test_score
                history_score.append(weighted_validation_score)
                if weighted_validation_score < best_score and (i - best_epoch) >= 3:
                    break
                if weighted_validation_score > best_score:
                    best_score = weighted_validation_score
                    best_epoch = i
            sys.stdout.flush()
        else:
            lst_train_pred = []
            lst_test_pred = []
            if batch_size > 0:
                f = open(path_train, 'r')
                lst_lines = []
                for line in f:
                    if len(lst_lines) < batch_size:
                        lst_lines.append(line)
                    else:
                        X_i, y_i = utils.slice(
                            utils.process_lines(lst_lines, name, INPUT_DIM, FIELD_OFFSETS), 0, -1)
                        _train_preds = model.run(model.y_prob, X_i)
                        lst_train_pred.append(_train_preds)
                        lst_lines = [line]
                f.close()
                if len(lst_lines) > 0:
                    X_i, y_i = utils.slice(
                        utils.process_lines(lst_lines, name, INPUT_DIM, FIELD_OFFSETS), 0, -1)
                    _train_preds = model.run(model.y_prob, X_i)
                    lst_train_pred.append(_train_preds)
                f = open(path_test, 'r')
                lst_lines = []
                for line in f:
                    if len(lst_lines) < batch_size:
                        lst_lines.append(line)
                    else:
                        X_i, y_i = utils.slice(
                            utils.process_lines(lst_lines, name, INPUT_DIM, FIELD_OFFSETS), 0, -1)
                        _test_preds = model.run(model.y_prob, X_i)
                        lst_test_pred.append(_test_preds)
                        lst_lines = [line]
                f.close()
                if len(lst_lines) > 0:
                    X_i, y_i = utils.slice(
                        utils.process_lines(lst_lines, name, INPUT_DIM, FIELD_OFFSETS), 0, -1)
                    _test_preds = model.run(model.y_prob, X_i)
                    lst_test_pred.append(_test_preds)
            train_preds = np.concatenate(lst_train_pred)
            test_preds = np.concatenate(lst_test_pred)
            print 'np.shape(train_preds)', np.shape(train_preds)
            train_score = roc_auc_score(train_label, train_preds)
            test_score = roc_auc_score(test_label, test_preds)
            print '%d\t%f\t%f\t%f\t%f\t%s' % (
                i, np.mean(ls), train_score, test_score,
                time.time() - start_time, strftime("%Y-%m-%d %H:%M:%S", gmtime()))
            sys.stdout.flush()
def do_inference(hostport, test_data, concurrency, num_tests, batch_size):
    """Tests PredictionService with concurrent-batched requests.

    Args:
        hostport: Host:port address of the PredictionService.
        test_data: The full path to the test data set.
        concurrency: Maximum number of concurrent requests.
        num_tests: Number of test tensors to use.
        batch_size: Number of tests to include in each query.

    Returns:
        The results of the queries.

    Raises:
        IOError: An error occurred processing test data set.
    """
    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    result_counter = _ResultCounter(concurrency)
    num_field = 15
    with open(test_data, 'r') as f:
        data = f.read().split('\n')
    requests = []
    t0 = time.time()
    # Create batches of requests (num_batches and total_features come from module scope)
    for i in range(num_batches):
        data_batch = data[:batch_size]
        data = data[batch_size:]
        index_list = [[] for _ in range(num_field)]
        values_list = [[] for _ in range(num_field)]
        for j in range(batch_size):
            X, y = utils.slice(
                utils.process_lines([data_batch[j]], 'fwfm', INPUT_DIM, FIELD_OFFSETS), 0, -1)
            for idx in range(num_field):
                index_list[idx].append(X[idx][0].tolist())
                values_list[idx].append(1)
        requests.append(predict_pb2.PredictRequest())
        requests[i].model_spec.name = 'serve'
        requests[i].model_spec.signature_name = 'model'
        requests[i].output_filter.append('outputs')
        for idx in range(num_field):
            requests[i].inputs["field_" + str(idx) + "_values"].CopyFrom(
                tf.contrib.util.make_tensor_proto(
                    values_list[idx], shape=[len(values_list[idx])], dtype=tf.int64))
            requests[i].inputs["field_" + str(idx) + "_indices"].CopyFrom(
                tf.contrib.util.make_tensor_proto(
                    index_list[idx], shape=[len(index_list[idx]), 2], dtype=tf.float32))
            requests[i].inputs["field_" + str(idx) + "_dense_shape"].CopyFrom(
                tf.contrib.util.make_tensor_proto(
                    [batch_size, total_features], shape=[2], dtype=tf.int64))
    t1 = time.time()
    # Query server
    for i in range(num_batches):
        result_counter.throttle()
        result = stub.Predict.future(requests[i], 100.0)  # 100 secs timeout
        result.add_done_callback(_create_rpc_callback(i, result_counter))
    t2 = time.time()
    # Synchronize on completed queries
    result_counter.get_complete()
    t3 = time.time()
    full_results = []
    # 'results' is presumably a module-level container populated by the RPC callbacks
    for values in results:
        full_results.extend(values)
    print("Elapsed time for ", num_tests, " request creations: ", (t1 - t0))
    print("Elapsed time for ", num_batches, " batch submissions: ", (t2 - t1))
    print("Elapsed time for ", num_tests, " inferences: ", (t3 - t1))
    return full_results
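# Hypothetical invocation of do_inference(); the address, file path, and
# sizes below are placeholders for illustration, not values from the
# original script.
if __name__ == '__main__':
    scores = do_inference('localhost:9000', 'data/test.txt',
                          concurrency=10, num_tests=1000, batch_size=100)
    print('received %d predictions' % len(scores))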
def train(model, name):
    global batch_size
    global time_run
    global time_process
    history_score = []
    start_time = time.time()
    print 'epochs\tloss\ttrain-auc\teval-auc\ttime'
    sys.stdout.flush()
    for i in range(num_round):
        fetches = [model.optimizer, model.loss]
        if batch_size > 0:
            ls = []
            for j in range(train_size / batch_size + 1):
                start_process = time.time()
                X_i, y_i = utils.slice(train_data, j * batch_size, batch_size)
                time_process += time.time() - start_process
                start_run = time.time()
                _, l = model.run(fetches, X_i, y_i)
                time_run += time.time() - start_run
                ls.append(l)
            '''
            f = open(train_file, 'r')
            lst_lines = []
            for line in f:
                if len(lst_lines) < batch_size:
                    lst_lines.append(line)
                else:
                    X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)
                    print 'type(X_i)', type(X_i)
                    print 'type(X_i[0])', type(X_i[0])
                    _, l = model.run(fetches, X_i, y_i)
                    ls.append(l)
                    lst_lines = [line]
            f.close()
            if len(lst_lines) > 0:
                X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)
                _, l = model.run(fetches, X_i, y_i)
                ls.append(l)
            '''
            '''
            while True:
                lines_gen = list(islice(f, batch_size * bb))
                if not lines_gen:
                    break
                for ib in range(bb):
                    X_i, y_i = utils.slice(utils.process_lines(lines_gen[batch_size * ib:batch_size * (ib + 1)], name), 0, -1)
                    _, l = model.run(fetches, X_i, y_i)
                    ls.append(l)
            '''
        elif batch_size == -1:
            pass
            """
            X_i, y_i = utils.slice(train_data)
            _, l = model.run(fetches, X_i, y_i)
            ls = [l]
            """
        lst_train_pred = []
        lst_test_pred = []
        if batch_size > 0:
            f = open(train_file, 'r')
            lst_lines = []
            for line in f:
                if len(lst_lines) < batch_size:
                    lst_lines.append(line)
                else:
                    X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)
                    _train_preds = model.run(model.y_prob, X_i)
                    lst_train_pred.append(_train_preds)
                    lst_lines = [line]
            f.close()
            if len(lst_lines) > 0:
                X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)
                _train_preds = model.run(model.y_prob, X_i)
                lst_train_pred.append(_train_preds)
            '''
            while True:
                lines_gen = list(islice(f, batch_size * bb))
                if not lines_gen:
                    break
                for ib in range(bb):
                    X_i, y_i = utils.slice(utils.process_lines(lines_gen[batch_size * ib:batch_size * (ib + 1)], name), 0, -1)
                    _train_preds = model.run(model.y_prob, X_i)
                    lst_train_pred.append(_train_preds)
            '''
            """
            for j in range(train_size / batch_size + 1):
                X_i, y_i = utils.slice(train_data, j * batch_size, batch_size)
                #X_i = utils.libsvm_2_coo(X_i, (len(X_i), input_dim)).tocsr()
                _train_preds = model.run(model.y_prob, X_i)
                lst_train_pred.append(_train_preds)
            """
            f = open(test_file, 'r')
            lst_lines = []
            for line in f:
                if len(lst_lines) < batch_size:
                    lst_lines.append(line)
                else:
                    X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)
                    _test_preds = model.run(model.y_prob, X_i)
                    lst_test_pred.append(_test_preds)
                    lst_lines = [line]
            f.close()
            if len(lst_lines) > 0:
                X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)
                _test_preds = model.run(model.y_prob, X_i)
                lst_test_pred.append(_test_preds)
            '''
            while True:
                lines_gen = list(islice(f, batch_size * bb))
                if not lines_gen:
                    break
                for ib in range(bb):
                    X_i, y_i = utils.slice(utils.process_lines(lines_gen[batch_size * ib:batch_size * (ib + 1)], name), 0, -1)
                    _test_preds = model.run(model.y_prob, X_i)
                    lst_test_pred.append(_test_preds)
            '''
            """
            for j in range(test_size / batch_size + 1):
                X_i, y_i = utils.slice(test_data, j * batch_size, batch_size)
                #X_i = utils.libsvm_2_coo(X_i, (len(X_i), input_dim)).tocsr()
                _test_preds = model.run(model.y_prob, X_i)
                lst_test_pred.append(_test_preds)
            """
        train_preds = np.concatenate(lst_train_pred)
        test_preds = np.concatenate(lst_test_pred)
        train_score = roc_auc_score(train_label, train_preds)
        test_score = roc_auc_score(test_label, test_preds)
        print '%d\t%f\t%f\t%f\t%f\t%s' % (
            i, np.mean(ls), train_score, test_score,
            time.time() - start_time, strftime("%Y-%m-%d %H:%M:%S", gmtime()))
        print 'time_run', time_run
        print 'time_process', time_process
        # Save the model to local files
        '''
        path_model = 'model/' + str(name) + '_epoch_' + str(i)
        model.dump(path_model)
        d_label_score = {}
        d_label_score['label'] = test_label
        d_label_score['score'] = test_preds
        #path_label_score = 'model/label_score_' + str(name) + '_epoch_' + str(i)
        #pkl.dump(d_label_score, open(path_label_score, 'wb'))
        sys.stdout.flush()
        '''
        history_score.append(test_score)
        if i > min_round and i > early_stop_round:
            i_max = np.argmax(history_score)
            # Early stop
            if i - i_max >= early_stop_round:
                print 'early stop\nbest iteration:\n[%d]\teval-auc: %f' % (
                    np.argmax(history_score), np.max(history_score))
                sys.stdout.flush()
                break
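# Sketch: the mini-batch file-reading idiom repeated in the train() variants
# above (fill a list up to batch_size lines, flush it, carry the triggering
# line into the next batch, flush the remainder after EOF), packaged as a
# generator. This helper is not part of the original code; it just isolates
# the recurring pattern.
def iter_line_batches(path, batch_size):
    batch = []
    f = open(path, 'r')
    for line in f:
        batch.append(line)
        if len(batch) == batch_size:
            yield batch
            batch = []
    f.close()
    if batch:
        yield batch  # final partial batch

# Usage, mirroring the loops above:
#   for lst_lines in iter_line_batches(train_file, batch_size):
#       X_i, y_i = utils.slice(utils.process_lines(lst_lines, name), 0, -1)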