def update_db():
    today = pd.datetime.now(pytz.timezone('US/Eastern'))
    last = None
    with open('bpats_db/LASTUD', 'r') as f:
        last = pd.datetime(*map(int, f.read().split('-'))) + timedelta(1)
    if today.hour < 9 and today.hour > 4:
        today = today - timedelta(1)
    else:
        today = today - timedelta(2)
    today = pd.datetime(today.year, today.month, today.day)
    if today - last < timedelta(1):
        return -1
    for scode in scodes:
        data = None
        print(scode, end=' ')
        try:
            data = tools.get_data(scode, start=last, end=today)
        except iexfinance.utils.exceptions.IEXSymbolError:
            continue
        except:
            print('update {} failed, {} -> {}'.format(
                scode,
                '{}-{}-{}'.format(last.year, last.month, last.day),
                '{}-{}-{}'.format(today.year, today.month, today.day)))
            continue
        o = data.Adj_Open
        c = data.Adj_Close
        h = data.Adj_High
        l = data.Adj_Low
        v = data.Adj_Volume
        zm = BPAT(o, c, h, l, v)
        with open('bpats_db/' + scode, 'a') as f:
            for b in zm:
                f.write(b + ' ')
    with open('bpats_db/LASTUD', 'w') as f:
        f.write('{}-{}-{}'.format(today.year, today.month, today.day))
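# Assumption drawn from update_db() above: bpats_db/LASTUD holds a single
# unpadded "year-month-day" string, exactly what the final f.write() emits.
# A minimal, hypothetical sketch for seeding that marker file the first time
# (the chosen date is only an example):
import os

if not os.path.isfile('bpats_db/LASTUD'):
    with open('bpats_db/LASTUD', 'w') as f:
        f.write('2019-4-18')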
def main(args):
    scores, roc_aucs = [], []
    for i in range(5):
        seed = i
        set_seed(seed)
        (_, _), X, (train_idx, train_y), (val_idx, val_y), (test_idx, test_y), names = tools.get_data(
            args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')
        clf = GaussianNB()
        clf.fit(X[train_idx], train_y)
        # use class probabilities (not hard labels) so the ROC-AUC and the 0.5
        # threshold below are meaningful
        probs = clf.predict_proba(X[test_idx])[:, 1]
        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)
        preds = (probs > 0.5) * 1
        score = acc(preds, test_y)
        print('Score:', score)
        scores.append(score)
    print('Acc(all):', scores)
    print('Auc(all):', roc_aucs)
    print('Accuracy:', np.mean(scores))
    print('Auc:', np.mean(roc_aucs))
    return np.mean(roc_aucs), np.std(roc_aucs)
def main(args):
    roc_aucs = []
    for i in range(args.n_runs):
        seed = i
        set_seed(seed)
        (_, _), X, (train_idx, train_y), (val_idx, val_y), (test_idx, test_y), names = tools.get_data(
            args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')
        clf = sklearn.svm.SVC(class_weight='balanced', random_state=seed, probability=True)
        clf.fit(X[train_idx], train_y)
        probs = clf.predict_proba(X[test_idx])[:, 1]
        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)
        p = np.stack([names[test_idx], probs], axis=1)
        save_preds(p, args, seed)
    print('Auc(all):', roc_aucs)
    print('Auc:', np.mean(roc_aucs))
    return np.mean(roc_aucs), np.std(roc_aucs)
def train(param=PARAMS, sv=SOLVE, small=False):
    num_hidden = 4
    num_lstm_layer = 1
    batch_size = 1

    def sym_gen(seq_len):
        return lstm_unroll(num_lstm_layer, seq_len, num_hidden=num_hidden, num_label=1)

    init_c = [('l%d_init_c' % l, (batch_size, num_hidden, 256, 256)) for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h' % l, (batch_size, num_hidden, 256, 256)) for l in range(num_lstm_layer)]
    init_states = init_c + init_h
    data_train, data_val = get_data('r', batch_size, init_states=init_states, small=small)
    # data = get(init_states, bs=batch_size, small=small)
    # data_train = data['train']
    # data_val = data['val']
    param['eval_data'] = data_val
    num_time = data_train.data_list[0].shape[1]
    symbol = sym_gen(num_time)
    s = Solver(symbol, data_train, sv, **param)
    print 'Start Training...'
    s.train()
def train_model(ui):
    # get filename from customer
    f = ui.get_file_name()
    if f is None:
        return None

    # validate data in file
    raw_data = tools.get_data(f)
    if not tools.raw_data_is_valid(raw_data):
        ui.print_error("Raw data file {0} not formatted correctly".format(f))
        return

    # derive new features, handle missing data, and clean
    clean_data = preprocessing.format_data(raw_data)

    # store new data into database?
    database.store_clean_data(clean_data)

    # run model
    model = mlalgorithms.logistic_regression(clean_data)

    # store model in database
    database.store_model(model)

    # display performance?
    ui.display_performance(model)
def main(args):
    roc_aucs = []
    for i in range(args.n_runs):
        seed = i
        set_seed(seed)
        _, X, (train_idx, train_y), (val_idx, val_y), (test_idx, test_y), names = tools.get_data(
            args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')
        train_x = X[train_idx].cuda()
        val_x = X[val_idx].cuda()
        test_x = X[test_idx].cuda()
        print('train_x', train_x.mean())
        print('test_x', test_x.mean())
        probs = mlp_fit_predict(train_x, train_y, test_x, val=(val_x, val_y))
        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)
        p = np.concatenate(
            [names[test_idx].reshape(-1, 1), probs.reshape(-1, 1)], axis=1)
        save_preds(p, args, seed)
    print('Auc(all):', roc_aucs)
    print('Auc:', np.mean(roc_aucs))
    return np.mean(roc_aucs), np.std(roc_aucs)
def request_asset_list():
    r = get_data(
        "asset?columns=ASSET_DATABASE_ID&columns=TYPE&columns=LABEL&columns=LAST_CLOSE_VALUE_IN_CURR",
        start_date='2013-06-14', end_date='2019-04-18')
    dic_asset = json.loads(r)
    return dic_asset
def tsub(val):
    s = re.match(r'!([a-zA-Z][a-zA-Z0-9_]*)\s*([a-zA-Z][a-zA-Z0-9_]*)?(.*)', t_dbg.text)
    if s:
        cmd = s.groups()[0]
        arg = s.groups()[1]
        oth = s.groups()[2]
        if cmd == 'target':
            # G_var['DATAF'] = quandl.get('EOD/'+arg)[['Adj_Open','Adj_High','Adj_Low','Adj_Close','Adj_Volume']].iloc[-10:]
            G_var['DATAF'] = tools.get_data(arg).iloc[-10:]
            G_var['name'] = arg
            if oth:
                G_var['pat_N'] = int(oth)
                pat_proc(int(oth))
            else:
                pat_proc()
        elif cmd == 'go':
            go()
    else:
        try:
            print(eval(t_dbg.text))
        except:
            exec(t_dbg.text)
    t_dbg.set_val('')
def main(args):
    scores, roc_aucs = [], []
    for i in range(5):
        seed = i
        set_seed(seed)
        (_, _), X, (train_idx, train_y), (val_idx, val_y), (test_idx, test_y), names = tools.get_data(
            args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')
        clf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced', random_state=seed,
                                                      n_estimators=500)
        clf.fit(X[train_idx], train_y)
        # use class probabilities (not hard labels) so the ROC-AUC and the 0.5
        # threshold below are meaningful
        probs = clf.predict_proba(X[test_idx])[:, 1]
        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)
        preds = (probs > 0.5) * 1
        score = acc(preds, test_y)
        print('Score:', score)
        scores.append(score)
    print('Acc(all):', scores)
    print('Auc(all):', roc_aucs)
    print('Accuracy:', np.mean(scores))
    print('Auc:', np.mean(roc_aucs))
    return np.mean(roc_aucs), np.std(roc_aucs)
def run_classifiers(rabbit_list, rabbit_dict):
    print('===> Training and evaluating predictive models ... ')
    for i, r in enumerate(rabbit_list):
        print(f'=> Subject {r}: Generating training data ... ', end='')

        #### Generate the data for training and testing the classifier models ####
        train_feats, train_labels, test_feats, test_labels = tls.get_data(['ctd', 'max', 't2', 'adc'])

        temp_label = train_labels.copy()
        temp_feats = train_feats.copy()

        rabbit_dict[r]['test_labels'] = test_labels[i].clone()
        test_feats = test_feats[i].clone()

        _ = temp_label.pop(i)
        _ = temp_feats.pop(i)

        rabbit_dict[r]['train_labels'] = torch.cat(temp_label, 0)
        train_feats = torch.cat(temp_feats, 0)

        # Get the mean and standard deviation of the training features
        train_mean_feats = train_feats.mean(0, keepdim=True)
        train_std_feats = train_feats.std(0, keepdim=True)

        # Normalize the test and train features by the training statistics
        # train_feats = train_feats / train_mean_feats
        # test_feats = test_feats / train_mean_feats
        train_feats = (train_feats - train_mean_feats) / train_std_feats
        test_feats = (test_feats - train_mean_feats) / train_std_feats

        # Store the data in the dictionary for later use
        rabbit_dict[r]['train_features'] = train_feats.reshape(train_feats.shape[0], -1).squeeze()
        rabbit_dict[r]['test_features'] = test_feats.reshape(test_feats.shape[0], -1).squeeze()
        print('done')

        print(f'=> Subject {r}: Training and evaluating logistic regression classifier ... ', end='')
        # Train and eval the logistic regression classifier
        rabbit_dict[r]['logistic_model'] = {}
        temp_train_proba, temp_test_proba = tls.logistic_regression(rabbit_dict[r]['train_features'],
                                                                    rabbit_dict[r]['train_labels'],
                                                                    rabbit_dict[r]['test_features'])
        rabbit_dict[r]['logistic_model']['train_proba'] = temp_train_proba
        rabbit_dict[r]['logistic_model']['test_proba'] = temp_test_proba
        rabbit_dict[r]['logistic_model']['test_proba_vol'] = recon_prediction(temp_test_proba, rabbit=r)
        print('done')

        print(f'=> Subject {r}: Training and evaluating random forest classifier ... ', end='')
        # Train and eval the random forest classifier
        rabbit_dict[r]['forest_model'] = {}
        temp_train_proba, temp_test_proba = tls.random_forest(rabbit_dict[r]['train_features'],
                                                              rabbit_dict[r]['train_labels'],
                                                              rabbit_dict[r]['test_features'])
        rabbit_dict[r]['forest_model']['train_proba'] = temp_train_proba
        rabbit_dict[r]['forest_model']['test_proba'] = temp_test_proba
        rabbit_dict[r]['forest_model']['test_proba_vol'] = recon_prediction(temp_test_proba, rabbit=r)
        print('done')

    print('===> Done training and evaluating predictive models.')
def main():
    # (2560, 256, 123)
    X, Y = get_data()
    # model = DNN(X, Y)
    # model = DCNN(X, Y)
    # model = ANN(X, Y)
    model = CNN(X, Y)
    model.cnn()
    print('Done')
def main(args):
    seed = np.random.randint(0, 1000000)
    _, X, (idx1, y1), (idx2, y2), (idx3, y3), names = tools.get_data(args.__dict__, seed=seed)
    idx = np.concatenate([idx1, idx2, idx3], 0)
    y = np.concatenate([y1, y2, y3], 0)
    X = X[idx]
    feats, cors = cor_selector(X, y, 50)
    print(cors)
def get_company_info(company_name, index):
    dr = get_chrome(index)
    company_data = tools.get_data(
        "select company_name from crm_clue_qichacha where company_name = '%s'" % company_name)
    if company_data is not None and len(company_data) > 0:
        tools.update_sql(
            "update 1688_clue_new set 1688_consume =1 where company_name = '%s'" % company_name)
        print '%s already exists in db' % company_name
    else:
        infos = getShopInfo(dr, tools.str_process(company_name))
        insert_into_db(infos, company_name)
def calculate_entropy(name):
    """Compute the Shannon entropy of each historical row returned for `name`.

    Each row's last element is the normalizing total N; the remaining elements
    are divided by N before being passed to shanon_entropy. Returns the list of
    per-row entropies and math.log(N) for the final row.
    """
    historic_probability = get_data(name)
    historic_entropy = []
    for probability in historic_probability:
        s = 0
        N = int(probability[len(probability) - 1])
        for i in range(len(probability) - 1):
            s += shanon_entropy(float(probability[i]) / N)
        historic_entropy.append(s)
    return historic_entropy, math.log(N)
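# The shanon_entropy helper called by calculate_entropy() above is defined
# elsewhere in the project. The sketch below is only a hypothetical stand-in
# showing the usual per-symbol term -p * log(p) (0 when p == 0), which is what
# the call site appears to assume; the real implementation may differ.
import math

def shanon_entropy(p):
    # hypothetical stand-in, not the project's actual helper
    return -p * math.log(p) if p > 0 else 0.0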
def load_to_f():
    i = 0
    for scode in scodes:
        if os.path.isfile('bpats_db/' + scode):
            i += 1
            continue
        data = tools.get_data(scode)
        o = data.Adj_Open
        c = data.Adj_Close
        h = data.Adj_High
        l = data.Adj_Low
        v = data.Adj_Volume
        save(BPAT(o, c, h, l, v), scode)
        i += 1
        print('{}/{}'.format(i, len(scodes)), end=' ')
def cf_train(sv=SOLVE, param=PARAMS):
    train, val = get_data('c', 2, small=False)
    sv['name'] = 'CF'
    sv['is_rnn'] = False
    param['eval_data'] = val
    param['num_epoch'] = 20
    param['learning_rate'] = 0.1
    print 'SOLVE', sv
    print 'param', param
    s = Solver(net, train, sv, **param)
    s.train()
    # s.predict()
    return s
def publish(request, partner_id, click_id, **kwargs):
    check = kwargs.get('check', False)
    logger = S2SFactory.get_logger()
    s2s_data = get_data(partner_id=partner_id, logger=logger, settings=settings)
    if not s2s_data:
        logger.info('PARTNER [%s] INFO NOT FOUND' % partner_id)
        return HttpResponse('KO')
    if check:
        task_id = PublishTask.delay(partner_id, s2s_data, click_id, MockSender, DictMessage, settings)
        logger.info('CHECK QUEUED [%s]' % task_id)
        task_id.wait(interval=0.1)
        logger.info('CHECK RESULT [%s]: %s' % (task_id, task_id.result))
        return HttpResponse(task_id.result)
    else:
        task_id = PublishTask.delay(partner_id, s2s_data, click_id, SMTPSender, MIMEBase64, settings)
        logger.info('NOTIFICATION QUEUED %s' % (task_id))
        return HttpResponse(task_id, status=202)
def get_data():
    data = tools.get_data()
    data_old = data[1]
    data_new = data[0]
    return jsonify({
        "confirm": data_new[1],
        "confirm_add": data_new[2],
        "suspect": data_new[3],
        "suspect_add": data_new[4],
        "heal": data_new[5],
        "heal_add": data_new[6],
        "dead": data_new[7],
        "dead_add": data_new[8],
        "now_confirm": data_new[9],
        "now_confirm_add": data_new[9] - data_old[9],
        "now_severe": data_new[10],
        "now_severe_add": data_new[10] - data_old[10]
    })
def train(batch_size, param=PARAMS, sv=SOLVE, small=False):
    num_lstm_layer = 1
    num_hidden = 1000

    # prepare data
    init_c = [('l%d_init_c' % l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h' % l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
    init_states = init_c + init_h
    data_train, data_val = get_data('r', batch_size, init_states=init_states, small=small, splite_rate=0.2)
    param['eval_data'] = data_val

    # prepare symbol
    num_time = data_train.data_list[0].shape[1]
    symbol = lstm_unroll(num_lstm_layer, num_time, num_hidden)

    s = Solver(symbol, data_train, sv, **param)
    print 'Start Training...'
    s.train()
def from_1688_company():
    while True:
        index = 0
        chrome_len = len(list_dr)
        company_names = tools.get_data(
            "select company_name,1688_result,id from test.`1688_clue_new` where id not in (select p_id from crm_process_company where status = 0 and source = '%s') limit 1000" % source)
        for company_name in company_names:
            try:
                get_company_info(company_name[0], index % chrome_len)
                tools.update_sql(
                    "insert into crm_process_company(p_id,status,source,updated_at) values(%d,1,'%s','%s') ON DUPLICATE KEY UPDATE status = 1,updated_at='%s'" % (
                        int(company_name[2]), source,
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                time.sleep(random.uniform(1, 10))
            except:
                print 'traceback.format_exc():\n%s' % traceback.format_exc()
            index += 1
        time.sleep(3600)
def main():
    # conn = sqlite3.connect('../ping-analytics.db')
    output = get_data()
    ping = []
    ping = get_ms(output, ping)
    avg_ping = round(cal_average(ping), 1)
    est, mdt, pst = get_time_by_timezone()
    # db_main(est, mdt, pst, avg_ping, conn)
    insert_data(est, mdt, pst, avg_ping, ip)

    # logging
    print(f'Time: {est}')
    print(f'Data: \n {output}')
    print(f'\n\nPing: {avg_ping}')

    threading.Timer(wait_seconds, main).start()
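# get_ms(), cal_average() and the other helpers used in main() above live
# elsewhere in this script. As a hedged sketch only, cal_average() is assumed
# to be a plain arithmetic mean over the collected ping samples:
def cal_average(values):
    # hypothetical stand-in; the real helper may differ
    return sum(values) / len(values) if values else 0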
def main():
    orig = pd.read_csv(FILE_PATH, delimiter=DELIMITER)
    print(orig.shape)
    data_fixed = orig.copy()
    ratio_unknown = 0.5
    ages = range(0, 130, 10)
    fix_field(data_fixed, 'age',
              ['[{}-{})'.format(age, age + 10) for age in ages],
              [i for i in range(len(ages))])
    remove_unknown_with_ratio(data_fixed, ratio_unknown)
    fix_all_non_numeric(data_fixed)
    replace_unknown_with_average(data_fixed)
    class_key = 'gender'
    data_fixed = same_distribution(data_fixed, class_key)
    data_fixed = data_fixed.sample(n=2000)
    data, cluster_gt = get_data(data_fixed, to_keep, class_key)

    internal_clustering_loss = [metrics.silhouette_score]
    external_clustering_loss = [metrics.mutual_info_score]
    reduction_algorithms = [manifold.TSNE]
    pre_clustering_reduction_algorithm = decomposition.PCA(15)
    clusters_algos = CLUSTERING_ALGOS
    internal_clustering_loss = []
    reduction_algorithms = []

    if pre_clustering_reduction_algorithm:
        data = pre_clustering_reduction_algorithm.fit_transform(data)

    clusters_params_zip = [(algo, CLUSTERING_TO_STEPS_MAP_SMALL[algo]) for algo in clusters_algos]
    file_name, all_clustering_algorithms = run_full_flow(
        data, cluster_gt, clusters_params_zip,
        internal_clustering_loss, external_clustering_loss, reduction_algorithms
    )
    print(file_name)
    run_plot_by_loss(file_name)
def cross_validation_demo():
    sub_sample = True
    y_train, x_train, ids_train, y_test_X, x_test_X, ids_test_X = get_data(
        sub_sample, large=True)
    seed = 1
    k_fold = 4
    lambdas = np.logspace(-4, 1, 30)
    k_indices = build_k_indices(y_train, k_fold, seed)
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    W = []
    I = []
    for l in lambdas:
        M_rmse_tr = []
        M_rmse_te = []
        weight = []
        index1 = []
        for k in range(k_fold):
            loss_tr, loss_te, w, indices = cross_validation(
                y_train, x_train, k_indices, k, l)
            weight.append(w)
            index1.extend(indices)
            M_rmse_tr.append(loss_tr)
            M_rmse_te.append(loss_te)
        W.append(weight[np.argmin(M_rmse_te)])
        I.extend(index1)
        rmse_tr.append(np.mean(M_rmse_tr))
        rmse_te.append(np.mean(M_rmse_te))
    plt.hist(I, bins=np.arange(min(I), max(I) + 1))
    plt.title("Frequency diagram")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.savefig("freq")  # save before show(), otherwise an empty figure is written
    plt.show()
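# build_k_indices() used by cross_validation_demo() above is assumed to split
# the training rows into k_fold shuffled, equally sized index groups. The
# sketch below is a common implementation of that helper, not necessarily the
# one used in this project.
import numpy as np

def build_k_indices(y, k_fold, seed):
    # shuffle all row indices with the given seed and cut them into k_fold groups
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval:(k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)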
logging.info("Epoch {} took {}".format(e, end - start_epoch)) logging.info("Epoch:{} ValidAccPi loss:{}".format( e, acc_ploss / it)) logging.info("Epoch:{} ValidAccValue loss: {}".format( e, acc_vloss / it)) logging.info("Epoch:{} ValidTotal loss:{}".format(e, acc_loss / it)) logging.info("Epoch:{} checkpoint".format(new_param_file)) torch.save({ 'state_dict': model.state_dict(), }, new_param_file) # saving new parameters logging.info("Training took {}".format(end - start_train)) logging.info("Saving model to {}".format(new_param_file)) torch.save({ 'state_dict': model.state_dict(), }, new_param_file) logging.info("####################END#################") return True if __name__ == "__main__": param_file = tools.get_params() new_param_file = tools.get_new_params() data_files = tools.get_data() if train(param_file, new_param_file, data_files): print("New model parameters were saved to {}".format(new_param_file)) else: print("Training failed, check for errors {}/train_{}.log".format( LOG_PATH, new_param_file))
def main():
    try:
        size = sys.argv[1]
    except Exception as e:
        print e
        print '\n\tusage: python %s <small|all>\n' % sys.argv[0]
        exit()

    # hyperparameter
    topN_tfidf_words = 20

    train_notes, train_outcomes = get_data('train', size)
    test_notes, test_outcomes = get_data('test', size)

    # max number of documents (this is used during vectorization)
    num_docs = max(map(len, train_notes.values()))

    # extract feature lists
    train_text_features, df = extract_features_from_notes(train_notes, topN_tfidf_words,
                                                          'embeddings', df=None)
    test_text_features, df_ = extract_features_from_notes(test_notes, topN_tfidf_words,
                                                          'embeddings', df=df)
    assert df == df_

    # Fit model for each prediction task
    tasks = ['ethnicity', 'age', 'admission_type', 'hosp_expire_flag',
             'gender', 'los', 'diagnosis']
    # tasks = ['diagnosis']
    for task in tasks:
        print 'task:', task

        ### Train model

        # extract appropriate data
        train_Y, criteria = filter_task(train_outcomes, task, per_task_criteria=None)
        train_ids = sorted(train_Y.keys())
        print 'train examples:', len(train_Y)

        # vectorize notes
        train_X = vectorize_X(train_ids, train_text_features, num_docs=num_docs)
        print 'num_features: ', train_X.shape[1], '\n'
        train_Y = vectorize_Y(train_ids, train_Y, criteria)
        num_tags = train_Y.shape[1]

        # build model
        lstm_model = create_lstm_model(num_docs, num_tags, train_X, train_Y)
        lstm_model.summary()

        # test data
        test_labels, _ = filter_task(test_outcomes, task, per_task_criteria=criteria)
        test_ids = sorted(test_labels.keys())
        test_X = vectorize_X(test_ids, test_text_features, num_docs=num_docs)
        test_Y = vectorize_Y(test_ids, test_labels, criteria)

        # fit model
        filepath = "/tmp/weights-%d.best.hdf5" % random.randint(0, 10000)
        save_best = SaveBestCallback(filepath)
        lstm_model.fit(train_X, train_Y, epochs=100, verbose=1, batch_size=32,
                       validation_data=(test_X, test_Y), callbacks=[save_best])
        lstm_model.load_weights(filepath)
        os.remove(filepath)

        model = (criteria, num_docs, lstm_model)

        ### Evaluation
        with io.StringIO() as out_f:
            # analysis
            pass

            # eval on test data
            results_onehot_keras(model, train_ids, train_X, train_Y, 'TRAIN', task, out_f)
            results_onehot_keras(model, test_ids, test_X, test_Y, 'TEST', task, out_f)

            output = out_f.getvalue()
        print output

        # error analysis
        error_analysis(model, test_ids, test_notes, test_text_features, test_X, test_Y, 'TEST', task)

        # serialize trained model
        homedir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        modelname = '%s/models/rnn_%s_%s.model' % (homedir, size, task)
        M = {'criteria': criteria,
             'num_docs': num_docs,
             'model': lstm_pickle(lstm_model),
             'output': output}
        with open(modelname, 'wb') as f:
            pickle.dump(M, f)
import os
import itertools

from keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score

import tools

training_data, training_label, validation_data, validation_label, validation_cate_label = tools.get_data()

kernel_size = [5]
num_layers = [10]
batch_size = [30]
learning_rate = [0.0001]
overflow_model = 0


def run(bs, path, lr, ks, num_layer):
    fold = 1
    for X_train, Y_train, X_val, Y_val, val_cat in zip(training_data, training_label,
                                                       validation_data, validation_label,
                                                       validation_cate_label):
        print("Fold " + str(fold))
        model = tools.create_model(lr, bs, ks, num_layer)
        inner_path = path + "/fold_" + str(fold)
        if not os.path.exists(inner_path):
            os.makedirs(inner_path)
        early_stop = EarlyStopping(patience=20)
        history = model.fit(x=X_train,
import sys
import time  # required for time.time() below

import pyspark.sql.functions as f
import unidecode
from nltk.tokenize import wordpunct_tokenize

import tools

start_time = time.time()
slug = sys.argv[1]
data_folder = '../data/'

spark = tools.get_spark()
sparkContext = spark.sparkContext

data = tools.get_data(spark, slug)
ops = tools.get_opinions(spark, data)

# Transform 'word5' into 'word 5' (separate digits from letters)
ops = ops.withColumn(
    "text",
    f.regexp_replace("text", "(\d+|\D+)", " $1")
)

# Transforms word 'the_' into 'the'
ops = ops.withColumn(
    "text",
    f.regexp_replace("text", "(\w+[^\\w{_}]|\w+)", " $1")
)
dataset_id = str(sys.argv[7])
sigma = float(sys.argv[8])
plt_freqs = sys.argv[9]
dataset_id2 = str(sys.argv[10])

# A-Team positions
CasA = [350.866417, 58.811778]
CygA = [299.868153, 40.733916]
VirA = [187.705930, 12.391123]

# The absolute minimum allowed separation from the A-Team source, set to zero
# to enable the code to work independently
min_sep = 0.

###################### MAIN SCRIPT ######################

# Extracting data from the TraP database into a text file
if not os.path.exists('ds_' + dataset_id + '_images.csv'):
    tools.get_data(database, username, password, host, port, databaseType, dataset_id, dataset_id2)

# Extract relevant data from dataset text file
image_info, frequencies, plt_ratios = tools.extract_data(dataset_id, CasA, CygA, VirA)
freq = 'all'

# RMS noise properties
noise_avg_log, noise_scatter_log, noise_threshold_log = tools.fit_hist(
    [np.log10(image_info[n][4] * 1e3) for n in range(len(image_info))],
    sigma, r'Observed RMS (mJy)', 'ds' + dataset_id + '_rms', freq)
noise_avg = 10.**(noise_avg_log)
noise_max = 10.**(noise_avg_log + noise_scatter_log) - 10.**(noise_avg_log)
noise_min = 10.**(noise_avg_log) - 10.**(noise_avg_log - noise_scatter_log)
print 'Average RMS Noise in images (1 sigma range, frequency=' + str(freq) + ' MHz): ' + str(round(noise_avg, 1)) + ' (+' + str(round(noise_max, 1)) + ',-' + str(round(noise_min, 1)) + ') mJy'

if plt_ratios:
    # RMS/Theoretical limit for TraP
    ratio_avg_log, ratio_scatter_log, ratio_threshold_log = tools.fit_hist(
        [np.log10(image_info[n][6]) for n in range(len(image_info))],
        sigma, r'Observed RMS / Theoretical Noise', 'ds' + dataset_id + '_ratio', freq)
    ratio_avg = 10.**(ratio_avg_log)
def validate_variable_content_multilevel(context, path, value):
    assert_somewhere_in(json.loads(value), get_data(context.response, path))
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    # 'VisitorType',
    # 'Weekend',
    # 'Revenue'
}

data_fixed = same_distribution(data_fixed, class_key)
number_classes = data_fixed[class_key].nunique()
sampled_data = data_fixed.sample(n=3000)
data, cluster_gt = get_data(data_fixed, to_keep, class_key)

CLUSTERING_TO_STEPS_MAP = {
    cluster.KMeans: range(2, 10),
    cluster.DBSCAN: [float(i) / 20 for i in range(1, 20)],
    mixture.GaussianMixture: range(2, 10),
    cluster.SpectralClustering: range(2, 10),
    cluster.AgglomerativeClustering: range(2, 10)
}

CLUSTERING_ALGOS = [
    cluster.KMeans,
    cluster.DBSCAN,
    mixture.GaussianMixture,
    cluster.SpectralClustering,
    cluster.AgglomerativeClustering
]
def define_variable_from_result(context, variable, path):
    context.s[variable] = get_data(context.response, path)
def validate_variable_content(context, path, value):
    assert_in(json.loads(value), get_data(context.response, path))