def main(domainxml, trainingsetcsv, manifold_value, restrictionstxt):
    restrictions = dataset.restrictions_from_text(restrictionstxt)
    cols, data = dataset.read(trainingsetcsv.read(), True, restrictions)
    expected, actual, expected_hunked, actual_hunked = sampling.cross_validate(
        data, list(cols), manifold_value)

    print("Overall confusion matrix:")
    print(sampling.confusion_matrix(expected, actual))
    print("\nOverall recall:")
    print(sampling.recall(expected, actual, "Obama"))
    print("\nOverall precision:")
    print(sampling.precision(expected, actual, "Obama"))
    print("\nOverall pf:")
    print(sampling.pf(expected, actual, "Obama"))
    print("\nOverall f-measure:")
    print(sampling.f_measure(expected, actual, "Obama"))
    print("\nOverall accuracy:")
    print(sampling.accuracy(expected, actual))
    print("\nAverage accuracy:")
    print(sum(sampling.accuracy(e, a) for e, a in zip(expected_hunked, actual_hunked))
          / len(expected_hunked))
    print("\nOverall error rate:")
    print(sampling.error_rate(expected, actual))
    print("\nAverage error rate:")
    print(sum(sampling.error_rate(e, a) for e, a in zip(expected_hunked, actual_hunked))
          / len(expected_hunked))
def test_get_dataset_information_404(self):
    app.config["TESTING"] = True
    self.app = app.test_client()
    with self.assertRaises(HTTPException) as http_error:
        # retrieve current API response to request
        self.assertEqual(dataset.read(disease_name="foobar"), 404)
def build_models():
    classifiers = {
        'random_forest': RandomForestClassifier(random_state=42, n_estimators=100),
        'naive_bayes': GaussianNB(),
        '1nn': KNeighborsClassifier(1),
        '3nn': KNeighborsClassifier(3),
        '5nn': KNeighborsClassifier(5),
        'decision_tree': DecisionTreeClassifier(random_state=42),
        'svm': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
                   decision_function_shape='ovr', degree=3, gamma='scale',
                   kernel='rbf', max_iter=-1, probability=False,
                   random_state=42, shrinking=True, tol=0.001, verbose=False)
    }

    general = d.remove_extras(d.general(d.read('./dataset.csv')))
    Xg = general[general.columns[:-1]]
    yg = general[general.columns[-1]]

    specific = d.remove_extras(d.specific(d.read('./dataset.csv')))
    Xs = specific[specific.columns[:-1]]
    ys = specific[specific.columns[-1]]

    for clf in classifiers:
        pipelined = make_pipeline(StandardScaler(), classifiers[clf])
        pipelined.fit(Xg, yg)
        get_path = lambda p: os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            '../models/%s/%s.joblib' % (p, clf))
        dump(pipelined, get_path('general'))
        pipelined.fit(Xs, ys)
        dump(pipelined, get_path('specific'))
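# Hypothetical usage sketch (not part of the snippet above): load one of the
# persisted pipelines with joblib and classify a new sample. The model name,
# path layout, and feature vector below are assumptions; the feature count
# must match the columns the pipeline was trained on.
from joblib import load

model = load('../models/general/random_forest.joblib')
sample = [[0.1, 0.2, 0.3]]  # placeholder feature vector
print(model.predict(sample))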
def test_get_dataset_information_all(self):
    app.config["TESTING"] = True
    self.app = app.test_client()
    # retrieve correct database response to request
    mock_response = test_read()
    # retrieve current API response to request
    api_response = dataset.read()
    # assert that the two output the same
    self.assertEqual(mock_response, api_response)
def test_get_dataset_information_specific(self):
    app.config["TESTING"] = True
    self.app = app.test_client()
    # retrieve correct database response to request
    mock_response = test_read(disease_name="breast invasive carcinoma")
    # retrieve current API response to request
    api_response = dataset.read(disease_name="breast invasive carcinoma")
    # assert that the two output the same
    self.assertEqual(mock_response, api_response)
def init():
    global seq2seq, train, test
    input_words, output_words = dataset.read()
    # Creating the network model
    seq2seq = model.AttentionSeq2Seq(input_words, output_words)
    if train:
        seq2seq.train()
        train = False
    if test:
        seq2seq.test()
def main(domainxml, trainingsetcsv, restrictionstxt):
    restrictions = dataset.restrictions_from_text(restrictionstxt)
    cols, data = dataset.read(trainingsetcsv.read(), restrictions)
    # call train function with:
    #   `col_sets` - list of sets per column, NOT including class label
    #   `data` (list of ([train data], class))
    tree = Node("swole",
                ("true", Label("protein and starches")),
                ("false", Label("sugar")))  # dummy temp tree
    tree = c45.run(data, list(enumerate(cols)), 0)
    tree_xml = stringify_tree(tree)
    sys.stdout.buffer.write(tree_xml)
def train():
    """Train the network on the pickled dataset files and periodically save checkpoints."""
    # Read the dataset directory, keeping only .pickle files
    filenames = [f for f in os.listdir(datasets_dir)
                 if os.path.splitext(f)[1] == '.pickle']

    logits = inference.inference(image_holder, reuse=False)
    global_step = tf.Variable(0, trainable=False)

    # Define the exponential moving average over trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE, global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())

    # Loss value
    loss = inference.loss(logits, label_holder)

    # Decay the learning rate before applying backpropagation
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE,
                                               global_step,
                                               MAX_STEPS,
                                               decay_rate=LEARNING_RATE_DECAY)
    # Define the backpropagation (training) step
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    # Update parameters with both the training step and the moving average
    train_op = tf.group(train_step, variable_averages_op)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.train.start_queue_runners()
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)
        for step in range(MAX_STEPS):
            for filename in filenames:
                train_image, train_label = dataset.read(filename)
                assert isinstance(train_image, list)
                assert isinstance(train_label, list)
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={
                                             image_holder: train_image,
                                             label_holder: train_label
                                         })
            if step % 2 == 0:
                print("after %d steps, the loss value is %g" % (step, loss_value))
                saver.save(sess, models_file, global_step=step)
def test_get_eprint_xml(t, eprint_url, auth_type, username, secret, collection_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    ok = dataset.init(collection_name)
    if ok == False:
        t.error(f"Can't initialize {collection_name}")
        return
    t.verbose_off()  # switch to t.verbose_on() for debugging
    test_name = t.test_name()
    cfg = eprinttools.cfg(eprint_url, auth_type, username, secret, collection_name)
    keys = eprinttools.get_keys(cfg)
    if len(keys) == 0:
        t.error(f"Can't test {test_name} without keys, got zero keys")
        return
    collection_keys = []
    check_keys = []
    for i in range(100):
        key = random.choice(keys)
        if key not in check_keys:
            check_keys.append(key)
        if len(check_keys) > 50:
            break
    t.print(f"Calculating the keys in sample that will get stored in the collection {collection_name}")
    for key in check_keys:
        # We are going to try to get the metadata for the EPrint record but not store it in a dataset collection...
        ok = eprinttools.get_eprint_xml(cfg, key)
        e_msg = eprinttools.error_message()
        if ok == False or e_msg != "":
            if e_msg.startswith("401") == False:
                t.error(f"Expected data for {key}, got {ok}, {e_msg}")
            else:
                t.print(f"found {key}, requires authentication")
        else:
            t.print(f"found {key} with data, checking dataset for record")
            data = dataset.read(collection_name, key)
            e_msg = dataset.error_message()
            if len(data) == 0:
                t.error(f"{key} in {collection_name} empty record, {e_msg}")
            if e_msg != "":
                t.error(f"{key} in {collection_name} error, {e_msg}")
def main(to_classify_csv, decision_tree_xml, restrictionstxt, has_label_column):
    # how are we supposed to determine if this has a label column or not?
    # I guess we could look at the number of unique edge labels in the decision tree
    # to determine features
    tree = model.build_tree(decision_tree_xml.read())
    restrictions = dataset.restrictions_from_text(restrictionstxt)
    cols, data = dataset.read(to_classify_csv.read(), has_label_column, restrictions)
    predicted_classes = [tree.classify(x[0], cols) for x in data]
    labels = [x[1] for x in data]
    if has_label_column:
        print('Records:', len(data))
        print('Correctly classified:',
              sum(1 for p, l in zip(predicted_classes, labels) if p == l))
        print('Incorrectly classified:',
              sum(1 for p, l in zip(predicted_classes, labels) if p != l))
        print('Accuracy:', sampling.accuracy(labels, predicted_classes))
        print('Error:', sampling.error_rate(labels, predicted_classes))
        print('Confusion matrix:')
        print(sampling.confusion_matrix(labels, predicted_classes))
    else:
        for i in range(len(predicted_classes)):
            print(data[i][0], predicted_classes[i])
def main(args):
    # Determine which algorithms to perform
    algorithms = []
    if args.bf:
        algorithms.append(wrp.AlgorithmWrapper(bf.CONTENT))
    if args.nn:
        algorithms.append(wrp.AlgorithmWrapper(nn.CONTENT))
    if args.ni:
        algorithms.append(wrp.AlgorithmWrapper(ni.CONTENT))
    if args.mst:
        algorithms.append(wrp.AlgorithmWrapper(mst.CONTENT))
    if args.ci:
        algorithms.append(wrp.AlgorithmWrapper(ci.CONTENT))

    # Initialize plots
    fig_correct, fig_complex, plot_correct, plot_complex = init_plots(algorithms)

    # Execute correct command
    if args.cmd == 'read':
        datasets = dataset.read(args.path)
        for ds in datasets:
            for algorithm in algorithms:
                y1, y2 = analyse_algorithm(ds.adj, ds.order, algorithm, args.repeat)
                plot_correct.scatter(ds.order, y2, color=algorithm.color, alpha=0.5, s=0.5)
                plot_complex.scatter(ds.order, y1, color=algorithm.color, alpha=0.5, s=0.5)
    elif args.cmd == 'random':
        if args.write:
            if not os.path.exists('datasets'):
                os.makedirs('datasets')
        order = args.order  # reset n
        while order <= args.max:
            for i in range(args.trials):
                path = None
                if args.write:
                    path = "datasets/order_{}_trial_{}.dat".format(order, i)
                adj = dataset.generate(order, args.spread, path)
                for algorithm in algorithms:
                    y1, y2 = analyse_algorithm(adj, order, algorithm, args.repeat)
                    algorithm.x.append(order)
                    algorithm.complex.append(y1)
                    algorithm.working_complex.append(y1)
                    algorithm.correct.append(y2)
                    algorithm.working_correct.append(y2)
            for algorithm in algorithms:
                algorithm.avg_correct.append(util.average(algorithm.working_correct))
                algorithm.avg_complex.append(util.average(algorithm.working_complex))
                algorithm.avg_x.append(order)
                algorithm.working_correct.clear()
                algorithm.working_complex.clear()
            order += 1

    if args.plot:
        for algorithm in algorithms:
            # Plot correctness measure
            plot_correct.scatter(algorithm.x, algorithm.correct,
                                 color=algorithm.color, alpha=0.5, s=0.5)
            plot_correct.plot(algorithm.avg_x, algorithm.avg_correct, '-',
                              color=algorithm.color, linewidth=0.5)
            fig_correct.savefig('Correctness', dpi=300, bbox_inches='tight')

            # Plot complexity measure
            plot_complex.scatter(algorithm.x, algorithm.complex,
                                 color=algorithm.color, alpha=0.5, s=0.5)
            plot_complex.plot(algorithm.avg_x, algorithm.avg_complex, '-',
                              color=algorithm.color, linewidth=0.5)
            fig_complex.savefig('Complexity', dpi=300, bbox_inches='tight')
def get_wos_refs(new=True):
    # new=True will download everything from scratch and delete any existing records
    collection = 'wos_refs.ds'

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()

    # Run query to get scope of records
    token = os.environ['WOSTOK']
    headers = {'X-ApiKey': token, 'Content-type': 'application/json'}
    base_url = 'https://api.clarivate.com/api/wos/?databaseId=WOK'

    collected = dataset.has_key(collection, "captured")
    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]['captured']
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = (current - date)
        base_url = base_url + '&loadTimeSpan=' + str(diff.days) + 'D'

    url = base_url + '&count=1&firstRecord=1&usrQuery=OG=California%20Institute%20of%20Technology'

    incomplete = dataset.has_key(collection, "incomplete")
    if incomplete == True:
        query = dataset.read(collection, "incomplete")
        query_id = query[0]['incomplete']
        query = dataset.read(collection, "record_start")
        record_start = query[0]['record_start']
        query = dataset.read(collection, "record_count")
        record_count = query[0]['record_count']
    else:
        response = requests.get(url, headers=headers)
        response = response.json()
        record_count = response['QueryResult']['RecordsFound']
        print(record_count)
        query_id = response['QueryResult']['QueryID']
        dataset.create(collection, 'incomplete', {"incomplete": query_id})
        record_start = 1
        dataset.create(collection, 'record_start', {"record_start": record_start})
        dataset.create(collection, 'record_count', {"record_count": record_count})

    query_url = 'https://api.clarivate.com/api/wos/query/'
    while record_count > 0:
        print(record_start)
        if record_count > 100:
            url = query_url + str(query_id) + '?count=100&firstRecord=' + \
                str(record_start)
            response = requests.get(url, headers=headers)
            response = response.json()
            print(response)
            save_records(collection, response['Records']['records']['REC'])
            record_start = record_start + 100
            record_count = record_count - 100
            dataset.update(collection, 'record_start', {"record_start": record_start})
            dataset.update(collection, 'record_count', {"record_count": record_count})
        else:
            url = query_url + str(query_id) + '?count=' + \
                str(record_count) + '&firstRecord=' + str(record_start)
            response = requests.get(url, headers=headers)
            response = response.json()
            save_records(collection, response['Records']['records']['REC'])
            record_count = 0

    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, 'captured', record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, 'captured', record)
        if err != "":
            print(f"Unexpected error on create: {err}")
    dataset.delete(collection, 'incomplete')
def create(dataset):
    return logged_in() and ds.read(dataset)
def read(dataset, view):
    return ds.read(dataset)
# exists and be populated.
#
import dataset
import os

c = "Journals.ds"
keys = dataset.keys(c)
print("package clsrules")
print("")
print("var (")

# Generate a ISSN to Publisher Map
print("issnPublisher = map[string]string{")
for key in keys:
    try:
        rec, err = dataset.read(c, key)
    except:
        rec = {"_Key": key, "publisher": ""}
        err = ""
    if err != "":
        print(f"// ERROR ({key}): {err}")
    print(f" \"{rec['_Key']}\":\"{rec['publisher']}\",")
print("}")
print("")

# Generate a ISSN to Publication Map
print("issnPublication = map[string]string{")
for key in keys:
    try:
        rec, err = dataset.read(c, key)
    except:
        rec = {"_Key": key, "publication": ""}
import tensorflow as tf
import numpy as np
import os
import time

import dataset
import squeezenet

dataset = dataset.Data_set()
dataset.open('./face_photos', 8)
dataset.shuffle()
# dataset.read(30)  # [None, 224, 224, 3]
# x = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
# y = tf.placeholder(tf.float32, shape=[None, 5])

x, y = dataset.read()
# print(x)
x = tf.reshape(x, shape=[-1, 224, 224, 3])
# x = tf.transpose(x, [0, 3, 1, 2])
print(x)


class netInit(object):
    num_classes = 5
    weight_decay = 0.1
    batch_norm_decay = 0.999


net = squeezenet.Squeezenet(netInit)
print('new net', net)
net = net.build(x, is_training=True)
print("build net", net)
# net = tf.reshape(net, [-1, 3490*5])
stop(err)
err = dataset.create("friends.ds", "mojo", {
    "name": "Mojo Sam, the Yudoo Man",
    "email": "*****@*****.**"
})
if err != '':
    stop(err)
err = dataset.create("friends.ds", "jack", {
    "name": "Jack Flanders",
    "email": "*****@*****.**"
})
if err != '':
    stop(err)

## read
(frieda_profile, err) = dataset.read("friends.ds", "frieda")
if err != '':
    stop(err)
(mojo_profile, err) = dataset.read("friends.ds", "mojo")
if err != '':
    stop(err)
(jack_profile, err) = dataset.read("friends.ds", "jack")
if err != '':
    stop(err)

## update
frieda_profile["catch_phrase"] = "Wowee Zowee"
mojo_profile["catch_phrase"] = "Feet Don't Fail Me Now!"
jack_profile["catch_phrase"] = "What is coming at you is coming from you"
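# A possible continuation (a sketch, not from the original snippet): write the
# modified profiles back with dataset.update(), which takes the same
# collection/key/record arguments and returns an error string like
# dataset.create() above.
for key, profile in [("frieda", frieda_profile),
                     ("mojo", mojo_profile),
                     ("jack", jack_profile)]:
    err = dataset.update("friends.ds", key, profile)
    if err != '':
        stop(err)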
def read_dataset():
    return dataset.read()
import tensorflow as tf
import numpy as np
import model
import dataset
import time
import config as myconfig

data = dataset.read()

batch_size = 32
learning_rate = 0.0001
beta1 = 0.5
z_size = 5
save_interval = 10

### input variables
z = tf.placeholder(tf.float32, [batch_size, z_size])
a = tf.placeholder(tf.float32, [batch_size, 32, 32, 32, 1])
rgba = tf.placeholder(tf.float32, [batch_size, 32, 32, 32, 4])
train = tf.placeholder(tf.bool)

### build models
G = model.Generator(z_size)
D = model.Discriminator()

rgba_ = G(a, z, train)
y_ = D(rgba_, train)
y = D(rgba, train)
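### hypothetical continuation (a sketch only, not taken from this project):
### with y as the discriminator logits on real voxel data and y_ on generated
### data, a standard GAN objective and per-network optimizers might look like
### this; the 'generator'/'discriminator' scope names are assumptions and would
### need to match however model.py names its variable scopes.
d_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(y), logits=y) +
    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(y_), logits=y_))
g_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(y_), logits=y_))

g_vars = [v for v in tf.trainable_variables() if v.name.startswith('generator')]
d_vars = [v for v in tf.trainable_variables() if v.name.startswith('discriminator')]
g_opt = tf.train.AdamOptimizer(learning_rate, beta1=beta1).minimize(g_loss, var_list=g_vars)
d_opt = tf.train.AdamOptimizer(learning_rate, beta1=beta1).minimize(d_loss, var_list=d_vars)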