def main(domainxml, trainingsetcsv, manifold_value, restrictionstxt):
    restrictions = dataset.restrictions_from_text(restrictionstxt)

    cols, data = dataset.read(trainingsetcsv.read(), True, restrictions)
    expected, actual, expected_hunked, actual_hunked = sampling.cross_validate(data, list(cols), manifold_value)
    print("Overall confusion matrix:")
    print(sampling.confusion_matrix(expected, actual))

    print("\nOverall recall:")
    print(sampling.recall(expected, actual, "Obama"))

    print("\nOverall precision:")
    print(sampling.precision(expected, actual, "Obama"))

    print("\nOverall pf:")
    print(sampling.pf(expected, actual, "Obama"))

    print("\nOverall f-measure:")
    print(sampling.f_measure(expected, actual, "Obama"))

    print("\nOverall accuracy:")
    print(sampling.accuracy(expected, actual))

    print("\nAverage accuracy:")
    print(sum(sampling.accuracy(e, a) for e, a in zip(expected_hunked, actual_hunked)) / len(expected_hunked))

    print("\nOverall error rate:")
    print(sampling.error_rate(expected, actual))

    print("\nAverage error rate:")
    print(sum(sampling.error_rate(e, a) for e, a in zip(expected_hunked, actual_hunked)) / len(expected_hunked))
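For context, here is a minimal sketch of what the sampling.accuracy, sampling.error_rate, and sampling.confusion_matrix helpers used above might compute; it assumes expected and actual are equal-length lists of class labels and is not the module's actual source.

from collections import Counter

def accuracy(expected, actual):
    # Fraction of predictions that match the expected labels (assumed behaviour).
    return sum(e == a for e, a in zip(expected, actual)) / len(expected)

def error_rate(expected, actual):
    # Complement of accuracy.
    return 1.0 - accuracy(expected, actual)

def confusion_matrix(expected, actual):
    # Counts of (expected, predicted) label pairs.
    return Counter(zip(expected, actual))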
    def test_get_dataset_information_404(self):
        app.config["TESTING"] = True
        self.app = app.test_client()

        with self.assertRaises(HTTPException) as http_error:
            # retrieve current API response to request
            self.assertEqual(dataset.read(disease_name="foobar"), 404)
Example #3
def build_models():
    classifiers = {
        'random_forest':
        RandomForestClassifier(random_state=42, n_estimators=100),
        'naive_bayes':
        GaussianNB(),
        '1nn':
        KNeighborsClassifier(1),
        '3nn':
        KNeighborsClassifier(3),
        '5nn':
        KNeighborsClassifier(5),
        'decision_tree':
        DecisionTreeClassifier(random_state=42),
        'svm':
        SVC(C=10,
            cache_size=200,
            class_weight=None,
            coef0=0.0,
            decision_function_shape='ovr',
            degree=3,
            gamma='scale',
            kernel='rbf',
            max_iter=-1,
            probability=False,
            random_state=42,
            shrinking=True,
            tol=0.001,
            verbose=False)
    }
    general = d.remove_extras(d.general(d.read('./dataset.csv')))
    Xg = general[general.columns[:-1]]
    yg = general[general.columns[-1]]

    specific = d.remove_extras(d.specific(d.read('./dataset.csv')))
    Xs = specific[specific.columns[:-1]]
    ys = specific[specific.columns[-1]]

    for clf in classifiers:
        pipelined = make_pipeline(StandardScaler(), classifiers[clf])
        pipelined.fit(Xg, yg)
        get_path = lambda p: os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            '../models/%s/%s.joblib' % (p, clf))
        dump(pipelined, get_path('general'))
        pipelined.fit(Xs, ys)
        dump(pipelined, get_path('specific'))
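As a usage sketch, one of the pipelines saved above could be loaded back with joblib and used for prediction; the path layout and the 'random_forest' name are taken from build_models(), and the helper below is hypothetical, not part of the original code.

import os
from joblib import load

def load_and_predict(rows, group='general', name='random_forest'):
    # Hypothetical helper: load a pipeline dumped by build_models() and predict.
    # `rows` must carry the same feature columns used during fitting.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        '../models/%s/%s.joblib' % (group, name))
    pipeline = load(path)
    return pipeline.predict(rows)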
    def test_get_dataset_information_all(self):
        app.config["TESTING"] = True
        self.app = app.test_client()

        # retrieve correct database response to request
        mock_response = test_read()

        # retrieve current API response to request
        api_response = dataset.read()

        # assert that the two output the same
        self.assertEqual(mock_response, api_response)
    def test_get_dataset_information_specific(self):
        app.config["TESTING"] = True
        self.app = app.test_client()

        # retrieve correct database response to request
        mock_response = test_read(disease_name="breast invasive carcinoma")

        # retrieve current API response to request
        api_response = dataset.read(disease_name="breast invasive carcinoma")

        # assert that the two output the same
        self.assertEqual(mock_response, api_response)
Example #6
def init():
    global seq2seq, train, test
    input_words, output_words = dataset.read()

    # Creating the network model
    seq2seq = model.AttentionSeq2Seq(input_words, output_words)
    if train:
        seq2seq.train()
        train = False

    if test:
        seq2seq.test()
def main(domainxml, trainingsetcsv, restrictionstxt):
    restrictions = dataset.restrictions_from_text(restrictionstxt)

    cols, data = dataset.read(trainingsetcsv.read(), restrictions)
    # call train function with:
    #   `col_sets` - list of sets per column, NOT including class label
    #   `data` (list of ([train data], class))
    tree = Node("swole", ("true", Label("protein and starches")),
            ("false", Label("sugar"))) # dummy temp tree
    tree = c45.run(data, list(enumerate(cols)), 0)
    tree_xml = stringify_tree(tree)
    sys.stdout.buffer.write(tree_xml)
Example #8
def train():
    """
	
	:return: 
	"""
    # Read the dataset directory
    filenames = os.listdir(datasets_dir)
    # Keep only the pickled dataset files
    filenames = [f for f in filenames if os.path.splitext(f)[1] == '.pickle']

    logits = inference.inference(image_holder, reuse=False)
    global_step = tf.Variable(0, trainable=False)
    # Define the exponential moving average over the trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE, global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())
    # Compute the loss value
    loss = inference.loss(logits, label_holder)
    # Decay the learning rate before applying the backpropagation step
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE,
                                               global_step,
                                               MAX_STEPS,
                                               decay_rate=LEARNING_RATE_DECAY)
    # Define the backpropagation (training) step
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    # Update parameters with both the training step and the moving averages
    train_op = tf.group(train_step, variable_averages_op)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.train.start_queue_runners()
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)
        for step in range(MAX_STEPS):
            for filename in filenames:
                train_image, train_label = dataset.read(filename)
                assert isinstance(train_image, list)
                assert isinstance(train_label, list)
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={
                                             image_holder: train_image,
                                             label_holder: train_label
                                         })
            if step % 2 == 0:
                print("after %d steps, the loss value is %g" %
                      (step, loss_value))
                saver.save(sess, models_file, global_step=step)
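The train() snippet above relies on placeholders and module-level constants defined elsewhere; the sketch below shows one plausible setup, where every name, shape, and value is an assumption rather than the original definition.

import os
import tensorflow as tf

datasets_dir = './datasets'                      # assumed location of the .pickle files
models_dir = './models'                          # assumed checkpoint directory
models_file = os.path.join(models_dir, 'model.ckpt')

MOVING_AVERAGE = 0.99        # assumed moving-average decay
LEARNING_RATE_BASE = 0.1     # assumed initial learning rate
LEARNING_RATE_DECAY = 0.96   # assumed decay rate
MAX_STEPS = 1000             # assumed number of training steps

# Placeholders fed with the lists returned by dataset.read(filename); shapes are guesses.
image_holder = tf.placeholder(tf.float32, [None, 224, 224, 3])
label_holder = tf.placeholder(tf.int32, [None])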
def test_get_eprint_xml(t, eprint_url, auth_type, username, secret, collection_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    ok = dataset.init(collection_name)
    if ok == False:
        t.error(f"Can't initialize {collection_name}")
        return
    t.verbose_off() # keep verbose output off (enable for debugging)
    test_name = t.test_name()
    cfg = eprinttools.cfg(eprint_url, auth_type, username, secret, collection_name)
    keys = eprinttools.get_keys(cfg)
    if len(keys) == 0:
        t.error(f"Can't test {test_name} without keys, got zero keys")
        return

    collection_keys = []
    check_keys = []
    for i in range(100):
        key = random.choice(keys)
        if key not in check_keys:
            check_keys.append(key)
        if len(check_keys) > 50:
            break
    t.print(f"Calculating the keys in sample that will get stored in the collection {collection_name}")
    for key in check_keys:
        # We are going to try to get the metadata for the EPrint record but not store it in a dataset collection...
        ok = eprinttools.get_eprint_xml(cfg, key)
        e_msg = eprinttools.error_message()
        if ok == False or e_msg != "":
            if e_msg.startswith("401") == False:
                t.error(f"Expected data for {key}, got {ok}, {e_msg}")
            else:
                t.print(f"found {key}, requires authentication")
        else:
            t.print(f"found {key} with data, checking dataset for record")
            data = dataset.read(collection_name, key)
            e_msg = dataset.error_message()
            if len(data) == 0:
                t.error(f"{key} in {collection_name} empty record, {e_msg}")
            if e_msg != "":
                t.error(f"{key} in {collection_name} error, {e_msg}")
def main(to_classify_csv, decision_tree_xml, restrictionstxt, has_label_column):
    # how are we supposed to determine if this has a label column or not?
    # I guess we could look at the number of unique edge labels in decision tree
    #   to determine features/
    tree = model.build_tree(decision_tree_xml.read())
    restrictions = dataset.restrictions_from_text(restrictionstxt)
    cols, data = dataset.read(to_classify_csv.read(), has_label_column,
            restrictions)

    predicted_classes = [tree.classify(x[0], cols) for x in data]
    labels = [x[1] for x in data]
    if has_label_column:
        print('Records:', len(data))
        print('Correctly classified:',
              sum(1 for p,l in zip(predicted_classes, labels) if p==l))
        print('Incorrectly classified:',
              sum(1 for p,l in zip(predicted_classes, labels) if p!=l))
        print('Accuracy:', sampling.accuracy(labels, predicted_classes))
        print('Error:', sampling.error_rate(labels, predicted_classes))
        print('Confusion matrix:')
        print(sampling.confusion_matrix(labels, predicted_classes))
    else:
        for i in range(len(predicted_classes)):
            print(data[i][0], predicted_classes[i])
Example #11
def main(args):
    # Determine which algorithms to perform
    algorithms = []
    if args.bf:
        algorithms.append(wrp.AlgorithmWrapper(bf.CONTENT))
    if args.nn:
        algorithms.append(wrp.AlgorithmWrapper(nn.CONTENT))
    if args.ni:
        algorithms.append(wrp.AlgorithmWrapper(ni.CONTENT))
    if args.mst:
        algorithms.append(wrp.AlgorithmWrapper(mst.CONTENT))
    if args.ci:
        algorithms.append(wrp.AlgorithmWrapper(ci.CONTENT))

    # Initialize plots
    fig_correct, fig_complex, plot_correct, plot_complex = init_plots(
        algorithms)

    # Execute correct command
    if args.cmd == 'read':
        datasets = dataset.read(args.path)
        for ds in datasets:
            for algorithm in algorithms:
                y1, y2 = analyse_algorithm(ds.adj, ds.order, algorithm,
                                           args.repeat)
                plot_correct.scatter(ds.order,
                                     y2,
                                     color=algorithm.color,
                                     alpha=0.5,
                                     s=0.5)
                plot_complex.scatter(ds.order,
                                     y1,
                                     color=algorithm.color,
                                     alpha=0.5,
                                     s=0.5)

    elif args.cmd == 'random':
        if args.write:
            if not os.path.exists('datasets'):
                os.makedirs('datasets')

        order = args.order  # reset n
        while order <= args.max:
            for i in range(args.trials):
                path = None
                if args.write:
                    path = "datasets/order_{}_trial_{}.dat".format(order, i)
                adj = dataset.generate(order, args.spread, path)
                for algorithm in algorithms:
                    y1, y2 = analyse_algorithm(adj, order, algorithm,
                                               args.repeat)
                    algorithm.x.append(order)
                    algorithm.complex.append(y1)
                    algorithm.working_complex.append(y1)
                    algorithm.correct.append(y2)
                    algorithm.working_correct.append(y2)

            for algorithm in algorithms:
                algorithm.avg_correct.append(
                    util.average(algorithm.working_correct))
                algorithm.avg_complex.append(
                    util.average(algorithm.working_complex))
                algorithm.avg_x.append(order)
                algorithm.working_correct.clear()
                algorithm.working_complex.clear()

            order += 1

        if args.plot:
            for algorithm in algorithms:
                # Plot correctness measure
                plot_correct.scatter(algorithm.x,
                                     algorithm.correct,
                                     color=algorithm.color,
                                     alpha=0.5,
                                     s=0.5)
                plot_correct.plot(algorithm.avg_x,
                                  algorithm.avg_correct,
                                  '-',
                                  color=algorithm.color,
                                  linewidth=0.5)
                fig_correct.savefig('Correctness',
                                    dpi=300,
                                    bbox_inches='tight')

                # Plot complexity measure
                plot_complex.scatter(algorithm.x,
                                     algorithm.complex,
                                     color=algorithm.color,
                                     alpha=0.5,
                                     s=0.5)
                plot_complex.plot(algorithm.avg_x,
                                  algorithm.avg_complex,
                                  '-',
                                  color=algorithm.color,
                                  linewidth=0.5)
                fig_complex.savefig('Complexity', dpi=300, bbox_inches='tight')
def get_wos_refs(new=True):
    # new=True will download everything from scratch and delete any existing records

    collection = 'wos_refs.ds'

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()

    #Run query to get scope of records
    token = os.environ['WOSTOK']
    headers = {'X-ApiKey': token, 'Content-type': 'application/json'}

    base_url = 'https://api.clarivate.com/api/wos/?databaseId=WOK'

    collected = dataset.has_key(collection, "captured")

    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]['captured']
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = (current - date)
        base_url = base_url + '&loadTimeSpan=' + str(diff.days) + 'D'

    url = base_url + '&count=1&firstRecord=1&usrQuery=OG=California%20Institute%20of%20Technology'

    incomplete = dataset.has_key(collection, "incomplete")

    if incomplete == True:
        query = dataset.read(collection, "incomplete")
        query_id = query[0]['incomplete']
        query = dataset.read(collection, "record_start")
        record_start = query[0]['record_start']
        query = dataset.read(collection, "record_count")
        record_count = query[0]['record_count']

    else:
        response = requests.get(url, headers=headers)
        response = response.json()
        record_count = response['QueryResult']['RecordsFound']
        print(record_count)
        query_id = response['QueryResult']['QueryID']

        dataset.create(collection, 'incomplete', {"incomplete": query_id})

        record_start = 1

        dataset.create(collection, 'record_start',
                       {"record_start": record_start})
        dataset.create(collection, 'record_count',
                       {"record_count": record_count})

    query_url = 'https://api.clarivate.com/api/wos/query/'

    while record_count > 0:
        print(record_start)
        if record_count > 100:
            url = query_url + str(query_id) + '?count=100&firstRecord=' +\
                str(record_start)
            response = requests.get(url, headers=headers)
            response = response.json()
            print(response)
            save_records(collection, response['Records']['records']['REC'])
            record_start = record_start + 100
            record_count = record_count - 100
            dataset.update(collection, 'record_start',
                           {"record_start": record_start})
            dataset.update(collection, 'record_count',
                           {"record_count": record_count})
        else:
            url = query_url + str(query_id) + '?count=' +\
                str(record_count) + '&firstRecord=' + str(record_start)
            response = requests.get(url, headers=headers)
            response = response.json()
            save_records(collection, response['Records']['records']['REC'])
            record_count = 0

    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, 'captured', record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, 'captured', record)
        if err != "":
            print(f"Unexpected error on create: {err}")

    dataset.delete(collection, 'incomplete')
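save_records() is called above but not shown; a rough sketch of what it might do follows. The 'UID' key field and the create-or-update choice are assumptions based on the dataset calls already used in this function.

def save_records(collection, records):
    # Hypothetical sketch: persist each WOS record under an assumed 'UID' key field.
    for rec in records:
        key = str(rec.get('UID', ''))
        if key == '':
            continue
        if dataset.has_key(collection, key):
            err = dataset.update(collection, key, rec)
        else:
            err = dataset.create(collection, key, rec)
        if err != "":
            print(f"Unexpected error saving {key}: {err}")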
Example #13
def create(dataset):
    return logged_in() and ds.read(dataset)
Example #14
def read(dataset, view):
    return ds.read(dataset)
# exists and be populated.
#
import dataset
import os

c = "Journals.ds"
keys = dataset.keys(c)
print("package clsrules")
print("")
print("var (")

# Generate a ISSN to Publisher Map
print("issnPublisher = map[string]string{")
for key in keys:
    try:
        rec, err = dataset.read(c, key)
    except Exception:
        rec, err = {"_Key": key, "publisher": ""}, ""
    if err != "":
        print(f"// ERROR ({key}): {err}")
    print(f"    \"{rec['_Key']}\":\"{rec['publisher']}\",")
print("}")
print("")

# Generate a ISSN to Publication Map
print("issnPublication = map[string]string{")
for key in keys:
    try:
        rec, err = dataset.read(c, key)
    except Exception:
        rec, err = {"_Key": key, "publication": ""}, ""
Example #16
import numpy as np
import os
import time

import tensorflow as tf

import dataset
import squeezenet

dataset = dataset.Data_set()
dataset.open('./face_photos',8)
dataset.shuffle()
#dataset.read(30)

#[None,224,224,3]

# x = tf.placeholder(tf.float32,shape=[None,224,224,3])
# y = tf.placeholder(tf.float32,shape=[None,5])
x, y = dataset.read()
# print(x)
x = tf.reshape(x,shape=[-1,224,224,3])
# x = tf.transpose(x,[0,3,1,2])
print(x)

class netInit(object):
    num_classes=5
    weight_decay=0.1
    batch_norm_decay=0.999
    
net = squeezenet.Squeezenet(netInit)
print('new net',net)
net = net.build(x, is_training=True)
print("build net",net)
# net = tf.reshape(net,[-1,3490*5])
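As an illustration only, a classification head could be attached to the built net as sketched below; the label placeholder, loss, and optimiser are assumptions (not part of the original snippet) and presume net outputs logits with netInit.num_classes columns.

# Hypothetical training head for the network built above.
labels = tf.placeholder(tf.float32, shape=[None, netInit.num_classes])
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=net)
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)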
Example #17
    stop(err)
err = dataset.create("friends.ds", "mojo", {
    "name": "Mojo Sam, the Yudoo Man",
    "email": "*****@*****.**"
})
if err != '':
    stop(err)
err = dataset.create("friends.ds", "jack", {
    "name": "Jack Flanders",
    "email": "*****@*****.**"
})
if err != '':
    stop(err)

## read
(frieda_profile, err) = dataset.read("friends.ds", "frieda")
if err != '':
    stop(err)
(mojo_profile, err) = dataset.read("friends.ds", "mojo")
if err != '':
    stop(err)
(jack_profile, err) = dataset.read("friends.ds", "jack")
if err != '':
    stop(err)

## update

frieda_profile["catch_phrase"] = "Wowee Zowee"
mojo_profile["catch_phrase"] = "Feet Don't Fail Me Now!"
jack_profile["catch_phrase"] = "What is coming at you is coming from you"
Example #18
def read_dataset():
    return dataset.read()
Example #19
def create(dataset):
    return logged_in() and ds.read(dataset)
Example #20
import tensorflow as tf
import numpy as np
import model
import dataset
import time

import config as myconfig


data = dataset.read()

batch_size = 32
learning_rate = 0.0001
beta1 = 0.5
z_size = 5
save_interval = 10

###  input variables
z = tf.placeholder(tf.float32, [batch_size, z_size])
a = tf.placeholder(tf.float32, [batch_size, 32, 32, 32, 1])
rgba = tf.placeholder(tf.float32, [batch_size, 32, 32, 32, 4])
train = tf.placeholder(tf.bool)

### build models
G = model.Generator(z_size)
D = model.Discriminator()

rgba_ = G(a, z, train)
y_ = D(rgba_, train)
y = D(rgba, train)
Example #21
def read(dataset, view):
    return ds.read(dataset)