def test_init(self):
     fetcher = DataFetcher(filepath)
     fetcher = DataFetcher(filepath + "/")
     with self.assertRaises(IOError):
         fetcher = DataFetcher(filepath + "/test")
         fetcher = DataFetcher("/test")
         fetcher = DataFetcher("")
Example #3
def main(Selected_Stock, Trading_Day):

    fetcher = DataFetcher()

    fetch_result = fetcher.getHistoricalData(Selected_Stock)
    if fetch_result == -1:
        raise Exception("NO INTERNET CONNECTIVITY OR INVALID STOCK SYMBOL")
    dir_name = os.path.dirname(os.path.abspath(__file__))

    CSVFile = os.path.join(dir_name, "Dataset", Selected_Stock + ".csv")

    ohclv_data = list(getData(CSVFile))

    #current_data = regression(ohclv_data)
    #ohclv_data.append(current_data)
    ohclv_data = np.array(ohclv_data)

    X, y = prepareData(ohclv_data, Trading_Day)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    model = RandomForestClassifier(n_estimators=35, criterion="gini")
    model.fit(Xtrain, ytrain)

    y_pred = model.predict(Xtest)
    output = model.predict(X[-1].reshape(1, -1))

    Eval = Evaluator(Xtest, ytest, y_pred, model)
    accuracy, recall, precision, specificity = Eval.getPerformanceMetrics()

    print(accuracy)
    print(recall)
    print(precision)
    print(specificity)
    input("Press enter to generate OOB vs number of estimators graph:")
    Eval.oob_vs_n_trees(100, Xtrain, ytrain)
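A hedged usage sketch for the snippet above; the ticker symbol and trading horizon below are placeholders, not values from the original project.

# Hypothetical invocation: "AAPL" and 5 are placeholder arguments for
# Selected_Stock and Trading_Day.
if __name__ == "__main__":
    main("AAPL", 5)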
Example #4
 def rule_based(self, issues):
     """
     This method applies rule-based algorithms to predict labels
     Args:
         issues(list): a list of issue numbers
     Return:
         rule_based_predictions(list of lists): labels which satisfy rules
     """
     DF = DataFetcher()
     df_test = DF.fetch_issues(issues)
     rule_based_predictions = []
     for i in range(len(issues)):
         # extract every issue's title
         row = df_test.loc[i, 'title']
         # apply rule-based algorithms
         single_issue_predictions = []
         if "feature request" in row.lower():
             single_issue_predictions.append("Feature")
         if "c++" in row.lower():
             single_issue_predictions.append("C++")
         tokens = self.tokenize(row)
         for k, v in self.keywords.items():
             for keyword in v:
                 if keyword in tokens:
                     single_issue_predictions.append(k)
         rule_based_predictions.append(single_issue_predictions)
     return rule_based_predictions
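A minimal usage sketch for rule_based(); the enclosing class name and the issue numbers are assumptions, not taken from the original code.

# Hypothetical usage: `Predictor` stands in for whatever class defines
# rule_based() and tokenize(); the issue numbers are placeholders.
predictor = Predictor()
issues = [11925, 11924]
for issue, labels in zip(issues, predictor.rule_based(issues)):
    print("issue {}: {}".format(issue, labels))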
def main(Selected_Stock, Trading_Day):

	fetcher = DataFetcher()
	#fetch_result = fetcher.getHistoricalData(Selected_Stock)

	dir_name = os.path.dirname(os.path.abspath(__file__))
	CSVFile = os.path.join(dir_name,Selected_Stock + ".csv")

	if os.path.isfile(CSVFile):
		last_modified_date = datetime.fromtimestamp(os.path.getmtime(CSVFile))
		
	current_data = fetcher.getCurrentData(Selected_Stock,"ohclv")

	ohclv_data = list(getData(CSVFile))
	ohclv_data.append(current_data)
	ohclv_data = np.array(ohclv_data)


	X,y = prepareData(ohclv_data, Trading_Day)
	Xtrain,Xtest,ytrain,ytest = train_test_split(X,y)
	model = RandomForestClassifier(n_estimators = 30,criterion = "gini")
	model.fit(Xtrain,ytrain)

	y_pred = model.predict(Xtest)
	accuracy = accuracy_score(ytest,y_pred)
	precision = precision_score(ytest,y_pred)
	recall = recall_score(ytest,y_pred)
	specificity = specificity_score(ytest,y_pred)
	output = model.predict(X[-1].reshape(1,-1))
	return accuracy, output,current_data,precision,recall, specificity
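specificity_score is not a scikit-learn function, so it is presumably defined elsewhere in that project; below is a minimal sketch of what such a helper could look like, assuming binary 0/1 labels as produced above.

from sklearn.metrics import confusion_matrix

def specificity_score(y_true, y_pred):
    # Specificity = TN / (TN + FP), i.e. recall of the negative class,
    # assuming binary labels.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)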
Example #6
def main():
    fetcher = DataFetcher("cookies.txt")
    data = fetcher.getItems(
        "https://www.couchsurfing.com/members/hosts?utf8=%E2%9C%93&search_query=Curitiba%2C+Brazil&latitude=-25.4244287&longitude=-49.2653819&country=Brazil&region=south-america&date_modal_dismissed=true&arrival_date=&departure_date=&num_guests=1&has_references=1&can_host%5Baccepting_guests%5D=1&last_login=Anytime&join_date=Anytime&gender=All&min_age=&max_age=&languages_spoken=&interests=&smoking=No+Preference&radius=10&keyword=&host_sort=Best+Match&button=&perPage=100",
        "h3",
        className="-name")
    usuarios = [
        Host(u.a.string, u.a['href'][len("/users/"):], u.a['href'])
        for u in data
    ]

    arquivo = open("usuarios.csv", "w")
    arquivo_usuarios = csv.DictWriter(arquivo,
                                      fieldnames=["nome", "id", "endereco"],
                                      lineterminator='\n')
    arquivo_usuarios.writeheader()

    for user in usuarios:
        arquivo_usuarios.writerow({
            'nome': user.nome,
            'id': user.id,
            'endereco': user.endereco.strip()
        })

    arquivo.close()
Example #7
def init_tweetsprocessor():
    global tweetsProcessor
    dataFetcher = DataFetcher()
    #tweets_dict = dataFetcher.run_all()
    dataFetcher.parse_data()
    tweets_dict = dataFetcher.get_dict()
    tweetsProcessor = TweetsProcessor(tweets_dict)
    tweetsProcessor.prepare()
Example #8
 def train(self):
     """
     This method trains and saves the models.
     It has 5 steps:
     1. Fetch issues
     2. Clean data
     3. Word embedding
     4. Train models
     5. Save models
     """
     logging.info("Start training issues of general labels")
     # Step1: Fetch issues with general labels
     logging.info("Fetching Data..")
     DF = DataFetcher()
     filename = DF.data2json('all', self.labels, False)
     # Step2: Clean data
     logging.info("Cleaning Data..")
     SP = SentenceParser()
     SP.read_file(filename, 'json')
     SP.clean_body('body', True, True)
     SP.merge_column(['title', 'title', 'title', 'body'], 'train')
     text = SP.process_text('train', True, False, True)
     df = SP.data
     # Step3: Word Embedding
     logging.info("Word Embedding..")
     # tv = TfidfVectorizer(min_df=0.00009, ngram_range=(1, 3), max_features=10000)
     tv = self.tv
     X = tv.fit_transform(text).toarray()
     # Labels
     labels = SP.data['labels']
     le = LabelEncoder()
     Y = le.fit_transform(labels)
     # Step4: Train Classifier
     # SVC, kernel = 'rbf'
     logging.info("Training Data..")
     # clf = SVC(gamma=0.5, C=100, probability=True)
     clf = self.clf
     clf.fit(X, Y)
     # Step5: save models
     logging.info("Saving Models..")
     with open(os.path.join(self.tmp_dir.name, 'Vectorizer.p'),
               'wb') as tv_file:
         pickle.dump(tv, tv_file)
     with open(os.path.join(self.tmp_dir.name, 'Classifier.p'),
               'wb') as clf_file:
         pickle.dump(clf, clf_file)
     with open(os.path.join(self.tmp_dir.name, 'Labels.p'),
               'wb') as labels_file:
         pickle.dump(labels, labels_file)
     logging.info("Completed!")
     return self.tmp_dir
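A hedged counterpart to Step 5: loading the pickled artifacts back. The directory argument is assumed to be the path of the temporary directory returned by train() (i.e. tmp_dir.name); the file names match the ones pickled above.

import os
import pickle

def load_models(model_dir):
    # Load the vectorizer, classifier and label list saved by train().
    with open(os.path.join(model_dir, 'Vectorizer.p'), 'rb') as tv_file:
        tv = pickle.load(tv_file)
    with open(os.path.join(model_dir, 'Classifier.p'), 'rb') as clf_file:
        clf = pickle.load(clf_file)
    with open(os.path.join(model_dir, 'Labels.p'), 'rb') as labels_file:
        labels = pickle.load(labels_file)
    return tv, clf, labels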
    def test_concat(self):
        fetcher = DataFetcher(filepath)
        metadata = fetcher._DataFetcher__fetch_metadata()
        metadata = metadata.sample(n=5)
        left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)

        # test concat for averaged data
        left_avg = _average_trials(left)
        left_concat = fetcher._DataFetcher__concat(left_avg)
        order = fetcher.get_comp_order()
        assert len(left_concat.keys()
                   ) == 1, "Too many keys in dictionary after concatenation."
        assert "concat" in left_concat.keys(
        ), "'concat' is no key in the dictionary."
        assert left_concat["concat"].iloc[:, :103].equals(
            left_avg[order[0]]
        ), "Component {} does not match in concatenated dict.".format(order[0])
        nextIndex = 103
        lastColumn = left_avg[order[0]].iloc[:, -1]
        for component in order[1:]:
            carry = lastColumn - left_avg[component].iloc[:, 2]
            assert left_concat["concat"].iloc[:, nextIndex:(
                nextIndex + 101)].equals(
                    left_avg[component].iloc[:, 2:].add(carry, axis="index")
                ), "Component {} does not match in concatenated dict.".format(
                    component)
            nextIndex += 101
            lastColumn = left_avg[component].iloc[:, -1].add(carry,
                                                             axis="index")

        # test for non-averaged data
        right_concat = fetcher._DataFetcher__concat(right)
        order = fetcher.get_comp_order()
        assert len(right_concat.keys()
                   ) == 1, "Too many keys in dictionary after concatenation."
        assert "concat" in right_concat.keys(
        ), "'concat' is no key in the dictionary."
        assert right_concat["concat"].iloc[:, :104].equals(
            right[order[0]]
        ), "Component {} does not match in concatenated dict.".format(order[0])
        nextIndex = 104
        lastColumn = right[order[0]].iloc[:, -1]
        for component in order[1:]:
            carry = lastColumn - right[component].iloc[:, 3]
            assert right_concat["concat"].iloc[:, nextIndex:(
                nextIndex + 101)].equals(
                    right[component].iloc[:, 3:].add(carry, axis="index")
                ), "Component {} does not match in concatenated dict.".format(
                    component)
            nextIndex += 101
            lastColumn = right[component].iloc[:, -1].add(carry, axis="index")
Example #10
 def fetch_content(self, id, json_string):
     """ fetch contents from given json string
         store them in the index map """
     json_object = json.loads(json_string)
     url = json_object['url']
     html = json_object['content']
     df = DF(html)
     words = df.get_words()
     biwords = df.get_biwords()
     triwords = df.get_triwords()
     positions = df.get_position()
     checksum = df.get_checksum()
     # === check duplicate ===
     self.check_duplicate(id, checksum)
     # =======================
     self.map_doc_id[id] = url
     for word, count in words.items():
         posting = Posting(id, word, count, positions[word])
         self.map[word].append(posting)
     for biword,count in biwords.items():
         biword_posting = Posting(id, biword, count, 0)
         self.biword_map[biword].append(biword_posting)
     for triword, count in triwords.items():
         triword_posting = Posting(id, triword, count, 0)
         self.triword_map[triword].append(triword_posting)
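A hypothetical lookup sketch for the inverted index built above; the attribute names on Posting (doc_id, count) are assumptions, since the Posting class is not shown.

def lookup(index, word):
    # Return (document URL, term count) pairs for every document whose
    # posting list contains `word`. `index` is an instance of the class
    # that owns fetch_content(); Posting.doc_id / Posting.count are assumed names.
    return [(index.map_doc_id[posting.doc_id], posting.count)
            for posting in index.map.get(word, [])]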
Example #11
 def ml_predict(self, issues, threshold=0.3):
     """
     This method applies machine learning algorithms to predict labels
     Args:
         issues(list): a list of issue numbers
         threshold(float): threshold of probability
     Return:
         ml_predictions(list of lists): predictions
     """
     # step1: fetch data
     DF = DataFetcher()
     df_test = DF.fetch_issues(issues)
     # step2: data cleaning
     SP = SentenceParser()
     SP.data = df_test
     SP.clean_body('body', True, True)
     SP.merge_column(['title', 'title', 'title', 'body'], 'train')
     test_text = SP.process_text('train', True, False, True)
     # step3: word embedding
     test_data_tfidf = self.tv.transform(test_text).toarray()
     le = LabelEncoder()
     le.fit_transform(self.labels)
     # step4: classification
     probs = self.clf.predict_proba(test_data_tfidf)
      # pick the top 2 predictions that exceed the threshold
     best_n = np.argsort(probs, axis=1)[:, -2:]
     ml_predictions = []
     for i in range(len(best_n)):
         # INFO:Predictor:issue:11919,Performance:0.47353076240017744,Question:0.2440056213336274
         logging.info("issue:{}, {}:{}, {}:{}".format(
             str(issues[i]), str(le.classes_[best_n[i][-1]]),
             str(probs[i][best_n[i][-1]]), str(le.classes_[best_n[i][-2]]),
             str(probs[i][best_n[i][-2]])))
         single_issue_predictions = [
             le.classes_[best_n[i][j]] for j in range(-1, -3, -1)
             if probs[i][best_n[i][j]] > threshold
         ]
         ml_predictions.append(single_issue_predictions)
     return ml_predictions
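A hedged sketch of how the two prediction paths above could be combined per issue; `predictor` and the issue numbers are placeholders, not part of the original code.

# Hypothetical combination: union the rule-based and ML labels for each issue.
issues = [11925, 11924]
combined = [
    sorted(set(rule_labels) | set(ml_labels))
    for rule_labels, ml_labels in zip(predictor.rule_based(issues),
                                      predictor.ml_predict(issues, threshold=0.3))
]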
Example #12
def getDataset():
    """
    This function obtains the datasets:
        * First it reads the arguments, checks that they are valid and formats them.
        * It reports that the data is being downloaded.
        * It builds a dictionary of datasets with all the datasets obtained.

    :return:
    """
    args = ArgsToDict()
    linkCommand = getArg(args, LinkCommand)
    sensorCommand = getArg(args, SensorCommand, [0])
    dayCommand = getArg(args, DayCommand)
    toDayCommand = getArg(args, ToDayCommand)

    sensorText = ", ".join(str(x) for x in sensorCommand)

    if (toDayCommand != DefaultValue):
        dayText = "from day " + dayCommand.strftime(
            "%d/%m/%y") + " to " + toDayCommand.strftime("%d/%m/%y")
    else:
        dayText = "of day " + dayCommand.strftime("%d/%m/%y")

    print "Downloading dataset from {0} of sensors: {1} {2} ...".format(
        linkCommand if linkCommand != DefaultValue else
        ("https://ocwitic.epsem.upc.edu/assignatures/tecpro/laboratori-material/dadespractica6/"
         ), sensorText, dayText)

    if (linkCommand == DefaultValue):
        df = DataFetcher()
    else:
        df = DataFetcher(linkCommand)

    datasetDict = dict()
    if (toDayCommand == DefaultValue):
        for sensor in sensorCommand:
            datasetDict[sensor] = df.fetch(dayCommand, sensor)
    else:
        for sensor in sensorCommand:
            datasetDict[sensor] = df.fetch_interval(dayCommand, toDayCommand,
                                                    sensor)

    return datasetDict
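A hypothetical command-line driver for getDataset(); it only assumes that each fetched dataset supports len().

if __name__ == '__main__':
    # Placeholder driver: report how many rows were fetched per sensor.
    for sensor, dataset in getDataset().items():
        print("sensor {0}: {1} rows".format(sensor, len(dataset)))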
Example #13
from config import FLAGS
from DataFetcher import DataFetcher
import pickle

os.environ['CUDA_VISIBLE_DEVICES']='2'
test_top_k = True
test_range_query = True

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)


# Load data
data_fetcher = DataFetcher(FLAGS.dataset)
dataset = tf.data.Dataset.from_generator(data_fetcher.get_train_data, 
                                         (tf.int64, tf.float32, tf.int64,
                                          tf.int64, tf.float32, tf.int64,
                                          tf.int32, tf.float32, tf.float32), 
                                          (tf.TensorShape([None,2]), 
                                           tf.TensorShape([None]), 
                                           tf.TensorShape([2]),
                                           tf.TensorShape([None,2]), 
                                           tf.TensorShape([None]), 
                                           tf.TensorShape([2]),
                                           tf.TensorShape([(1+FLAGS.k)*FLAGS.batchsize]),
                                           tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),
                                           tf.TensorShape([FLAGS.batchsize, FLAGS.k]),
                                           ))
dataset = dataset.prefetch(buffer_size=1)
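A minimal TF1-style sketch of pulling one prefetched batch from the generator-backed dataset above; it assumes the rest of this script uses the usual TF1 graph-and-session setup.

# Hypothetical consumption sketch: fetch a single training batch.
next_batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    batch = sess.run(next_batch)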
Example #14
import os

import random

from utils import *
from graphHashFunctions import GraphHash_Emb_Code
from config import FLAGS
from DataFetcher import DataFetcher

os.environ['CUDA_VISIBLE_DEVICES']='2'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True


""" Load datafetcher only to get node_feature_dim, probably should use more efficient way to do that in future """
data_fetcher = DataFetcher(dataset=FLAGS.dataset, exact_ged=True, wrp_train_graph = False)
node_feature_dim = data_fetcher.get_node_feature_dim()
# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32, shape=(None, node_feature_dim)),
#    'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'graph_sizes': tf.placeholder(tf.int32, shape=(1)),
#    'graph_sizes': tf.placeholder(tf.int32, shape=(FLAGS.batchsize*(1+FLAGS.k))),
#    'generated_labels':tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.k)),
    'thres':tf.placeholder(tf.float32, shape=(FLAGS.hash_code_len))
}

thres = np.zeros(FLAGS.hash_code_len)
Example #15
    # cosine similarity + 5 / (lat/lng-based distance between the two locations + 5)
    if np.linalg.norm(t_a_p) == 0. or np.linalg.norm(t_b_p) == 0.:
      return 0.
    else:
      return np.dot(t_a_p, t_b_p) / (np.linalg.norm(t_a_p) * np.linalg.norm(t_b_p)) + 5. / (sqrt((lat_a - lat_b) ** 2 + (lng_a - lng_b) ** 2) + 5.)

  def get_all_feature_vectors(self):
    return self.id_to_vec_dict

  def word_list_from_tweet(self, tweet):
    return tweet.lower().strip().split()

# for testing
if __name__ == '__main__':

  data_fetcher = DataFetcher()
  tweets = data_fetcher.run_all()
  print(len(tweets))
  tp = TweetsProcessor(tweets)
  tp.prepare()

  print('Start testing mode, input a fake tweet, for example:')
  print('hello what is your name I am fine thank you USA america I am a happy girl, will return')
  similar_tweets = tp.get_similar_tweets('hello what is your name I am fine thank you USA america I am a happy girl')
  print('============================')
  print('Similar Ones:')
  for tweet in similar_tweets:
    print(tweet)
    print('----------------------------')
  print('Try it yourself!')
  print('')
Example #16
#!/usr/bin/python3

from Environment import Environment
from DataFetcher import DataFetcher

from requests import post
from typing import List, Tuple, Dict, Any

if __name__ == '__main__':
    DataFetcher.write_csv_data_to_file_system()

    environment: Environment = Environment('Data', 'JSON', 'Plots', 'Logs')

    environment.begin_log()

    linear_regression_training_results: List[
        Tuple[Any]] = environment.linear_regression(5000, 0.1)
    polynomial_regression_training_results: List[
        Tuple[Any]] = environment.polynomial_regression(5000, 0.1, 10, 10)
    logistic_polynomial_regression_training_results: List[
        Tuple[Any]] = environment.logistic_polynomial_regression(5000, 0.1)

    environment.end_log()

    training_results_json_object: Dict[str, Dict[str, Dict[str, Dict[str, Dict[
        str, Any]]]]] = environment.create_training_results_json([
            (linear_regression_training_results, 'Linear'),
            (polynomial_regression_training_results, 'Polynomial'),
            (logistic_polynomial_regression_training_results,
             'Logistic Polynomial')
        ])
Example #17
#!/usr/bin/python -u
import sys
import requests
from DataFetcher import DataFetcher
from dotenv import load_dotenv

load_dotenv()

dataFet = DataFetcher()
dataFet.init()
dataFet.fetch()
Example #18
    def test_data_fetch_and_sample(self):
        fetcher = DataFetcher(filepath)
        metadata = fetcher._DataFetcher__fetch_metadata()
        metadata = metadata.sample(n=5)

        # test data fetch & selection for processed data
        left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)
        for component in fetcher.get_comp_order():
            assert component in left.keys() and component in right.keys(
            ), "Error: {} not available in the dictionary.".format(component)
            assert left[component]["SESSION_ID"].isin(
                metadata["SESSION_ID"]
            ).all(
            ) == True, "Fetched data that was not requested for component {}.".format(
                component)
            assert right[component]["SESSION_ID"].isin(
                metadata["SESSION_ID"]
            ).all(
            ) == True, "Fetched data that was not requested for component {}.".format(
                component)
        filelist = [
            "GRF_COP_AP_", "GRF_COP_ML_", "GRF_F_AP_", "GRF_F_ML_", "GRF_F_V_"
        ]
        for element in filelist:
            component_name = element[element.index("_") + 1:-1].lower()
            test_data = pd.read_csv(filepath + "/" + element + "PRO_right.csv")
            test_data = test_data.astype({test_data.columns[3]: 'float64'})
            test_data = test_data[test_data["SESSION_ID"].isin(
                metadata["SESSION_ID"])].reset_index(drop=True)
            assert test_data.equals(
                right[component_name]
            ), "Data for {} component differs from original one.".format(
                component_name)

        # test sampling for processed data
        left_sampled = _sample(left, stepsize=1, raw=False)
        assert _DFdicts_equal(
            left_sampled, left), "There should be no sampling if stepsize = 1."
        left_sampled1 = _sample(left, stepsize=2, raw=False)
        left_sampled2 = _sample(left, stepsize=3, raw=False)
        left_sampled3 = _sample(left, stepsize=10, raw=False)
        for component in left_sampled.keys():
            assert left_sampled1[component].shape[1] == (int)(np.ceil(
                (left[component].shape[1] - 3) /
                2)) + 3, "Size after sampling not appropriate."
            assert left_sampled2[component].shape[1] == (int)(np.ceil(
                (left[component].shape[1] - 3) /
                3)) + 3, "Size after sampling not appropriate."
            assert left_sampled3[component].shape[1] == (int)(np.ceil(
                (left[component].shape[1] - 3) /
                10)) + 3, "Size after sampling not appropriate."
            time_steps = left_sampled2[component].shape[1]
            for i in range(3, time_steps):
                assert left_sampled2[component].iloc[:, i].equals(
                    left[component].iloc[:, i + 2 * (i - 3)]
                ), "Sampled data does not match original data."

        # test data fetch & selection for raw data
        left, right = fetcher._DataFetcher__fetch_data(metadata, raw=True)
        for component in fetcher.get_comp_order():
            assert component in left.keys() and component in right.keys(
            ), "Error: {} not available in the dictionary.".format(component)
            assert left[component]["SESSION_ID"].isin(
                metadata["SESSION_ID"]
            ).all(
            ) == True, "Fetched data that was not requested for component {}.".format(
                component)
            assert right[component]["SESSION_ID"].isin(
                metadata["SESSION_ID"]
            ).all(
            ) == True, "Fetched data that was not requested for component {}.".format(
                component)
        filelist = [
            "GRF_COP_AP_", "GRF_COP_ML_", "GRF_F_AP_", "GRF_F_ML_", "GRF_F_V_"
        ]
        for element in filelist:
            component_name = element[element.index("_") + 1:-1].lower()
            test_data = pd.read_csv(filepath + "/" + element + "RAW_right.csv")
            test_data = test_data[test_data["SESSION_ID"].isin(
                metadata["SESSION_ID"])].reset_index(drop=True)
            assert test_data.equals(
                right[component_name]
            ), "Data for {} component differs from original one.".format(
                component_name)

        # test sampling for raw data
        left_sampled1 = _sample(left, stepsize=2, raw=True)
        left_sampled2 = _sample(left, stepsize=3, raw=True)
        left_sampled3 = _sample(left, stepsize=10, raw=True)
        for component in left_sampled1.keys():
            assert left_sampled1[component].shape[1] == (
                int)(100 / 2 + 3), "Size after sampling not appropriate."
            assert left_sampled2[component].shape[1] == (
                int)(100 / 3 + 3), "Size after sampling not appropriate."
            assert left_sampled3[component].shape[1] == (
                int)(100 / 10 + 3), "Size after sampling not appropriate."
Example #19
from DataFetcher import DataFetcher
D = DataFetcher('PTC')

a = D.get_data_train_in_clique(1)
a = D.get_data_train_in_clique(1)
a = D.get_data_train_in_clique(1)
Example #20
from graphHashFunctions import GraphHash_Naive
import numpy as np
from config import FLAGS
from DataFetcher import DataFetcher
import pickle


os.environ['CUDA_VISIBLE_DEVICES']='0'

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset)

# Some preprocessing

# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32, shape=(None, data_fetcher.get_node_feature_dim())),
    'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32),  # helper variable for sparse dropout
    'graph_sizes': tf.placeholder(tf.int32, shape=((1+FLAGS.k)*FLAGS.batchsize)),
    'learning_rate': tf.placeholder(tf.float32, shape=())
}

# Create model
Example #21
config.gpu_options.allow_growth = True

MinGED = 0
MaxGED = 11
MaxGraphNum = 4999900

if len(sys.argv) != 3:
    print('parameters are model_path, output_name')
    os._exit(0)
model_path = str(sys.argv[1])
output_fname = str(sys.argv[2])


""" Load datafetcher and model"""
print('restoring model...')
data_fetcher = DataFetcher(dataset=FLAGS.dataset, exact_ged=True, max_graph_num=MaxGraphNum)
node_feature_dim = data_fetcher.get_node_feature_dim()
# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32, shape=(None, node_feature_dim)),
#    'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'graph_sizes': tf.placeholder(tf.int32, shape=(FLAGS.ecd_batchsize)),
#    'graph_sizes': tf.placeholder(tf.int32, shape=(FLAGS.batchsize*(1+FLAGS.k))),
#    'generated_labels':tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.k)),
    'thres':tf.placeholder(tf.float32, shape=(FLAGS.hash_code_len))
}

thres = np.zeros(FLAGS.hash_code_len)
Example #22
    def test_average_trials_and_scale(self):
        fetcher = DataFetcher(filepath)
        metadata = fetcher._DataFetcher__fetch_metadata()
        metadata = metadata.sample(n=5)

        # test averaging
        left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)
        left_avg = _average_trials(left)
        for component in left.keys():
            assert left_avg[component].shape[
                0] == 5, "Averaging did not reduce data to unique SESSION_IDs."
            id = left_avg[component].iloc[0, :]["SESSION_ID"]
            left[component] = left[component][left[component]["SESSION_ID"] ==
                                              id]
            left_avg[component] = left_avg[component][left_avg[component]
                                                      ["SESSION_ID"] == id]
            assert left_avg[component].iloc[:, 2:].equals(
                left[component].iloc[:, 3:].mean(axis=0).to_frame().transpose(
                )), "Averaging did not produce the expected values."

        # test scaling for averaged data
        right_avg = _average_trials(_sample(right, stepsize=10, raw=False))
        left_avg = _average_trials(_sample(left, stepsize=10, raw=False))
        scaler = GRFScaler(featureRange=(0, 1))
        _fit_scaler(scaler, (right_avg, left_avg))
        right_scaled = _scale(scaler, right_avg)
        for component in right_scaled.keys():
            assert right_scaled[component].shape == right_avg[
                component].shape, "Shape does not match after scaling."
            data1 = right_avg[component].iloc[:, 2:]
            data2 = left_avg[component].iloc[:, 2:]
            dmin = min([data1.values.min(), data2.values.min()])
            dmax = max([data1.values.max(), data2.values.max()])
            data1 = data1.applymap(lambda x: (x - dmin) / (dmax - dmin))
            assert np.allclose(
                right_scaled[component].iloc[:, 2:].values,
                data1.values,
                rtol=1e-4,
                atol=1e-8), "Scaling does not produce the expected result."
            assert right_scaled[component].iloc[:, :2].equals(
                right_avg[component].iloc[:, :2]
            ), "Scaling messes up the meta-information columns."

        # test scaling for non-averaged data
        scaler = GRFScaler(featureRange=(0, 1))
        _fit_scaler(scaler, (right, left))
        left_scaled = _scale(scaler, left)
        for component in left_scaled.keys():
            assert left_scaled[component].shape == left[
                component].shape, "Shape does not match after scaling."
            data1 = right[component].iloc[:, 3:]
            data2 = left[component].iloc[:, 3:]
            dmin = min([data1.values.min(), data2.values.min()])
            dmax = max([data1.values.max(), data2.values.max()])
            data2 = data2.applymap(lambda x: (x - dmin) / (dmax - dmin))
            assert np.allclose(
                left_scaled[component].iloc[:, 3:].values,
                data2.values,
                rtol=1e-4,
                atol=1e-8), "Scaling does not produce the expected result."
            assert left_scaled[component].iloc[:, :3].equals(
                left[component].iloc[:, :3]
            ), "Scaling messes up the meta-information columns."
Example #23
class TestLabelBot(unittest.TestCase):
    def setUp(self):
        self.df = DataFetcher()
        self.df.repo = "apache/incubator-mxnet"
        self.df.github_user = "******"
        self.df.github_oauth_token = "123"

    def tearDown(self):
        pass

    def test_cleanstr(self):
        new_string = self.df.cleanstr("a_b", "")
        self.assertEqual(new_string, "ab")

    def test_count_pages(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [{
                "body":
                "issue's body",
                "created_at":
                "2018-07-28T18:27:17Z",
                "comments":
                "0",
                "number":
                11925,
                "labels": [{
                    'name': 'Doc'
                }],
                "state":
                "open",
                "title":
                "issue's title",
                "html_url":
                "https://github.com/apache/incubator-mxnet/issues/11925",
            }, {
                "body":
                "issue's body",
                "created_at":
                "2018-07-28T18:27:17Z",
                "comments":
                "0",
                "number":
                11924,
                "labels": [],
                "state":
                "closed",
                "title":
                "issue's title",
                "html_url":
                "https://github.com/apache/incubator-mxnet/issues/11925",
            }]
            page = self.df.count_pages('all')
            self.assertEqual(page, 1)

    def test_fetch_issues(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = {
                "body":
                "issue's body",
                "created_at":
                "2018-07-28T18:27:17Z",
                "comments":
                "0",
                "number":
                11925,
                "labels": [{
                    'name': 'Feature'
                }],
                "state":
                "open",
                "title":
                "issue's title",
                "html_url":
                "https://github.com/apache/incubator-mxnet/issues/11925",
            }
            data = self.df.fetch_issues([11925])
            expected_data = [{
                'id': "11925",
                'title': "issue's title",
                'body': "issue's body"
            }]
            assert_frame_equal(data, pd.DataFrame(expected_data))

    def test_data2json(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [{
                "body":
                "issue's body",
                "created_at":
                "2018-07-28T18:27:17Z",
                "comments":
                "0",
                "number":
                11925,
                "labels": [{
                    'name': 'Feature'
                }],
                "state":
                "open",
                "title":
                "issue's title",
                "html_url":
                "https://github.com/apache/incubator-mxnet/issues/11925",
            }, {
                "body":
                "issue's body",
                "created_at":
                "2018-07-28T18:27:17Z",
                "comments":
                "0",
                "number":
                11924,
                "labels": [],
                "state":
                "closed",
                "title":
                "issue's title",
                "html_url":
                "https://github.com/apache/incubator-mxnet/issues/11925",
            }]
            self.df.data2json('all', labels=["Feature"], other_labels=False)
            expected_data = [{
                'id': 11925,
                'title': "issue's title",
                'body': "issue's body",
                'labels': 'Feature'
            }]
            self.assertEqual(expected_data, self.df.json_data)
Example #24
model_name = "0211_All_"+FLAGS.dataset
model_path = "SavedModel/"+model_name+".ckpt"
saved_files_dir = "SavedModel"

""" Set random seed """
random_seed = 123
np.random.seed(random_seed)
tf.set_random_seed(random_seed)
seed(random_seed)



""" Create data fetcher """

#data_fetcher = DataFetcher(dataset = FLAGS.dataset, exact_ged = True)
data_fetcher = DataFetcher(dataset=FLAGS.dataset, exact_ged=True)
# wrap the data fetcher with tensorflow.dataset.prefetch to accelerate training
dataset = tf.data.Dataset.from_generator(data_fetcher.get_train_data, 
                                         (tf.int64, tf.float32, tf.int64,
                                          tf.int64, tf.float32, tf.int64,
                                          tf.int32, tf.float32, tf.float32), 
                                          (tf.TensorShape([None,2]),#feature, sparse index
                                           tf.TensorShape([None]), # feature sparse value
                                           tf.TensorShape([2]), # feature shape
                                           tf.TensorShape([None,2]),# laplacian sparse index
                                           tf.TensorShape([None]), #laplacian sparse value
                                           tf.TensorShape([2]), # laplacian sparse shape
                                           tf.TensorShape([(1+FLAGS.k)*FLAGS.batchsize]), #shape
                                           #tf.TensorShape([None]), # shape
                                           tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),#label
                                           tf.TensorShape([FLAGS.batchsize, FLAGS.k]),# gen_label
class AlgoManager():
    '''
    Attributes:
        self.StudyIDs
    '''
    def __init__(self, studyID, use_ssh=True):
        #create a datafetcher instance to fetch the data from the database
        self.data_fetcher = DataFetcher(use_ssh)

        self.n_bins = 10

        self.queryStudyID = studyID

    def get_contours_by_id(self, roi_index):
        return self.data_fetcher.get_contours_by_id(self.queryStudyID, roi_index)

    def feature_extraction(self):
        '''
        call the OVH, STS and TD routines to compute the OVH, STS and TD features
        :param StudyID:
        :return ovh: a histogram of the OVH feature
        :return sts: a histogram of the STS feature
        :return td: target dose
        '''
        # Both PTV and OAR are dictionaries
        PTV,OAR = self.data_fetcher.get_contours(self.queryStudyID)
        
        # Check that PTV has been found
        assert len(PTV.keys()) > 0 , "PTV NOT FOUND"

        row_spacing, column_spacing, slice_thickness = self.data_fetcher.get_spacing(self.queryStudyID)
        pixel_spacing = self.data_fetcher.get_pixel_spacing(self.queryStudyID)

        for ptv_name,ptv_tuple in PTV.items():
            for oar_name,oar_tuple in OAR.items():
                # in each tuple, the first element is the contour block and the second is the ROI block
                print("process the pair")
                oar_contour_block = oar_tuple[0]
                oar_roi_block = oar_tuple[1]

                ptv_contour_block = ptv_tuple[0]
                ptv_roi_block = ptv_tuple[1]
                bin_vals, bin_amts = getOVH(oar_roi_block, ptv_contour_block, ptv_roi_block, pixel_spacing,
                            row_spacing, column_spacing, slice_thickness, self.n_bins)

                ovh_hist = (bin_vals, bin_amts)

                # print("Get ovh {}".format(ovh_hist))
                print("OVH Done")
                elevation_bins, distance_bins, azimuth_bins, amounts = getSTSHistogram(ptv_roi_block, oar_roi_block, self.n_bins)
                sts_hist = (elevation_bins, distance_bins, azimuth_bins, amounts)

                print("STS Done")
                # print("Get Sts {}".format(sts_hist))

                self.data_fetcher.save_ovh(ptv_name,oar_name,ovh_hist,self.queryStudyID)
                self.data_fetcher.save_sts(ptv_name,oar_name,sts_hist,self.queryStudyID)

                print("Saved OVH and STS")
        pass

    def generate_pairs(self,queryStudy,dbStudy):
        '''
        match the queryStudy with dbStudy to generate pairs
        :param queryStudy: a dictionary, key is the name of OAR, the value is the histogram
        :param dbStudy: a dictionary, key is the name of OAR, the value is the histogram
        :return:
        {
            oar_id: (hist_query,hist_db)
        }
        '''
        queryKeys = set(queryStudy.keys())
        dbKeys = set(dbStudy.keys())
        mergedKeys = queryKeys.intersection(dbKeys)
        mergedDict = defaultdict()
        for key in mergedKeys:
            query_tuple = []
            for block in queryStudy[key]:
                # process amounts (2d array) separately
                if "]" in block:
                    query_values = block.replace("]", " ").replace("[", " ").replace(",", " ").split(" ")
                    query_array = np.zeros(shape=((self.n_bins ** 3 * 4)), dtype=np.float64)
                    
                    count = 0
                    for i, val in enumerate(query_values):
                        if val:
                            query_array[count] = float(val.strip())
                            count +=1
                    query_array = query_array.reshape((self.n_bins ** 3, 4))
                    if count != self.n_bins ** 3 * 4:
                        import pdb ; pdb.set_trace()
                    assert count == self.n_bins ** 3 * 4, "invalid parsed STS values"
                else:
                    query_values = block.split(",")
                    query_array = np.zeros(shape=(len(query_values)), dtype=np.float64)
                    for i, val in enumerate(query_values):
                        query_array[i] = float(val)
                query_tuple.append(query_array)
            
            historical_tuple = []
            for block in dbStudy[key]:
                # process amounts (2d array) separately
                if "]" in block:
                    query_values = block.replace("]", " ").replace("[", " ").replace(",", " ").split(" ")
                    query_array = np.zeros(shape=((self.n_bins ** 3 * 4)), dtype=np.float64)
                    
                    count = 0
                    for i, val in enumerate(query_values):
                        if val:
                            query_array[count] = float(val.strip())
                            count +=1
                    query_array = query_array.reshape((self.n_bins ** 3, 4))
                    assert count == self.n_bins ** 3 * 4, "invalid parsed STS values"
                else:
                    query_values = block.split(",")
                    query_array = np.zeros(shape=(len(query_values)), dtype=np.float64)
                    for i, val in enumerate(query_values):
                        query_array[i] = float(val)
                historical_tuple.append(query_array)

            mergedDict[key] = (query_tuple, historical_tuple)

        return mergedDict

    def similarity_calculation(self):
        '''
        fetch the OVH and STS features of the other studies,
        calculate the dissimilarity between features and the
        similarity between each study pair
        :return: dict with dissimilarity and similarity
        '''
        queryOVH = self.data_fetcher.get_ovh(self.queryStudyID)
        querySTS = self.data_fetcher.get_sts(self.queryStudyID)

        self.DBStudy_list = self.data_fetcher.get_dbstudy_list(self.queryStudyID)

        for studyID in self.DBStudy_list:
            historical_id = studyID["id"]
            dbOVH = self.data_fetcher.get_ovh(str(historical_id))
            ovh_pairs = self.generate_pairs(queryOVH,dbOVH)

            dbSTS = self.data_fetcher.get_sts(str(historical_id))
            
            sts_pairs = self.generate_pairs(querySTS,dbSTS)

            keys = ovh_pairs.keys()
            if len(keys) > 0:
                print("Processing similar pairs")

            for key in keys:
                ovh_item = ovh_pairs[key]
                ovh_dis = getOVHEmd(ovh_item[0][0],ovh_item[0][1],ovh_item[1][0],ovh_item[1][1])
                sts_item = sts_pairs[key]
                sts_dis = getSTSEmd(sts_item[0][3], sts_item[1][3])

                # Get PTV target dose
                query_target_dose = self.data_fetcher.get_target_dose(self.queryStudyID, int(key.split(" ")[1]))
                historical_target_dose = self.data_fetcher.get_target_dose(historical_id, int(key.split(" ")[1]))

                self.data_fetcher.save_similarity(str(historical_id), query_target_dose - 
                        historical_target_dose, str(ovh_dis), str(sts_dis), key.split(" ")[0], 
                    key.split(" ")[-1], str(historical_id), self.queryStudyID)


    # The entry point of the program
    def run(self):
        #extract OVH and STS for new case
        #store the OVH and STS
        #fetch OVH and STS of other cases
        #Do the similarity calculation
        #Save the result to database

        #Store the StudyID of all DB studies for future similarity calculation
        self.DBStudy_list = self.data_fetcher.get_dbstudy_list(self.queryStudyID)

        #calculate ovh,sts and save it to database
        self.feature_extraction()

        self.similarity_calculation()
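A hypothetical driver for the class above; the study ID is a placeholder, not a value from the original project.

if __name__ == '__main__':
    # Placeholder study ID; runs feature extraction and similarity calculation.
    manager = AlgoManager("12345", use_ssh=True)
    manager.run()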
Example #26
config.gpu_options.allow_growth = True

test_top_k = True
test_range_query = True

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

train = True

#chkp.print_tensors_in_checkpoint_file("SavedModel/model_rank.ckpt", tensor_name='', all_tensors=True, all_tensor_names=True)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset, True)
dataset = tf.data.Dataset.from_generator(
    data_fetcher.get_train_data,
    (tf.int64, tf.float32, tf.int64, tf.int64, tf.float32, tf.int64, tf.int32,
     tf.float32, tf.float32), (
         tf.TensorShape([None, 2]),
         tf.TensorShape([None]),
         tf.TensorShape([2]),
         tf.TensorShape([None, 2]),
         tf.TensorShape([None]),
         tf.TensorShape([2]),
         tf.TensorShape([(1 + FLAGS.k) * FLAGS.batchsize]),
         tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),
         tf.TensorShape([FLAGS.batchsize, FLAGS.k]),
     ))
dataset = dataset.prefetch(buffer_size=1)
Example #27
from utils import get_similar_graphs_gid, get_top_k_similar_graphs_gid
from graphHashFunctions import GraphHash_Rank_Reg
import numpy as np
from config import FLAGS
from DataFetcher import DataFetcher
import pickle

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset)

# Some preprocessing

# Define placeholders
placeholders = {
    'support':
    tf.sparse_placeholder(tf.float32),
    'features':
    tf.sparse_placeholder(tf.float32,
                          shape=(None, data_fetcher.get_node_feature_dim())),
    'labels':
    tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout':
    tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero':
Example #28
 def setUp(self):
     self.df = DataFetcher()
     self.df.repo = "apache/incubator-mxnet"
     self.df.github_user = "******"
     self.df.github_oauth_token = "123"
Example #29
    def test_arrange_and_format(self):
        fetcher = DataFetcher(filepath)
        metadata = fetcher._DataFetcher__fetch_metadata()
        metadata = metadata.sample(n=5)
        left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)

        affected, non_affected = _arrange_data(left, right, metadata, 1)
        leftSideAffected = metadata[metadata["AFFECTED_SIDE"] == 0]
        rightSideAffected = metadata[metadata["AFFECTED_SIDE"] == 1]

        for component in affected.keys():
            assert affected[component][affected[component]["SESSION_ID"].isin(
                leftSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                    left[component][left[component]["SESSION_ID"].isin(
                        leftSideAffected["SESSION_ID"])].iloc[:, :100]
                ), "Did not assign affected leg correctly (left)."
            assert non_affected[component][
                non_affected[component]["SESSION_ID"].isin(
                    rightSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                        left[component][left[component]["SESSION_ID"].isin(
                            rightSideAffected["SESSION_ID"])].iloc[:, :100]
                    ), "Did not assign unaffected leg correctly (left)."
            assert not (
                affected[component].iloc[:, :100]
                == non_affected[component].iloc[:, :100]).all(axis=1).any(
                ), "Affected and unaffected side have the same data."
            assert affected[component][affected[component]["SESSION_ID"].isin(
                rightSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                    right[component][right[component]["SESSION_ID"].isin(
                        rightSideAffected["SESSION_ID"])].iloc[:, :100]
                ), "Did not assign affected leg correctly (right)."
            assert non_affected[component][
                non_affected[component]["SESSION_ID"].isin(
                    leftSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                        right[component][right[component]["SESSION_ID"].isin(
                            leftSideAffected["SESSION_ID"])].iloc[:, :100]
                    ), "Did not assign unaffected leg correctly (right)."

        data = fetcher._DataFetcher__split_and_format(affected, non_affected)
        component = list(affected.keys())[0]
        assert np.equal(
            data["label"], affected[component]["CLASS_LABEL"].map({
                "HC": 0,
                "H": 1,
                "K": 2,
                "A": 3,
                "C": 4
            }).values).all(), "Class labels do not match."
        assert data["affected"].shape == (
            affected[component].shape[0], 101,
            5), "Output shape incorrect (affected)."
        assert data["non_affected"].shape == (
            affected[component].shape[0], 101,
            5), "Output shape incorrect (non_affected)."

        comp_list = list(affected.keys())
        for i in range(5):
            assert np.equal(data["affected"][:, :, i],
                            np.asarray(affected[comp_list[i]].iloc[:, 3:104]),
                            dtype=np.float32).all(
                            ), "Data for {} component does not match".format(
                                comp_list[i])
Example #30
""" train the model or load existing model """
train = True
model_name = "0902:code_emb_ours_" + FLAGS.dataset
model_path = "SavedModel/" + model_name + ".ckpt"
saved_files_dir = "SavedModel"
""" Set random seed """
random_seed = 123
np.random.seed(random_seed)
tf.set_random_seed(random_seed)
seed(random_seed)
""" Create data fetcher """

if use_csm:
    assert (FLAGS.dataset == CSM_FLAGS.csm_dataset)
#data_fetcher = DataFetcher(dataset = FLAGS.dataset, exact_ged = True)
data_fetcher = DataFetcher(dataset=FLAGS.dataset, exact_ged=True)
# wrap the data fetcher with tensorflow.dataset.prefetch to accelerate training
dataset = tf.data.Dataset.from_generator(
    data_fetcher.get_train_data,
    (tf.int64, tf.float32, tf.int64, tf.int64, tf.float32, tf.int64, tf.int32,
     tf.float32, tf.float32),
    (
        tf.TensorShape([None, 2]),  #feature, sparse index
        tf.TensorShape([None]),  # feature sparse value
        tf.TensorShape([2]),  # feature shape
        tf.TensorShape([None, 2]),  # laplacian sparse index
        tf.TensorShape([None]),  #laplacian sparse value
        tf.TensorShape([2]),  # laplacian sparse shape
        tf.TensorShape([(1 + FLAGS.k) * FLAGS.batchsize]),  #shape
        #tf.TensorShape([None]), # shape
        tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),  #label