def test_init(self):
    fetcher = DataFetcher(filepath)
    fetcher = DataFetcher(filepath + "/")
    with self.assertRaises(IOError):
        fetcher = DataFetcher(filepath + "/test")
        fetcher = DataFetcher("/test")
        fetcher = DataFetcher("")
def __init__(self, studyID, use_ssh=True):
    # create a datafetcher instance to fetch the data from the database
    self.data_fetcher = DataFetcher(use_ssh)
    self.n_bins = 10
    self.queryStudyID = studyID
def main(Selected_Stock, Trading_Day):
    fetcher = DataFetcher()
    fetch_result = fetcher.getHistoricalData(Selected_Stock)
    if fetch_result == -1:
        raise Exception("NO INTERNET CONNECTIVITY OR INVALID STOCK SYMBOL")
    dir_name = os.path.dirname(os.path.abspath(__file__))
    CSVFile = os.path.join(dir_name, "Dataset", Selected_Stock + ".csv")
    ohclv_data = list(getData(CSVFile))
    # current_data = regression(ohclv_data)
    # ohclv_data.append(current_data)
    ohclv_data = np.array(ohclv_data)
    X, y = prepareData(ohclv_data, Trading_Day)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    model = RandomForestClassifier(n_estimators=35, criterion="gini")
    model.fit(Xtrain, ytrain)
    y_pred = model.predict(Xtest)
    output = model.predict(X[-1].reshape(1, -1))
    Eval = Evaluator(Xtest, ytest, y_pred, model)
    accuracy, recall, precision, specificity = Eval.getPerformanceMetrics()
    print accuracy
    print recall
    print precision
    print specificity
    raw_input("Press enter to generate OOB vs Number of estimators graph:")
    Eval.oob_vs_n_trees(100, Xtrain, ytrain)
def rule_based(self, issues):
    """
    This method applies rule_based algorithms to predict labels
    Args:
        issues(list): a list of issue numbers
    Return:
        rule_based_predictions(list of lists): labels which satisfy rules
    """
    DF = DataFetcher()
    df_test = DF.fetch_issues(issues)
    rule_based_predictions = []
    for i in range(len(issues)):
        # extract every issue's title
        row = df_test.loc[i, 'title']
        # apply rule-based algorithms
        single_issue_predictions = []
        if "feature request" in row.lower():
            single_issue_predictions.append("Feature")
        if "c++" in row.lower():
            single_issue_predictions.append("C++")
        tokens = self.tokenize(row)
        for k, v in self.keywords.items():
            for keyword in v:
                if keyword in tokens:
                    single_issue_predictions.append(k)
        rule_based_predictions.append(single_issue_predictions)
    return rule_based_predictions
def main(Selected_Stock, Trading_Day):
    fetcher = DataFetcher()
    # fetch_result = fetcher.getHistoricalData(Selected_Stock)
    dir_name = os.path.dirname(os.path.abspath(__file__))
    CSVFile = os.path.join(dir_name, Selected_Stock + ".csv")
    if os.path.isfile(CSVFile):
        last_modified_date = datetime.fromtimestamp(os.path.getmtime(CSVFile))
    current_data = fetcher.getCurrentData(Selected_Stock, "ohclv")
    ohclv_data = list(getData(CSVFile))
    ohclv_data.append(current_data)
    ohclv_data = np.array(ohclv_data)
    X, y = prepareData(ohclv_data, Trading_Day)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    model = RandomForestClassifier(n_estimators=30, criterion="gini")
    model.fit(Xtrain, ytrain)
    y_pred = model.predict(Xtest)
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred)
    recall = recall_score(ytest, y_pred)
    specificity = specificity_score(ytest, y_pred)
    output = model.predict(X[-1].reshape(1, -1))
    return accuracy, output, current_data, precision, recall, specificity
def main():
    fetcher = DataFetcher("cookies.txt")
    data = fetcher.getItems(
        "https://www.couchsurfing.com/members/hosts?utf8=%E2%9C%93&search_query=Curitiba%2C+Brazil&latitude=-25.4244287&longitude=-49.2653819&country=Brazil&region=south-america&date_modal_dismissed=true&arrival_date=&departure_date=&num_guests=1&has_references=1&can_host%5Baccepting_guests%5D=1&last_login=Anytime&join_date=Anytime&gender=All&min_age=&max_age=&languages_spoken=&interests=&smoking=No+Preference&radius=10&keyword=&host_sort=Best+Match&button=&perPage=100",
        "h3",
        className="-name")
    usuarios = [
        Host(u.a.string, u.a['href'][len("/users/"):], u.a['href'])
        for u in data
    ]
    arquivo = open("usuarios.csv", "w")
    arquivo_usuarios = csv.DictWriter(arquivo,
                                      fieldnames=["nome", "id", "endereco"],
                                      lineterminator='\n')
    arquivo_usuarios.writeheader()
    for user in usuarios:
        arquivo_usuarios.writerow({
            'nome': user.nome,
            'id': user.id,
            'endereco': user.endereco.strip()
        })
    arquivo.close()
def init_tweetsprocessor():
    global tweetsProcessor
    dataFetcher = DataFetcher()
    # tweets_dict = dataFetcher.run_all()
    dataFetcher.parse_data()
    tweets_dict = dataFetcher.get_dict()
    tweetsProcessor = TweetsProcessor(tweets_dict)
    tweetsProcessor.prepare()
def train(self):
    """
    This method is to train and save models. It has 5 steps:
    1. Fetch issues
    2. Clean data
    3. Word embedding
    4. Train models
    5. Save models
    """
    logging.info("Start training issues of general labels")
    # Step1: Fetch issues with general labels
    logging.info("Fetching Data..")
    DF = DataFetcher()
    filename = DF.data2json('all', self.labels, False)
    # Step2: Clean data
    logging.info("Cleaning Data..")
    SP = SentenceParser()
    SP.read_file(filename, 'json')
    SP.clean_body('body', True, True)
    SP.merge_column(['title', 'title', 'title', 'body'], 'train')
    text = SP.process_text('train', True, False, True)
    df = SP.data
    # Step3: Word Embedding
    logging.info("Word Embedding..")
    # tv = TfidfVectorizer(min_df=0.00009, ngram_range=(1, 3), max_features=10000)
    tv = self.tv
    X = tv.fit_transform(text).toarray()
    # Labels
    labels = SP.data['labels']
    le = LabelEncoder()
    Y = le.fit_transform(labels)
    # Step4: Train Classifier
    # SVC, kernel = 'rbf'
    logging.info("Training Data..")
    # clf = SVC(gamma=0.5, C=100, probability=True)
    clf = self.clf
    clf.fit(X, Y)
    # Step5: save models
    logging.info("Saving Models..")
    with open(os.path.join(self.tmp_dir.name, 'Vectorizer.p'), 'wb') as tv_file:
        pickle.dump(tv, tv_file)
    with open(os.path.join(self.tmp_dir.name, 'Classifier.p'), 'wb') as clf_file:
        pickle.dump(clf, clf_file)
    with open(os.path.join(self.tmp_dir.name, 'Labels.p'), 'wb') as labels_file:
        pickle.dump(labels, labels_file)
    logging.info("Completed!")
    return self.tmp_dir
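# Illustrative sketch (not from the original project): train() above pickles a
# vectorizer, a classifier, and the label series into a temporary directory.
# Loading those artifacts back for inference could look like this; `model_dir`
# and `issue_text` are hypothetical placeholders, and the file names simply
# mirror the ones written above.
import os
import pickle

model_dir = "/tmp/labelbot_models"  # assumption: where the tmp_dir contents were copied

with open(os.path.join(model_dir, 'Vectorizer.p'), 'rb') as tv_file:
    tv = pickle.load(tv_file)
with open(os.path.join(model_dir, 'Classifier.p'), 'rb') as clf_file:
    clf = pickle.load(clf_file)

# Vectorize new issue text and get class probabilities, mirroring the
# transform/predict_proba flow used by ml_predict further down.
issue_text = ["feature request: add sparse support to the c++ backend"]
probs = clf.predict_proba(tv.transform(issue_text).toarray())
print(probs)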
def test_concat(self):
    fetcher = DataFetcher(filepath)
    metadata = fetcher._DataFetcher__fetch_metadata()
    metadata = metadata.sample(n=5)
    left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)

    # test concat for averaged data
    left_avg = _average_trials(left)
    left_concat = fetcher._DataFetcher__concat(left_avg)
    order = fetcher.get_comp_order()
    assert len(left_concat.keys()) == 1, "Too many keys in dictionary after concatenation."
    assert "concat" in left_concat.keys(), "'concat' is no key in the dictionary."
    assert left_concat["concat"].iloc[:, :103].equals(
        left_avg[order[0]]
    ), "Component {} does not match in concatenated dict.".format(order[0])
    nextIndex = 103
    lastColumn = left_avg[order[0]].iloc[:, -1]
    for component in order[1:]:
        carry = lastColumn - left_avg[component].iloc[:, 2]
        assert left_concat["concat"].iloc[:, nextIndex:(nextIndex + 101)].equals(
            left_avg[component].iloc[:, 2:].add(carry, axis="index")
        ), "Component {} does not match in concatenated dict.".format(component)
        nextIndex += 101
        lastColumn = left_avg[component].iloc[:, -1].add(carry, axis="index")

    # test for non-averaged data
    right_concat = fetcher._DataFetcher__concat(right)
    order = fetcher.get_comp_order()
    assert len(right_concat.keys()) == 1, "Too many keys in dictionary after concatenation."
    assert "concat" in right_concat.keys(), "'concat' is no key in the dictionary."
    assert right_concat["concat"].iloc[:, :104].equals(
        right[order[0]]
    ), "Component {} does not match in concatenated dict.".format(order[0])
    nextIndex = 104
    lastColumn = right[order[0]].iloc[:, -1]
    for component in order[1:]:
        carry = lastColumn - right[component].iloc[:, 3]
        assert right_concat["concat"].iloc[:, nextIndex:(nextIndex + 101)].equals(
            right[component].iloc[:, 3:].add(carry, axis="index")
        ), "Component {} does not match in concatenated dict.".format(component)
        nextIndex += 101
        lastColumn = right[component].iloc[:, -1].add(carry, axis="index")
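# Illustrative sketch: the tests above and below reach DataFetcher's private
# helpers through Python name mangling (e.g. fetcher._DataFetcher__fetch_metadata()).
# The demo class here is hypothetical and only shows how the mangled name is formed.
class _ManglingDemo:
    def __fetch(self):                 # mangled to _ManglingDemo__fetch
        return "private result"

demo = _ManglingDemo()
print(demo._ManglingDemo__fetch())     # prints "private result"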
def fetch_content(self, id, json_string):
    """
    fetch contents from given json string
    store them in the index map
    """
    json_object = json.loads(json_string)
    url = json_object['url']
    html = json_object['content']
    df = DF(html)
    words = df.get_words()
    biwords = df.get_biwords()
    triwords = df.get_triwords()
    positions = df.get_position()
    checksum = df.get_checksum()
    # === check duplicate ===
    self.check_duplicate(id, checksum)
    # =======================
    self.map_doc_id[id] = url
    for word, count in words.items():
        posting = Posting(id, word, count, positions[word])
        self.map[word].append(posting)
    for biword, count in biwords.items():
        biword_posting = Posting(id, biword, count, 0)
        self.biword_map[biword].append(biword_posting)
    for triword, count in triwords.items():
        triword_posting = Posting(id, triword, count, 0)
        self.triword_map[triword].append(triword_posting)
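# Illustrative sketch of the inverted-index pattern used by fetch_content above:
# per-term posting lists kept in a defaultdict. The PostingSketch fields are
# assumptions inferred from the constructor calls above, not the project's class.
from collections import defaultdict
from dataclasses import dataclass, field
from typing import List

@dataclass
class PostingSketch:
    doc_id: int
    term: str
    count: int
    positions: List[int] = field(default_factory=list)

index = defaultdict(list)
for term, count in {"data": 2, "fetcher": 1}.items():
    index[term].append(PostingSketch(doc_id=1, term=term, count=count))

print(index["data"])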
def ml_predict(self, issues, threshold=0.3):
    """
    This method applies machine learning algorithms to predict labels
    Args:
        issues(list): a list of issue numbers
        threshold(float): threshold of probability
    Return:
        ml_predictions(list of lists): predictions
    """
    # step1: fetch data
    DF = DataFetcher()
    df_test = DF.fetch_issues(issues)
    # step2: data cleaning
    SP = SentenceParser()
    SP.data = df_test
    SP.clean_body('body', True, True)
    SP.merge_column(['title', 'title', 'title', 'body'], 'train')
    test_text = SP.process_text('train', True, False, True)
    # step3: word embedding
    test_data_tfidf = self.tv.transform(test_text).toarray()
    le = LabelEncoder()
    le.fit_transform(self.labels)
    # step4: classification
    probs = self.clf.predict_proba(test_data_tfidf)
    # pick the top 2 predictions which exceed the threshold
    best_n = np.argsort(probs, axis=1)[:, -2:]
    ml_predictions = []
    for i in range(len(best_n)):
        # INFO:Predictor:issue:11919,Performance:0.47353076240017744,Question:0.2440056213336274
        logging.info("issue:{}, {}:{}, {}:{}".format(
            str(issues[i]),
            str(le.classes_[best_n[i][-1]]), str(probs[i][best_n[i][-1]]),
            str(le.classes_[best_n[i][-2]]), str(probs[i][best_n[i][-2]])))
        single_issue_predictions = [
            le.classes_[best_n[i][j]] for j in range(-1, -3, -1)
            if probs[i][best_n[i][j]] > threshold
        ]
        ml_predictions.append(single_issue_predictions)
    return ml_predictions
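# Illustrative sketch: combining rule-based and ML label predictions per issue.
# The stub below stands in for an object exposing the two methods shown above;
# only the method names and the merge idea come from the snippets, the rest is
# made up for the example.
class _PredictorStub:
    def rule_based(self, issues):
        return [["Feature"], []]

    def ml_predict(self, issues, threshold=0.3):
        return [["Performance"], ["Question"]]

predictor = _PredictorStub()
issues = [11925, 11924]
rule_preds = predictor.rule_based(issues)
ml_preds = predictor.ml_predict(issues, threshold=0.3)

# Union the two label lists per issue, preserving order and dropping duplicates.
merged = [list(dict.fromkeys(r + m)) for r, m in zip(rule_preds, ml_preds)]
print(merged)   # [['Feature', 'Performance'], ['Question']]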
def getDataset():
    """
    This function takes care of obtaining the datasets:
    * First it reads the arguments, checks that they are valid and formats them.
    * It reports that the data is being downloaded.
    * It builds a dictionary of datasets with all the datasets obtained.
    :return:
    """
    args = ArgsToDict()
    linkCommand = getArg(args, LinkCommand)
    sensorCommand = getArg(args, SensorCommand, [0])
    dayCommand = getArg(args, DayCommand)
    toDayCommand = getArg(args, ToDayCommand)
    sensorText = ", ".join(str(x) for x in sensorCommand)
    if (toDayCommand != DefaultValue):
        dayText = "from day " + dayCommand.strftime("%d/%m/%y") + \
            " to " + toDayCommand.strftime("%d/%m/%y")
    else:
        dayText = "of day " + dayCommand.strftime("%d/%m/%y")
    print "Downloading dataset from {0} of sensors: {1} {2} ...".format(
        linkCommand if linkCommand != DefaultValue else
        ("https://ocwitic.epsem.upc.edu/assignatures/tecpro/laboratori-material/dadespractica6/"),
        sensorText, dayText)
    if (linkCommand == DefaultValue):
        df = DataFetcher()
    else:
        df = DataFetcher(linkCommand)
    datasetDict = dict()
    if (toDayCommand == DefaultValue):
        for sensor in sensorCommand:
            datasetDict[sensor] = df.fetch(dayCommand, sensor)
    else:
        for sensor in sensorCommand:
            datasetDict[sensor] = df.fetch_interval(dayCommand, toDayCommand, sensor)
    return datasetDict
import os

import numpy as np
import tensorflow as tf

from config import FLAGS
from DataFetcher import DataFetcher
import pickle

os.environ['CUDA_VISIBLE_DEVICES'] = '2'

test_top_k = True
test_range_query = True

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset)
dataset = tf.data.Dataset.from_generator(
    data_fetcher.get_train_data,
    (tf.int64, tf.float32, tf.int64,
     tf.int64, tf.float32, tf.int64,
     tf.int32, tf.float32, tf.float32),
    (tf.TensorShape([None, 2]),
     tf.TensorShape([None]),
     tf.TensorShape([2]),
     tf.TensorShape([None, 2]),
     tf.TensorShape([None]),
     tf.TensorShape([2]),
     tf.TensorShape([(1 + FLAGS.k) * FLAGS.batchsize]),
     tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),
     tf.TensorShape([FLAGS.batchsize, FLAGS.k]),
     ))
dataset = dataset.prefetch(buffer_size=1)
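# Illustrative sketch continuing the fragment above (assumes TF 1.x and the
# `dataset` just built): the generator-backed, prefetched dataset is typically
# consumed through a one-shot iterator inside a session. Variable names below
# are placeholders, not from the original script.
iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    # Each run pulls one prefetched training batch from data_fetcher.get_train_data.
    (feat_idx, feat_val, feat_shape,
     lap_idx, lap_val, lap_shape,
     graph_sizes, labels, generated_labels) = sess.run(next_batch)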
import os
import random
from utils import *
from graphHashFunctions import GraphHash_Emb_Code
from config import FLAGS
from DataFetcher import DataFetcher

os.environ['CUDA_VISIBLE_DEVICES'] = '2'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

""" Load datafetcher only to get node_feature_dim,
    probably should use more efficient way to do that in future """
data_fetcher = DataFetcher(dataset=FLAGS.dataset,
                           exact_ged=True,
                           wrp_train_graph=False)
node_feature_dim = data_fetcher.get_node_feature_dim()

# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32,
                                      shape=(None, node_feature_dim)),
    # 'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'graph_sizes': tf.placeholder(tf.int32, shape=(1)),
    # 'graph_sizes': tf.placeholder(tf.int32, shape=(FLAGS.batchsize*(1+FLAGS.k))),
    # 'generated_labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.k)),
    'thres': tf.placeholder(tf.float32, shape=(FLAGS.hash_code_len))
}
thres = np.zeros(FLAGS.hash_code_len)
        # cosine similarity + 5 / (longitude-based distance between two locations)
        if np.linalg.norm(t_a_p) == 0. or np.linalg.norm(t_b_p) == 0.:
            return 0.
        else:
            return np.dot(t_a_p, t_b_p) / (np.linalg.norm(t_a_p) * np.linalg.norm(t_b_p)) \
                + 5. / (sqrt((lat_a - lat_b) ** 2 + (lng_a - lng_b) ** 2) + 5.)

    def get_all_feature_vectors(self):
        return self.id_to_vec_dict

    def word_list_from_tweet(self, tweet):
        return tweet.lower().strip().split()


# for testing
if __name__ == '__main__':
    data_fetcher = DataFetcher()
    tweets = data_fetcher.run_all()
    print len(tweets)
    tp = TweetsProcessor(tweets)
    tp.prepare()
    print 'Start testing mode, input a fake tweet, for example:'
    print 'hello what is your name I am fine thank you USA america I am a happy girl, will return'
    similar_tweets = tp.get_similar_tweets('hello what is your name I am fine thank you USA america I am a happy girl')
    print '============================'
    print 'Similar Ones:'
    for tweet in similar_tweets:
        print tweet
        print '----------------------------'
    print 'Try it yourself!'
    print ''
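# Illustrative, self-contained version of the scoring rule in the comment above:
# cosine similarity of the two tweet vectors plus 5 / (lat/lng distance + 5).
# Function and argument names are local to this example, not the project's API.
import numpy as np
from math import sqrt

def tweet_similarity(vec_a, vec_b, lat_a, lng_a, lat_b, lng_b):
    norm_a, norm_b = np.linalg.norm(vec_a), np.linalg.norm(vec_b)
    if norm_a == 0. or norm_b == 0.:
        return 0.
    cosine = np.dot(vec_a, vec_b) / (norm_a * norm_b)
    return cosine + 5. / (sqrt((lat_a - lat_b) ** 2 + (lng_a - lng_b) ** 2) + 5.)

print(tweet_similarity(np.array([1., 0.]), np.array([1., 1.]), 40.7, -74.0, 40.8, -73.9))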
#!/usr/bin/python3
from Environment import Environment
from DataFetcher import DataFetcher
from requests import post
from typing import List, Tuple, Dict, Any

if __name__ == '__main__':
    DataFetcher.write_csv_data_to_file_system()
    environment: Environment = Environment('Data', 'JSON', 'Plots', 'Logs')
    environment.begin_log()
    linear_regression_training_results: List[Tuple[Any]] = \
        environment.linear_regression(5000, 0.1)
    polynomial_regression_training_results: List[Tuple[Any]] = \
        environment.polynomial_regression(5000, 0.1, 10, 10)
    logistic_polynomial_regression_training_results: List[Tuple[Any]] = \
        environment.logistic_polynomial_regression(5000, 0.1)
    environment.end_log()
    training_results_json_object: Dict[str, Dict[str, Dict[str, Dict[str, Dict[str, Any]]]]] = \
        environment.create_training_results_json([
            (linear_regression_training_results, 'Linear'),
            (polynomial_regression_training_results, 'Polynomial'),
            (logistic_polynomial_regression_training_results, 'Logistic Polynomial')
        ])
#!/usr/bin/python -u
import sys
import requests
from DataFetcher import DataFetcher
from dotenv import load_dotenv

load_dotenv()
dataFet = DataFetcher()
dataFet.init()
dataFet.fetch()
def test_data_fetch_and_sample(self):
    fetcher = DataFetcher(filepath)
    metadata = fetcher._DataFetcher__fetch_metadata()
    metadata = metadata.sample(n=5)

    # test data fetch & selection for processed data
    left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)
    for component in fetcher.get_comp_order():
        assert component in left.keys() and component in right.keys(), \
            "Error: {} not available in the dictionary.".format(component)
        assert left[component]["SESSION_ID"].isin(metadata["SESSION_ID"]).all() == True, \
            "Fetched data that was not requested for component {}.".format(component)
        assert right[component]["SESSION_ID"].isin(metadata["SESSION_ID"]).all() == True, \
            "Fetched data that was not requested for component {}.".format(component)

    filelist = ["GRF_COP_AP_", "GRF_COP_ML_", "GRF_F_AP_", "GRF_F_ML_", "GRF_F_V_"]
    for element in filelist:
        component_name = element[element.index("_") + 1:-1].lower()
        test_data = pd.read_csv(filepath + "/" + element + "PRO_right.csv")
        test_data = test_data.astype({test_data.columns[3]: 'float64'})
        test_data = test_data[test_data["SESSION_ID"].isin(metadata["SESSION_ID"])].reset_index(drop=True)
        assert test_data.equals(right[component_name]), \
            "Data for {} component differs from original one.".format(component_name)

    # test sampling for processed data
    left_sampled = _sample(left, stepsize=1, raw=False)
    assert _DFdicts_equal(left_sampled, left), "There should be no sampling if stepsize = 1."
    left_sampled1 = _sample(left, stepsize=2, raw=False)
    left_sampled2 = _sample(left, stepsize=3, raw=False)
    left_sampled3 = _sample(left, stepsize=10, raw=False)
    for component in left_sampled.keys():
        assert left_sampled1[component].shape[1] == (int)(np.ceil((left[component].shape[1] - 3) / 2)) + 3, \
            "Size after sampling not appropriate."
        assert left_sampled2[component].shape[1] == (int)(np.ceil((left[component].shape[1] - 3) / 3)) + 3, \
            "Size after sampling not appropriate."
        assert left_sampled3[component].shape[1] == (int)(np.ceil((left[component].shape[1] - 3) / 10)) + 3, \
            "Size after sampling not appropriate."
        time_steps = left_sampled2[component].shape[1]
        for i in range(3, time_steps):
            assert left_sampled2[component].iloc[:, i].equals(left[component].iloc[:, i + 2 * (i - 3)]), \
                "Sampled data does not match original data."

    # test data fetch & selection for raw data
    left, right = fetcher._DataFetcher__fetch_data(metadata, raw=True)
    for component in fetcher.get_comp_order():
        assert component in left.keys() and component in right.keys(), \
            "Error: {} not available in the dictionary.".format(component)
        assert left[component]["SESSION_ID"].isin(metadata["SESSION_ID"]).all() == True, \
            "Fetched data that was not requested for component {}.".format(component)
        assert right[component]["SESSION_ID"].isin(metadata["SESSION_ID"]).all() == True, \
            "Fetched data that was not requested for component {}.".format(component)

    filelist = ["GRF_COP_AP_", "GRF_COP_ML_", "GRF_F_AP_", "GRF_F_ML_", "GRF_F_V_"]
    for element in filelist:
        component_name = element[element.index("_") + 1:-1].lower()
        test_data = pd.read_csv(filepath + "/" + element + "RAW_right.csv")
        test_data = test_data[test_data["SESSION_ID"].isin(metadata["SESSION_ID"])].reset_index(drop=True)
        assert test_data.equals(right[component_name]), \
            "Data for {} component differs from original one.".format(component_name)

    # test sampling for raw data
    left_sampled1 = _sample(left, stepsize=2, raw=True)
    left_sampled2 = _sample(left, stepsize=3, raw=True)
    left_sampled3 = _sample(left, stepsize=10, raw=True)
    for component in left_sampled1.keys():
        assert left_sampled1[component].shape[1] == (int)(100 / 2 + 3), "Size after sampling not appropriate."
        assert left_sampled2[component].shape[1] == (int)(100 / 3 + 3), "Size after sampling not appropriate."
        assert left_sampled3[component].shape[1] == (int)(100 / 10 + 3), "Size after sampling not appropriate."
from DataFetcher import DataFetcher

D = DataFetcher('PTC')
a = D.get_data_train_in_clique(1)
a = D.get_data_train_in_clique(1)
a = D.get_data_train_in_clique(1)
import os

import numpy as np
import tensorflow as tf

from graphHashFunctions import GraphHash_Naive
from config import FLAGS
from DataFetcher import DataFetcher
import pickle

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset)
# Some preprocessing

# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32,
                                      shape=(None, data_fetcher.get_node_feature_dim())),
    'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32),  # helper variable for sparse dropout
    'graph_sizes': tf.placeholder(tf.int32, shape=((1 + FLAGS.k) * FLAGS.batchsize)),
    'learning_rate': tf.placeholder(tf.float32, shape=())
}

# Create model
config.gpu_options.allow_growth = True

MinGED = 0
MaxGED = 11
MaxGraphNum = 4999900

if len(sys.argv) != 3:
    print('parameters are model_path, output_name')
    os._exit(0)
model_path = str(sys.argv[1])
output_fname = str(sys.argv[2])

""" Load datafetcher and model """
print('restoring model...')
data_fetcher = DataFetcher(dataset=FLAGS.dataset,
                           exact_ged=True,
                           max_graph_num=MaxGraphNum)
node_feature_dim = data_fetcher.get_node_feature_dim()

# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32, shape=(None, node_feature_dim)),
    # 'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'graph_sizes': tf.placeholder(tf.int32, shape=(FLAGS.ecd_batchsize)),
    # 'graph_sizes': tf.placeholder(tf.int32, shape=(FLAGS.batchsize*(1+FLAGS.k))),
    # 'generated_labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.k)),
    'thres': tf.placeholder(tf.float32, shape=(FLAGS.hash_code_len))
}
thres = np.zeros(FLAGS.hash_code_len)
def test_average_trials_and_scale(self):
    fetcher = DataFetcher(filepath)
    metadata = fetcher._DataFetcher__fetch_metadata()
    metadata = metadata.sample(n=5)

    # test averaging
    left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)
    left_avg = _average_trials(left)
    for component in left.keys():
        assert left_avg[component].shape[0] == 5, "Averaging did not reduce data to unique SESSION_IDs."
        id = left_avg[component].iloc[0, :]["SESSION_ID"]
        left[component] = left[component][left[component]["SESSION_ID"] == id]
        left_avg[component] = left_avg[component][left_avg[component]["SESSION_ID"] == id]
        assert left_avg[component].iloc[:, 2:].equals(
            left[component].iloc[:, 3:].mean(axis=0).to_frame().transpose()
        ), "Averaging did not produce the expected values."

    # test scaling for averaged data
    right_avg = _average_trials(_sample(right, stepsize=10, raw=False))
    left_avg = _average_trials(_sample(left, stepsize=10, raw=False))
    scaler = GRFScaler(featureRange=(0, 1))
    _fit_scaler(scaler, (right_avg, left_avg))
    right_scaled = _scale(scaler, right_avg)
    for component in right_scaled.keys():
        assert right_scaled[component].shape == right_avg[component].shape, \
            "Shape does not match after scaling."
        data1 = right_avg[component].iloc[:, 2:]
        data2 = left_avg[component].iloc[:, 2:]
        dmin = min([data1.values.min(), data2.values.min()])
        dmax = max([data1.values.max(), data2.values.max()])
        data1 = data1.applymap(lambda x: (x - dmin) / (dmax - dmin))
        assert np.allclose(right_scaled[component].iloc[:, 2:].values, data1.values,
                           rtol=1e-4, atol=1e-8), "Scaling does not produce the expected result."
        assert right_scaled[component].iloc[:, :2].equals(right_avg[component].iloc[:, :2]), \
            "Scaling messes up the meta-information columns."

    # test scaling for non-averaged data
    scaler = GRFScaler(featureRange=(0, 1))
    _fit_scaler(scaler, (right, left))
    left_scaled = _scale(scaler, left)
    for component in left_scaled.keys():
        assert left_scaled[component].shape == left[component].shape, \
            "Shape does not match after scaling."
        data1 = right[component].iloc[:, 3:]
        data2 = left[component].iloc[:, 3:]
        dmin = min([data1.values.min(), data2.values.min()])
        dmax = max([data1.values.max(), data2.values.max()])
        data2 = data2.applymap(lambda x: (x - dmin) / (dmax - dmin))
        assert np.allclose(left_scaled[component].iloc[:, 3:].values, data2.values,
                           rtol=1e-4, atol=1e-8), "Scaling does not produce the expected result."
        assert left_scaled[component].iloc[:, :3].equals(left[component].iloc[:, :3]), \
            "Scaling messes up the meta-information columns."
class TestLabelBot(unittest.TestCase):

    def setUp(self):
        self.df = DataFetcher()
        self.df.repo = "apache/incubator-mxnet"
        self.df.github_user = "******"
        self.df.github_oauth_token = "123"

    def tearDown(self):
        pass

    def test_cleanstr(self):
        new_string = self.df.cleanstr("a_b", "")
        self.assertEqual(new_string, "ab")

    def test_count_pages(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [
                {
                    "body": "issue's body",
                    "created_at": "2018-07-28T18:27:17Z",
                    "comments": "0",
                    "number": 11925,
                    "labels": [{'name': 'Doc'}],
                    "state": "open",
                    "title": "issue's title",
                    "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
                },
                {
                    "body": "issue's body",
                    "created_at": "2018-07-28T18:27:17Z",
                    "comments": "0",
                    "number": 11924,
                    "labels": [],
                    "state": "closed",
                    "title": "issue's title",
                    "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
                }
            ]
            page = self.df.count_pages('all')
            self.assertEqual(page, 1)

    def test_fetch_issues(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Feature'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }
            data = self.df.fetch_issues([11925])
            expected_data = [{
                'id': "11925",
                'title': "issue's title",
                'body': "issue's body"
            }]
            assert_frame_equal(data, pd.DataFrame(expected_data))

    def test_data2json(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [
                {
                    "body": "issue's body",
                    "created_at": "2018-07-28T18:27:17Z",
                    "comments": "0",
                    "number": 11925,
                    "labels": [{'name': 'Feature'}],
                    "state": "open",
                    "title": "issue's title",
                    "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
                },
                {
                    "body": "issue's body",
                    "created_at": "2018-07-28T18:27:17Z",
                    "comments": "0",
                    "number": 11924,
                    "labels": [],
                    "state": "closed",
                    "title": "issue's title",
                    "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
                }
            ]
            self.df.data2json('all', labels=["Feature"], other_labels=False)
            expected_data = [{
                'id': 11925,
                'title': "issue's title",
                'body': "issue's body",
                'labels': 'Feature'
            }]
            self.assertEqual(expected_data, self.df.json_data)
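# Illustrative sketch of the mocking pattern used above: the tests stub out the
# GitHub API by patching requests.get where DataFetcher looks it up. The module
# and function below are made up for the example; only the patch/return_value
# mechanics mirror the real tests.
import unittest
from unittest.mock import patch

import requests

def fetch_status(url):
    # Stand-in for the kind of HTTP call DataFetcher makes internally.
    return requests.get(url).status_code

class TestFetchStatus(unittest.TestCase):
    def test_fetch_status(self):
        # Patching 'requests.get' replaces the attribute the function looks up
        # at call time, just as patch('DataFetcher.requests.get') does above.
        with patch('requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            self.assertEqual(fetch_status("https://api.github.com"), 200)

if __name__ == '__main__':
    unittest.main()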
model_name = "0211_All_" + FLAGS.dataset
model_path = "SavedModel/" + model_name + ".ckpt"
saved_files_dir = "SavedModel"

""" Set random seed """
random_seed = 123
np.random.seed(random_seed)
tf.set_random_seed(random_seed)
seed(random_seed)

""" Create data fetcher """
# data_fetcher = DataFetcher(dataset = FLAGS.dataset, exact_ged = True)
data_fetcher = DataFetcher(dataset=FLAGS.dataset, exact_ged=True)

# wrap the data fetcher with tensorflow.dataset.prefetch to accelerate training
dataset = tf.data.Dataset.from_generator(
    data_fetcher.get_train_data,
    (tf.int64, tf.float32, tf.int64,
     tf.int64, tf.float32, tf.int64,
     tf.int32, tf.float32, tf.float32),
    (tf.TensorShape([None, 2]),    # feature, sparse index
     tf.TensorShape([None]),       # feature sparse value
     tf.TensorShape([2]),          # feature shape
     tf.TensorShape([None, 2]),    # laplacian sparse index
     tf.TensorShape([None]),       # laplacian sparse value
     tf.TensorShape([2]),          # laplacian sparse shape
     tf.TensorShape([(1 + FLAGS.k) * FLAGS.batchsize]),  # shape
     # tf.TensorShape([None]),     # shape
     tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),  # label
     tf.TensorShape([FLAGS.batchsize, FLAGS.k]),          # gen_label
class AlgoManager():
    '''
    attribute
    self.StudyIDs
    '''

    def __init__(self, studyID, use_ssh=True):
        # create a datafetcher instance to fetch the data from the database
        self.data_fetcher = DataFetcher(use_ssh)
        self.n_bins = 10
        self.queryStudyID = studyID

    def get_contours_by_id(self, roi_index):
        return self.data_fetcher.get_contours_by_id(self.queryStudyID, roi_index)

    def feature_extraction(self):
        '''
        call ovh, sts and td to get the ovh sts and td features
        :param StudyID:
        :return ovh: a histogram of ovh feature
        :return sts: a histogram of sts feature
        :return td: target dose
        '''
        # Both PTV and OAR are dictionaries
        PTV, OAR = self.data_fetcher.get_contours(self.queryStudyID)
        # Check that PTV has been found
        assert len(PTV.keys()) > 0, "PTV NOT FOUND"
        row_spacing, column_spacing, slice_thickness = self.data_fetcher.get_spacing(self.queryStudyID)
        pixel_spacing = self.data_fetcher.get_pixel_spacing(self.queryStudyID)
        for ptv_name, ptv_tuple in PTV.items():
            for oar_name, oar_tuple in OAR.items():
                # in the tuple, the first one is contour block and the second one is roi block
                print("process the pair")
                oar_contour_block = oar_tuple[0]
                oar_roi_block = oar_tuple[1]
                ptv_contour_block = ptv_tuple[0]
                ptv_roi_block = ptv_tuple[1]
                bin_vals, bin_amts = getOVH(oar_roi_block, ptv_contour_block, ptv_roi_block,
                                            pixel_spacing, row_spacing, column_spacing,
                                            slice_thickness, self.n_bins)
                ovh_hist = (bin_vals, bin_amts)
                # print("Get ovh {}".format(ovh_hist))
                print("OVH Done")
                elevation_bins, distance_bins, azimuth_bins, amounts = getSTSHistogram(
                    ptv_roi_block, oar_roi_block, self.n_bins)
                sts_hist = (elevation_bins, distance_bins, azimuth_bins, amounts)
                print("STS Done")
                # print("Get Sts {}".format(sts_hist))
                self.data_fetcher.save_ovh(ptv_name, oar_name, ovh_hist, self.queryStudyID)
                self.data_fetcher.save_sts(ptv_name, oar_name, sts_hist, self.queryStudyID)
                print("Saved OVH and STS")
        pass

    def generate_pairs(self, queryStudy, dbStudy):
        '''
        match the queryStudy with dbStudy to generate pairs
        :param queryStudy: a dictionary, key is the name of OAR, the value is the histogram
        :param dbStudy: a dictionary, key is the name of OAR, the value is the histogram
        :return: { oar_id: (hist_query, hist_db) }
        '''
        queryKeys = set(queryStudy.keys())
        dbKeys = set(dbStudy.keys())
        mergedKeys = queryKeys.intersection(dbKeys)
        mergedDict = defaultdict()
        for key in mergedKeys:
            query_tuple = []
            for block in queryStudy[key]:
                # process amounts (2d array) separately
                if "]" in block:
                    query_values = block.replace("]", " ").replace("[", " ").replace(",", " ").split(" ")
                    query_array = np.zeros(shape=((self.n_bins ** 3 * 4)), dtype=np.float64)
                    count = 0
                    for i, val in enumerate(query_values):
                        if val:
                            query_array[count] = float(val.strip())
                            count += 1
                    query_array = query_array.reshape((self.n_bins ** 3, 4))
                    if count != self.n_bins ** 3 * 4:
                        import pdb; pdb.set_trace()
                    assert count == self.n_bins ** 3 * 4, "invalid parsed STS values"
                else:
                    query_values = block.split(",")
                    query_array = np.zeros(shape=(len(query_values)), dtype=np.float64)
                    for i, val in enumerate(query_values):
                        query_array[i] = float(val)
                query_tuple.append(query_array)
            historical_tuple = []
            for block in dbStudy[key]:
                # process amounts (2d array) separately
                if "]" in block:
                    query_values = block.replace("]", " ").replace("[", " ").replace(",", " ").split(" ")
                    query_array = np.zeros(shape=((self.n_bins ** 3 * 4)), dtype=np.float64)
                    count = 0
                    for i, val in enumerate(query_values):
                        if val:
                            query_array[count] = float(val.strip())
                            count += 1
                    query_array = query_array.reshape((self.n_bins ** 3, 4))
                    assert count == self.n_bins ** 3 * 4, "invalid parsed STS values"
                else:
                    query_values = block.split(",")
                    query_array = np.zeros(shape=(len(query_values)), dtype=np.float64)
                    for i, val in enumerate(query_values):
                        query_array[i] = float(val)
                historical_tuple.append(query_array)
            mergedDict[key] = (query_tuple, historical_tuple)
        return mergedDict

    def similarity_calculation(self):
        '''
        fetch ovh and STS features of other study
        calculate dissimilarity between features
        calculate similarity between study pair
        :return: dict with dissimilarity and similarity
        '''
        queryOVH = self.data_fetcher.get_ovh(self.queryStudyID)
        querySTS = self.data_fetcher.get_sts(self.queryStudyID)
        self.DBStudy_list = self.data_fetcher.get_dbstudy_list(self.queryStudyID)
        for studyID in self.DBStudy_list:
            historical_id = studyID["id"]
            dbOVH = self.data_fetcher.get_ovh(str(historical_id))
            ovh_pairs = self.generate_pairs(queryOVH, dbOVH)
            dbSTS = self.data_fetcher.get_sts(str(historical_id))
            sts_pairs = self.generate_pairs(querySTS, dbSTS)
            keys = ovh_pairs.keys()
            if len(keys) > 0:
                print("Processing similar pairs")
                for key in keys:
                    ovh_item = ovh_pairs[key]
                    ovh_dis = getOVHEmd(ovh_item[0][0], ovh_item[0][1], ovh_item[1][0], ovh_item[1][1])
                    sts_item = sts_pairs[key]
                    sts_dis = getSTSEmd(sts_item[0][3], sts_item[1][3])
                    # Get PTV target dose
                    query_target_dose = self.data_fetcher.get_target_dose(
                        self.queryStudyID, int(key.split(" ")[1]))
                    historical_target_dose = self.data_fetcher.get_target_dose(
                        historical_id, int(key.split(" ")[1]))
                    self.data_fetcher.save_similarity(str(historical_id),
                                                      query_target_dose - historical_target_dose,
                                                      str(ovh_dis), str(sts_dis),
                                                      key.split(" ")[0], key.split(" ")[-1],
                                                      str(historical_id), self.queryStudyID)

    # The entrance of the program
    def run(self):
        # extract OVH and STS for new case
        # store the OVH and STS
        # fetch OVH and STS of other cases
        # Do the similarity calculation
        # Save the result to database
        # Store the StudyID of all DB studies for future similarity calculation
        self.DBStudy_list = self.data_fetcher.get_dbstudy_list(self.queryStudyID)
        # calculate ovh, sts and save it to database
        self.feature_extraction()
        self.similarity_calculation()
config.gpu_options.allow_growth = True

test_top_k = True
test_range_query = True

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

train = True
# chkp.print_tensors_in_checkpoint_file("SavedModel/model_rank.ckpt", tensor_name='', all_tensors=True, all_tensor_names=True)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset, True)
dataset = tf.data.Dataset.from_generator(
    data_fetcher.get_train_data,
    (tf.int64, tf.float32, tf.int64,
     tf.int64, tf.float32, tf.int64,
     tf.int32, tf.float32, tf.float32),
    (
        tf.TensorShape([None, 2]),
        tf.TensorShape([None]),
        tf.TensorShape([2]),
        tf.TensorShape([None, 2]),
        tf.TensorShape([None]),
        tf.TensorShape([2]),
        tf.TensorShape([(1 + FLAGS.k) * FLAGS.batchsize]),
        tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),
        tf.TensorShape([FLAGS.batchsize, FLAGS.k]),
    ))
dataset = dataset.prefetch(buffer_size=1)
import os

import numpy as np
import tensorflow as tf

from utils import get_similar_graphs_gid, get_top_k_similar_graphs_gid
from graphHashFunctions import GraphHash_Rank_Reg
from config import FLAGS
from DataFetcher import DataFetcher
import pickle

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Load data
data_fetcher = DataFetcher(FLAGS.dataset)
# Some preprocessing

# Define placeholders
placeholders = {
    'support': tf.sparse_placeholder(tf.float32),
    'features': tf.sparse_placeholder(tf.float32,
                                      shape=(None, data_fetcher.get_node_feature_dim())),
    'labels': tf.placeholder(tf.float32, shape=(FLAGS.batchsize, FLAGS.batchsize)),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero':
def setUp(self):
    self.df = DataFetcher()
    self.df.repo = "apache/incubator-mxnet"
    self.df.github_user = "******"
    self.df.github_oauth_token = "123"
def test_arrange_and_format(self):
    fetcher = DataFetcher(filepath)
    metadata = fetcher._DataFetcher__fetch_metadata()
    metadata = metadata.sample(n=5)
    left, right = fetcher._DataFetcher__fetch_data(metadata, raw=False)
    affected, non_affected = _arrange_data(left, right, metadata, 1)
    leftSideAffected = metadata[metadata["AFFECTED_SIDE"] == 0]
    rightSideAffected = metadata[metadata["AFFECTED_SIDE"] == 1]
    for component in affected.keys():
        assert affected[component][affected[component]["SESSION_ID"].isin(
            leftSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                left[component][left[component]["SESSION_ID"].isin(
                    leftSideAffected["SESSION_ID"])].iloc[:, :100]
        ), "Did not assign affected leg correctly (left)."
        assert non_affected[component][non_affected[component]["SESSION_ID"].isin(
            rightSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                left[component][left[component]["SESSION_ID"].isin(
                    rightSideAffected["SESSION_ID"])].iloc[:, :100]
        ), "Did not assign unaffected leg correctly (left)."
        assert not (affected[component].iloc[:, :100] ==
                    non_affected[component].iloc[:, :100]).all(axis=1).any(), \
            "Affected and unaffected side have the same data."
        assert affected[component][affected[component]["SESSION_ID"].isin(
            rightSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                right[component][right[component]["SESSION_ID"].isin(
                    rightSideAffected["SESSION_ID"])].iloc[:, :100]
        ), "Did not assign affected leg correctly (right)."
        assert non_affected[component][non_affected[component]["SESSION_ID"].isin(
            leftSideAffected["SESSION_ID"])].iloc[:, :100].equals(
                right[component][right[component]["SESSION_ID"].isin(
                    leftSideAffected["SESSION_ID"])].iloc[:, :100]
        ), "Did not assign unaffected leg correctly (right)."

    data = fetcher._DataFetcher__split_and_format(affected, non_affected)
    component = list(affected.keys())[0]
    assert np.equal(
        data["label"],
        affected[component]["CLASS_LABEL"].map({
            "HC": 0,
            "H": 1,
            "K": 2,
            "A": 3,
            "C": 4
        }).values).all(), "Class labels do not match."
    assert data["affected"].shape == (affected[component].shape[0], 101, 5), \
        "Output shape incorrect (affected)."
    assert data["non_affected"].shape == (affected[component].shape[0], 101, 5), \
        "Output shape incorrect (non_affected)."
    comp_list = list(affected.keys())
    for i in range(5):
        assert np.equal(data["affected"][:, :, i],
                        np.asarray(affected[comp_list[i]].iloc[:, 3:104]),
                        dtype=np.float32).all(), \
            "Data for {} component does not match".format(comp_list[i])


""" train the model or load existing model """
train = True
model_name = "0902:code_emb_ours_" + FLAGS.dataset
model_path = "SavedModel/" + model_name + ".ckpt"
saved_files_dir = "SavedModel"

""" Set random seed """
random_seed = 123
np.random.seed(random_seed)
tf.set_random_seed(random_seed)
seed(random_seed)

""" Create data fetcher """
if use_csm:
    assert (FLAGS.dataset == CSM_FLAGS.csm_dataset)
# data_fetcher = DataFetcher(dataset = FLAGS.dataset, exact_ged = True)
data_fetcher = DataFetcher(dataset=FLAGS.dataset, exact_ged=True)

# wrap the data fetcher with tensorflow.dataset.prefetch to accelerate training
dataset = tf.data.Dataset.from_generator(
    data_fetcher.get_train_data,
    (tf.int64, tf.float32, tf.int64,
     tf.int64, tf.float32, tf.int64,
     tf.int32, tf.float32, tf.float32),
    (
        tf.TensorShape([None, 2]),    # feature, sparse index
        tf.TensorShape([None]),       # feature sparse value
        tf.TensorShape([2]),          # feature shape
        tf.TensorShape([None, 2]),    # laplacian sparse index
        tf.TensorShape([None]),       # laplacian sparse value
        tf.TensorShape([2]),          # laplacian sparse shape
        tf.TensorShape([(1 + FLAGS.k) * FLAGS.batchsize]),  # shape
        # tf.TensorShape([None]),     # shape
        tf.TensorShape([FLAGS.batchsize, FLAGS.batchsize]),  # label