def main(): """ creates and loads table1, 2, 3 """ session, cluster = set_keyspace() extract_data() df = pd.read_csv('processed_data.csv') # create tables for query in create_table_queries: create_table(session, query) # loading data to the tables for row in df.values: table1_data = (row[i] for i in [0, 9, 5, 8, 3]) table2_data = (row[0], row[9], row[8], row[3], row[1]+' '+row[4], row[10]) table3_data = (row[9], row[1]+' '+row[4], row[10]) # load all of them load_table(session, table1_insert_query, table1_data) load_table(session, table2_insert_query, table2_data) load_table(session, table3_insert_query, table3_data) test(session) # drop_tables(session) session.shutdown() cluster.shutdown()
def test_extract_data(self):
    root = join('testdata', 'test_extract_data')
    for fname in os.listdir(root):
        if splitext(fname)[1] != '.ess':
            continue
        # each .ess save file has a .json fixture holding the expected values
        with open(join(root, fname) + '.json') as f:
            cdata = json.loads(f.read())
        plugininfo, fidarray, playerdata, playerflags, required_plugins = \
            extract.extract_data(join(root, fname))
        self.assertEqual(cdata['plugininfo'], plugininfo)
        self.assertEqual(cdata['fidarray'], fidarray)
        self.assertEqual(sorted(cdata['required plugins']),
                         sorted(required_plugins))
        self.assertEqual(cdata['flags'], playerflags)
        for key in ('hair color', 'skin color', 'head texture', 'headparts',
                    'unknown1', 'face morph values', 'faceparts'):
            self.assertEqual(cdata[key], playerdata[key])
        # optional fields: only checked when present in the fixture
        for key in ('female', 'race'):
            if key in cdata:
                self.assertIn(key, playerdata)
                self.assertEqual(cdata[key], playerdata[key])
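# A failure in the loop above aborts the whole test at the first bad save
# file. A sketch of the same pattern using the standard library's subTest,
# so every failing fixture is reported in one run (stand-in file list and
# assertion only):
import unittest

class ExtractDataSubTestExample(unittest.TestCase):
    def test_extract_data(self):
        for fname in ['save1.ess', 'save2.ess']:  # stand-in file list
            with self.subTest(fname=fname):
                # per-fixture assertions go here; one failing subTest no
                # longer hides failures in the others
                self.assertTrue(fname.endswith('.ess'))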
def extract_random_segments_for_given_patient_during_warning(self, segment_label, patient_no):
    # during warning, related to AR(l) / autocorrelation lag
    current_patient_ = patient_list[patient_no]
    patient_ann_ = current_patient_[:-4] + '-nsrr.xml'
    ann_, onset_, duration_ = extract_anns(TEST_ANN_PATH + patient_ann_)
    eeg_dict_, info_dict_ = extract_data(TEST_DATA_PATH + current_patient_,
                                         ann_, onset_)
    # fixed: the original indexed the undefined name `eeg_dict`
    return eeg_dict_[segment_label][np.random.choice(len(eeg_dict_[segment_label]))]
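# The EEG snippets in this section all lean on two project helpers that are
# not shown. The stubs below only document the interface implied by the call
# sites (argument names are guesses, not the project's actual code):

def extract_anns(xml_path):
    """Return (annotations, onsets, durations) parsed from an *-nsrr.xml file."""
    ...

def extract_data(edf_path, ann, onset, last_duration=None, preprocess=None,
                 return_stats=False):
    """Return (eeg_dict, info_dict), where eeg_dict maps a sleep-stage label
    to the list of EEG segments carrying that label; with return_stats=True
    the second element holds per-recording statistics instead."""
    ...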
def extract_test_segments_for_given_patient(self, patient_no):  # helper
    current_patient = patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(TEST_ANN_PATH + patient_ann)
    eeg_dict, info_dict = extract_data(TEST_DATA_PATH + current_patient,
                                       ann, onset, duration[-1])
    len_dict = {label: len(segs) for label, segs in eeg_dict.items()}

    selected_tuples = []
    for _ in range(excess_segments_needed_per_patient):
        for label in eeg_dict.keys():
            if label == 4:  # label-4 segments are excluded
                continue
            if len_dict[label] != 0:
                idx = np.random.choice(len_dict[label])
                selected_tuples.append((int(label), eeg_dict[label][idx]))

    print(f"RAM: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024} MB")
    for t in selected_tuples:
        yield t
def get_ref_dicts(self):
    # indices of fit dicts, found beforehand; the FitDictFinder class can be
    # used to regenerate this list
    fit_ref_list = [3, 10, 19, 21, 22, 26, 28, 33, 36, 38, 40, 42, 43, 51, 52,
                    53, 55, 56, 57, 60, 62, 64, 70, 72, 73, 82, 83, 85, 87, 89,
                    90, 93, 95, 101, 103, 107, 109, 113, 115, 116, 117, 123,
                    130, 131, 135, 138, 139, 141, 148, 155, 156, 157, 159]
    selected_ref_indices = list(
        np.random.choice(fit_ref_list, size=self.num_patients, replace=False))
    for i in selected_ref_indices:
        ref = patient_list[i]
        ann_ref = ref[:-4] + '-nsrr.xml'
        ann, onset, duration = extract_anns(TRAIN_ANN_PATH + ann_ref)
        preprocess = 'std'
        eeg_dict = extract_data(TRAIN_DATA_PATH + ref, ann, onset,
                                duration[-1], preprocess=preprocess)
        # use a distinct loop variable so the outer `i` is not shadowed
        len_dict = {label: len(eeg_dict[label]) for label in eeg_dict.keys()}
        print(len_dict)
        print(ref)
        print(ann_ref)
        yield eeg_dict
def extract_random_segments_for_given_patient_during_warning(self, segment_label, patient_no):
    # during warning, related to AR(l) / autocorrelation lag
    current_patient_ = self.patient_list[patient_no]
    print(f'\nCurrent Patient: {current_patient_}')
    patient_ann_ = current_patient_[:-4] + '-nsrr.xml'
    ann_, onset_, duration_ = extract_anns(self.ann_path + patient_ann_)
    eeg_dict_, info_dict_ = extract_data(self.data_path + current_patient_,
                                         ann_, onset_, duration_[-1])
    # note the -1: the final segment with this label is never chosen
    idx = np.random.choice(len(eeg_dict_[segment_label]) - 1)
    return (int(segment_label), eeg_dict_[segment_label][idx])
def extract_random_segments_for_given_patient(self, patient_no, num_segs_chosen_per_patient):
    # helper
    current_patient = self.patient_list[patient_no]
    print(f'\nCurrent Patient: {current_patient}')
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(self.ann_path + patient_ann)
    eeg_dict, info_dict = extract_data(self.data_path + current_patient,
                                       ann, onset, duration[-1])
    len_dict = {label: len(eeg_dict[label]) for label in eeg_dict.keys()}
    print(len_dict)

    # collect all (label, segment) pairs
    # (earlier experiments kept labels 0 and 2 with probability 0.3 and
    # non-reference labels with probability 0.2)
    tuples = []
    for label in eeg_dict.keys():
        for seg in range(len_dict[label]):
            tuples.append((int(label), eeg_dict[label][seg]))

    # popping after shuffling is equivalent to sampling randomly
    random.shuffle(tuples)
    selected_tuples = [tuples.pop() for _ in range(num_segs_chosen_per_patient)]
    del tuples

    for t in selected_tuples:
        yield t
def __init__(self):
    data, target = extract_data()
    # split the data into training and testing sets
    X_train, X_test, Y_train, Y_test = \
        train_test_split(data, target, test_size=0.3)
    clf = RandomForestClassifier(n_estimators=500)
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy: ", metrics.accuracy_score(Y_test, y_pred))
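# Doing the split, fit, and scoring inside __init__ makes the class hard to
# reuse. A self-contained sketch of the same flow as a plain function, with
# sklearn's iris dataset standing in for the project's extract_data():
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def evaluate_random_forest(data, target, n_estimators=500, test_size=0.3):
    X_train, X_test, Y_train, Y_test = train_test_split(
        data, target, test_size=test_size)
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(X_train, Y_train)
    return metrics.accuracy_score(Y_test, clf.predict(X_test))

data, target = load_iris(return_X_y=True)
print("Accuracy:", evaluate_random_forest(data, target))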
def write_final(dirname, work, final, extract_methods):
    df = extract_data(work)

    if 'csv' in extract_methods:
        csv = os.path.join(final, dirname + ".csv")
        df.to_csv(csv, index=False, header=True)
        print "\tSUCCESS: Extracted data from .out file. CSV written to ./final/%s.csv" % dirname

    if 'sqlite3' in extract_methods:
        db_path = os.path.join(final, "data.db")
        conn = sqlite3.connect(db_path, timeout=10)  # 10 seconds to avoid write deadlock?
        try:
            sqlio.write_frame(df, name='trees_fvsaggregate', con=conn,
                              flavor='sqlite', if_exists='append')
        except sqlite3.IntegrityError as e:
            if e.message.endswith("are not unique"):
                # try to drop and rerun
                cursor = conn.cursor()
                delete_sql = """DELETE FROM trees_fvsaggregate
                    WHERE var = '%(var)s'
                    AND rx = %(rx)d
                    AND cond = %(cond)d
                    AND site = %(site)d
                    AND climate = '%(climate)s' """ % df.irow(0)  # assume the dataframe has the same data
                res = cursor.execute(delete_sql)
                if res.rowcount > 0:
                    print "\tNOTICE : Deleting %d old rows from ./final/data.db" % res.rowcount
                # try again
                sqlio.write_frame(df, name='trees_fvsaggregate', con=conn,
                                  flavor='sqlite', if_exists='append')
            else:
                # something else went wrong
                conn.rollback()
                raise sqlite3.IntegrityError(e.message)
        conn.commit()
        conn.close()
        print "\tSUCCESS: Extracted data from .out file. Row appended to ./final/data.db"
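# The snippet above is Python 2 and uses the long-removed
# pandas.io.sql.write_frame. A sketch of the same delete-then-append recovery
# on Python 3 with current pandas (same table and key columns; a sketch, not
# a drop-in replacement):
import sqlite3
import pandas as pd

def append_with_retry(df, db_path):
    conn = sqlite3.connect(db_path, timeout=10)
    first = df.iloc[0]
    # parameterized SQL replaces the %-interpolated DELETE
    conn.execute(
        "DELETE FROM trees_fvsaggregate "
        "WHERE var = ? AND rx = ? AND cond = ? AND site = ? AND climate = ?",
        (first['var'], first['rx'], first['cond'], first['site'],
         first['climate']))
    df.to_sql('trees_fvsaggregate', conn, if_exists='append', index=False)
    conn.commit()
    conn.close()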
def __init__(self, depth=10, min_leaf=10, num_trees=10):
    data, target = extract_data()
    # split the data into training and testing sets
    X_train, X_test, Y_train, Y_test = \
        train_test_split(data, target, test_size=0.3)
    self.max_depth = depth
    self.min_leaf = min_leaf
    self.n_features = int(sqrt(len(data[0]) - 1))
    self.num_trees = num_trees
    self.combine_lists(X_train, Y_train)
    self.combine_lists(X_test, Y_test)
    self.generate_random_forest(X_train, Y_train, X_test, Y_test)
def scrape_world_select():
    dt = datetime.now()
    response = get_osrs_world_select()
    status_code = response.status_code
    if response.ok:
        # extract -> transform -> load
        world_data, total_player_data = extract_data(response)
        world_data, total_player_count = transform_data(
            world_data, total_player_data, dt)
        load_data(world_data, total_player_count)
    else:
        print('Bad Response - HTTP', status_code)
    update_logs(dt, status_code)
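# get_osrs_world_select() is defined elsewhere; it plausibly reduces to a
# single requests call. A sketch under that assumption (the URL is a guess
# at the OSRS world-select page, not confirmed by the source):
import requests

def get_osrs_world_select():
    return requests.get('https://oldschool.runescape.com/slu', timeout=30)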
def specific_patient_dataset(patient_no):
    current_patient = patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(TRAIN_ANN_PATH + patient_ann)
    eeg_dict, info_dict = extract_data(TRAIN_DATA_PATH + current_patient,
                                       ann, onset)
    len_dict = {}
    eeg_dict_list = []
    for label in eeg_dict.keys():
        len_dict[label] = len(eeg_dict[label])
        if label == 4 and len_dict[label] != 0:
            print(f"{len_dict[label]} keys of segment 4 in patient no: {patient_no}")
        random.shuffle(eeg_dict[label])
    eeg_dict_list.append(eeg_dict)
    return eeg_dict_list, len_dict
def extract_random_segments_for_given_patient(self, patient_no, num_segs_chosen):  # helper
    current_patient = patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(TRAIN_ANN_PATH + patient_ann)
    eeg_dict, info_dict = extract_data(TRAIN_DATA_PATH + current_patient,
                                       ann, onset, duration[-1])
    len_dict = {}
    labels_available = []
    for label in eeg_dict.keys():
        len_dict[label] = len(eeg_dict[label])
        if len_dict[label] != 0:
            labels_available.append(label)

    # collect all (label, segment) pairs
    tuples = []
    for label in eeg_dict.keys():
        for seg in range(len_dict[label]):
            tuples.append((int(label), eeg_dict[label][seg]))

    # popping after shuffling is equivalent to sampling randomly
    random.shuffle(tuples)
    selected_tuples = []
    for i in range(num_segs_chosen):
        selected_tuples.append(tuples.pop())
        print(f"{i}th segment chosen of label {selected_tuples[i][0]} from patient {patient_no}")
    del tuples

    print(f"RAM: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024} MB")
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))

    for t in selected_tuples:
        yield t
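# The shuffle-then-pop idiom above is equivalent in distribution to sampling
# without replacement; random.sample does the same in one call. A minimal
# demonstration with stand-in data:
import random

tuples = [(label, f"seg{i}") for label in range(5) for i in range(3)]

shuffled = list(tuples)
random.shuffle(shuffled)
picked_by_popping = [shuffled.pop() for _ in range(4)]

picked_by_sampling = random.sample(tuples, 4)  # same distribution, one call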
def get_fit_dict_indices(self):
    fit_dict_indices = []
    for i in range(len(patient_list)):
        current_patient = patient_list[i]
        patient_ann = current_patient[:-4] + '-nsrr.xml'
        ann, onset, duration = extract_anns(TRAIN_ANN_PATH + patient_ann)
        preprocess = 'std'
        eeg_dict, info_dict = extract_data(TRAIN_DATA_PATH + current_patient,
                                           ann, onset, duration[-1],
                                           preprocess=preprocess)
        flag = self.is_dict_fit(eeg_dict)
        print(i, flag)
        if flag:
            fit_dict_indices.append(i)
    print(fit_dict_indices)
    # return the indices as well, so callers need not scrape them from stdout
    return fit_dict_indices
def buildDB():
    # clean the data file at private/filename
    noisy_file = os.path.join(request.folder, 'private', '027__BTECH__6TH SEM.txt')
    clean_file = os.path.join(request.folder, 'private', 'cleanFile.txt')
    preprocess.cleanData(noisy_file, clean_file)

    # extract college information
    lines = preprocess.readFile(clean_file)
    colleges = extract.extract_data(lines)

    # insert into table colleges
    for name in colleges.keys():
        db.colleges.insert(name=name)

    # insert into table students
    credits = (4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1)
    total_credits = sum(credits)
    for name in colleges.keys():
        for student in colleges[name].students:
            # credit-weighted percentage across the 11 subjects
            weighted = sum(c * int(m) for c, m in zip(credits, student.marks))
            percentage = (weighted * 1.0) / total_credits
            if percentage < 50.0:
                continue
            collegeid = db(db.colleges.name == name).select()[0]['id']
            db.students.insert(colleges_id=collegeid,
                               rollNo=student.rollNo[1:],
                               name=student.name,
                               subj1=int(student.marks[0]),
                               subj2=int(student.marks[1]),
                               subj3=int(student.marks[2]),
                               subj4=int(student.marks[3]),
                               subj5=int(student.marks[4]),
                               subj6=int(student.marks[5]),
                               subj7=int(student.marks[6]),
                               subj8=int(student.marks[7]),
                               subj9=int(student.marks[8]),
                               subj10=int(student.marks[9]),
                               subj11=int(student.marks[10]),
                               percentage=percentage)
    return 'db built'
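# Quick sanity check of the credit weighting above: with uniform marks the
# weighted percentage must equal the mark itself (hypothetical student):
credits = (4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1)  # total = 29
marks = ['60'] * 11
weighted = sum(c * int(m) for c, m in zip(credits, marks))
assert weighted * 1.0 / sum(credits) == 60.0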
def extract_random_segments_for_given_patient_during_warning(self, segment_label, patient_no):
    # during warning, related to AR(l) / autocorrelation lag
    current_patient_ = patient_list[patient_no]
    patient_ann_ = current_patient_[:-4] + '-nsrr.xml'
    ann_, onset_, duration_ = extract_anns(TRAIN_ANN_PATH + patient_ann_)
    eeg_dict_, info_dict_ = extract_data(TRAIN_DATA_PATH + current_patient_,
                                         ann_, onset_, duration_[-1])
    len_dict = {}
    labels_available = []
    for label in eeg_dict_.keys():
        len_dict[label] = len(eeg_dict_[label])
        if len_dict[label] != 0:
            labels_available.append(label)
    # note: the incoming segment_label is overwritten with a random available
    # label, and the middle segment of that label is returned
    segment_label = np.random.choice(labels_available)
    return (int(segment_label),
            eeg_dict_[segment_label][(len(eeg_dict_[segment_label]) - 1) // 2])
def extract_test_segments_for_given_patient(self, patient_no):  # helper
    current_patient = patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(TEST_ANN_PATH + patient_ann)
    preprocess = None  # no preprocessing
    eeg_dict, stat = extract_data(TEST_DATA_PATH + current_patient, ann,
                                  onset, duration[-1], preprocess=preprocess,
                                  return_stats=True)
    len_dict = {label: len(eeg_dict[label]) for label in eeg_dict.keys()}

    selected_tuples = []
    for label in eeg_dict.keys():
        if len_dict[label] != 0:
            # take up to 11 segments for label 1, up to 10 for every other label
            cap = 11 if label == 1 else 10
            seg_indices = np.random.choice(len(eeg_dict[label]),
                                           min(cap, len(eeg_dict[label])),
                                           replace=False)
            for j in seg_indices:
                selected_tuples.append((int(label), eeg_dict[label][j]))

    for t in selected_tuples:
        yield t, stat
def extract_random_segments_for_given_patient(self, patient_no, num_segs_chosen_per_patient):  # helper
    current_patient = self.patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(self.ann_path + patient_ann)
    preprocess = None  # getting un-preprocessed segments
    eeg_dict, stat = extract_data(self.data_path + current_patient, ann,
                                  onset, duration[-1], preprocess=preprocess,
                                  return_stats=True)
    self.stats.append(stat)
    len_dict = {label: len(eeg_dict[label]) for label in eeg_dict.keys()}

    selected_tuples = []
    labels = []
    tuples = []  # all (label, segment)
    # keep every label-1 segment...
    for label in [1]:
        for seg in range(len_dict[label]):
            selected_tuples.append((int(label), eeg_dict[label][seg]))
            labels.append(label)
    # ...and fill the remaining quota with a random sample of the other labels
    for label in [0, 2, 3, 4]:
        for seg in range(len_dict[label]):
            tuples.append((int(label), eeg_dict[label][seg]))
    random.shuffle(tuples)
    for _ in range(num_segs_chosen_per_patient - len(selected_tuples)):
        t = tuples.pop()
        selected_tuples.append(t)
        labels.append(t[0])
    del tuples

    self.segs_global.extend(labels)
    self.data_list.extend(selected_tuples)
    del selected_tuples
def main():
    result = False
    time = sys.argv[1]
    for value in LOAD_SURVEY_TYPES.values():
        result = load_data(value, time)
        if not result:
            print("\nExecution of Data Load of " + value + " Failed")
            break
    if result:
        print("\nExecution of Data Load Successful")
        result = procedure_execute(time)
        if result:
            print("\nExecution of Stored Procedures Successful")
            result = extract_data(time)
            if result:
                print("\nExecution of Data Extract Successful")
                print("\nExecution for " + time + " is Completed")
            else:
                print("\nExecution of Data Extract Failed")
        else:
            print("\nExecution of Stored Procedures Failed")
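# The nested ladder above is easier to audit with early returns. A sketch of
# the same flow (load_data, procedure_execute, extract_data, and
# LOAD_SURVEY_TYPES are the same names used above, assumed in scope):
def run_pipeline(time):
    for value in LOAD_SURVEY_TYPES.values():
        if not load_data(value, time):
            print("\nExecution of Data Load of " + value + " Failed")
            return
    print("\nExecution of Data Load Successful")
    if not procedure_execute(time):
        print("\nExecution of Stored Procedures Failed")
        return
    print("\nExecution of Stored Procedures Successful")
    if not extract_data(time):
        print("\nExecution of Data Extract Failed")
        return
    print("\nExecution of Data Extract Successful")
    print("\nExecution for " + time + " is Completed")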
# If the DenseNet features were pre-computed, we don't have to recompute them
load_densenet_features = os.path.isfile(densenet_features_path)
load_frames = os.path.isdir(frames_path)

## EXTRACTION ##

# If not already done, extract the relevant frames from the raw MUG dataset
if not load_frames:
    extract_frames(subjects_path, frames_path)

# Now extract the training and target data from the frames
if not load_densenet_features:
    x, y = extract_data(frames_path)
else:
    with open(y_data_path, 'rb') as f:
        y = pickle.load(f)

if load_densenet_features:
    with open(densenet_features_path, 'rb') as f:
        densenet_features = pickle.load(f)
else:
    # Create the DenseNet feature extractor (the Model keyword is `outputs`,
    # not `output`)
    densenet = DenseNet121(include_top=False,
                           input_shape=(img_size, img_size, nb_channels))
    densenet = Model(inputs=densenet.input,
                     outputs=GlobalAveragePooling2D(name='avg_pool')(densenet.output))

    # Extract the DenseNet features
    densenet_features = extract_model_features(densenet, x)
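# extract_model_features() is not shown; given the pooled model built above,
# it plausibly reduces to a batched predict. A stand-in sketch under that
# assumption (not the project's actual implementation):
def extract_model_features(model, x, batch_size=32):
    # run the truncated DenseNet over the frames and return the pooled
    # feature vectors, one row per input frame
    return model.predict(x, batch_size=batch_size, verbose=1)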