def getData(self):
    file_names, params = self.gen_query(self.lat, self.lon, self.date)
    file_names.reverse()
    params.reverse()
    # print("F: ", file_names)
    weather = []
    for file_name, param in zip(file_names, params):
        api_call = self.api_format + param
        resp = requests.get(api_call)
        d = DataExtractor(self.oID, resp.content)
        # print(d.maxPrecip(), ":", resp.content)
        # print(d.maxPrecip(), end=' ')
        weather.append(str(d.maxPrecip()))
        # weather.extend([d.maxPrecip(), d.maxTemp(), d.maxAirPressure(), d.maxHumidity(), d.maxWind()])
    # print()
    return weather
def extract_class_data(self, raw_data):
    extracted_data = []
    for a_class in raw_data:
        a_class_data = DataExtractor(a_class)
        if a_class_data.class_name is not None:
            extracted_data.append(a_class_data)
    return extracted_data
def decode(self, img, first_strip=False):
    softstrip_matrix = SoftstripMatrix(img, self.gray_img)
    header_extractor = HeaderExtractor(softstrip_matrix)
    header_extractor.parse_header()
    vertical_sync_start = header_extractor.vertical_sync_start
    self.bits_count = header_extractor.get_bits_per_row()

    if self.config['row_extractor'] == CNN_ROW_EXTRACTOR:
        row_extractor = CnnRowExtractor(softstrip_matrix.grayscale_matrix, softstrip_matrix.binary_matrix, self.bits_count)
        gray_grouped_matrix, grouped_matrix = row_extractor.extract_rows()
    else:
        row_extractor = AlgorithmicRowExtractor(softstrip_matrix, self.bits_count)
        grouped_matrix, gray_grouped_matrix = row_extractor.extract_rows()

    if self.config['row_decoder'] == CNN_ROW_DECODER:
        row_decoder = CnnRowDecoder(gray_grouped_matrix, self.start_time, self.bits_count, self.config['timeout'], vertical_sync_start)
        reduced_pixel_matrix = row_decoder.decode_rows()
    else:
        row_decoder = AlgorithmicRowDecoder(grouped_matrix, self.bits_count, self.start_time, self.config['timeout'])
        reduced_pixel_matrix = row_decoder.decode_rows()

    if len(reduced_pixel_matrix) == 0:
        print('[ERROR] ' + self.path + ' is invalid!')
    else:
        data_extractor = DataExtractor(self.config['timeout'])
        data_extractor.extract_data(reduced_pixel_matrix, first_strip, self.start_time)
        self.data += data_extractor.data
        if data_extractor.valid:
            print('Checksum valid!')
            if first_strip:
                self.strip_meta_info = data_extractor.file_header
                print(self.strip_meta_info)
        else:
            print('Checksum invalid!')
aws_client = AWSClient(aws_access_key_id=aws_access_key_id,
                       aws_secret_access_key=aws_secret_access_key)

# Constants for data extraction
LOGIN_URL = 'https://www.lendingclub.com/auth/login'
DOWNLOAD_URL = 'https://www.lendingclub.com/info/download-data.action'
DIR_PATH = 'Data\\DOWNLOAD_LOAN_DATA'

# Parameters for Lending Club data scraping; credentials are read from environment variables
fileTag = "loanStatsFileNamesJS"
email = os.environ['LENDING_CLUB_EMAIL']
password = os.environ['LENDING_CLUB_PASSWORD']

print('Downloading Files...')
# Extract data from the Lending Club site
de = DataExtractor(email, password)
de.extractData(LOGIN_URL=LOGIN_URL, DOWNLOAD_URL=DOWNLOAD_URL, fileTag=fileTag)

print('Ingesting Data...')
# Ingest data into the pipeline
di = DataIngestor(aws_client)

# Create landing and processed buckets
LANDING_BUCKET = 'lending-club-landing-data'
PROCESSED_BUCKET = 'lending-club-processed-data'
print('Creating Buckets...')
di.createS3Bucket(LANDING_BUCKET)
di.createS3Bucket(PROCESSED_BUCKET)
def get_arguments(argument_list):
    short_options = "d:"
    long_options = ["document="]
    try:
        document_file = ''
        arguments, values = getopt.getopt(argument_list, short_options, long_options)
        print(arguments)
        if len(arguments) < 1:
            print("Invalid arguments")
            sys.exit(2)
        for t in arguments:
            if t[0] in ("-d", "--document"):
                document_file = t[1]
                print(document_file)
        return document_file
    except getopt.error as err:
        print(str(err))
        sys.exit(2)


if __name__ == '__main__':
    wiki_10_file = get_arguments(sys.argv[1:])
    inverted_index = InvertedIndex()
    data_extractor = DataExtractor()
    structure_file_name = data_extractor.extract_data(wiki_10_file)
    index_file, vector_file = inverted_index.build_term_index(structure_file_name, wiki_10_file)
    print("Index file name: ", index_file)
    print("Vector file name: ", vector_file)
from DataExtractor import DataExtractor
from TextProcessing import TextProcessing
from TextClassifier import TextClassifier
import pandas as pd

train, target = DataExtractor('data.csv').data_producer('train')
test = DataExtractor('test.csv').data_producer('test')

text_classifier = TextClassifier()
(x_train, y_train), (x_validation, y_validation) = text_classifier.split_validation_data(train, target)

cleaned_x_train = TextProcessing(x_train).clean_text()
cleaned_x_validation = TextProcessing(x_validation).clean_text()
cleaned_x_test = TextProcessing(test).clean_text()

text_classifier.fit(cleaned_x_train, y_train)
text_classifier.evaluate(cleaned_x_validation, y_validation)
text_classifier.confusion(y_validation, cleaned_x_validation)

result = text_classifier.predict(cleaned_x_test)
pd.DataFrame(result, columns=['category']).to_csv('output.csv', index=True, index_label='index')
def main():
    total_start_time = time.time()

    # ------------------------------------------------------------------------ #
    # 0. PARSE INPUT ARGUMENTS
    # ------------------------------------------------------------------------ #
    data_file_name = "Crime_Weather_Cleaned_2017.csv"
    # data_file_name = "Crime20161718.csv"
    data_file_path = os.path.join(FOLDER_PATH, data_file_name)

    # ------------------------------------------------------------------------ #
    # 1. ESTABLISH DATABASE CONNECTION
    # ------------------------------------------------------------------------ #
    print("\n\n\t\t **** 1. DATABASE CONNECTION **** ")
    # host = 'localhost'
    # database = 'crime_star'
    # user = '******'
    # password = '******'
    port = '3306'
    data_loader = DataLoader()
    ret = data_loader.connect(host=DB_IP, database=DB, user=DB_UNAME, password=DB_PWD, port=port)
    if ret != 1:
        print(" Connection not established. Try again")
        print(" Check internet connectivity")
        return ret

    # ------------------------------------------------------------------------ #
    # 2. DATA EXTRACTION PHASE
    # ------------------------------------------------------------------------ #
    print("\n\n\t\t **** 2. DATA EXTRACTION **** ")
    data_extractor = DataExtractor()
    data_frame = data_extractor.read_csv(fpath=data_file_path, nrows_to_read=-1)

    # ------------------------------------------------------------------------ #
    # 3. DATA LOADING PHASE
    # ------------------------------------------------------------------------ #
    print("\n\n\t\t **** 3. DATA LOADING **** ")
    ret = data_loader.load_full_table(data_frame, table_name=RAW_TABLE_NAME)
    if ret == -1:
        print(" Could not upload to database ")
        data_loader.disconnect()
        return
    print("Successfully populated database")

    # ------------------------------------------------------------------------ #
    # 4. DISCONNECT THE DATABASE AND CLEAN UP MEMORY
    # ------------------------------------------------------------------------ #
    data_loader.disconnect()

    # ------------------------------------------------------------------------ #
    # 5. SEND A MESSAGE TO THE DATA HUB AS AN UPDATE
    # ------------------------------------------------------------------------ #
    print(" Sending message to data hub for update....", end="")
    messenger = Messenger()
    # Connect to the data hub
    messenger.connect(host=DATA_HUB_IP, uname=DATA_HUB_UNAME, pwd=DATA_HUB_PWD)
    # Connect to the exchange
    messenger.connect_to_exchange(ex_name=EX_NAME)
    # Send the update
    message = "Database updated with latest rows"
    messenger.send_message_to_exchange(ex_name=EX_NAME, message=message, topic=TOPIC)
    print("sent")

    total_end_time = time.time()
    print(" Total time taken :", total_end_time - total_start_time)
#!/usr/bin/python
import sys

from DataExtractor import DataExtractor

deExtractor = DataExtractor()
deExtractor.extractData(sys.argv[1])
"dim_date": dim_date, "dim_weather": dim_weather, "dim_location": dim_location, "dim_crime": dim_crime, } return star_tables if __name__ == "__main__": folder_path = "C:\\Users\\SSrih\\OneDrive\\UChicago\\DEP\\Project\\data" \ "\\Crime and Weather\\" # data_file_name = "CrimeWeather2010.csv" # data_file_name = "Crime2010Raw.csv" data_file_name = "Crime20161718.csv" data_file_path = os.path.join(folder_path, data_file_name) data_extractor = DataExtractor() data_frame = data_extractor.read_csv(fpath=data_file_path, nrows_to_read=5000) # print(data_frame.head()) data_worker = DataWorker() print(data_frame.isnull().sum().sum()) data_worker.process_pipeline(data_frame) print(data_frame.isnull().sum().sum())
def run(self):
    try:
        logInfo(self.userId, 'start crawling')
        de = DataExtractor()
        de.getServerId(self.userId)
        de.getWeightHistory(self.userId)
        de.getDietHistory(self.userId)
        de.getGroup(self.userId)
        de.getChallenge(self.userId)
        de.getBuddy(self.userId)
        logInfo(self.userId, 'Done crawling')
    except Exception as e:
        logException(self.userId, self.run.__name__, e)
from DataExtractor import DataExtractor
from Geolocation import Location
from DataFilter import DataFilter
import json

dataExObj = DataExtractor(dir='./dataset')
allFiles = dataExObj.FileList(ext='.json')
jsonFileDict = dataExObj.parse(allFiles)

# Data filtering
# Step 1: Join reviews and businesses by business id.
datafilter = DataFilter()
data = datafilter.JoinByAttribute(jsonFileDict['business'], jsonFileDict['review'], 'business_id')

# Step 2: Get the location.
loc = Location()
for key, value in data.items():
    latitude = data[key]['latitude']
    longitude = data[key]['longitude']
    location = loc.GetLocation(latitude, longitude)
    data[key] = datafilter.merge_dicts(data[key], {'location': location})
    print(data[key]['location'])

# TODO Step 3
# Step 3: Filter out locations outside the United States (see the sketch below).
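# A minimal sketch of how Step 3 might look, assuming GetLocation returns a
# human-readable address string that contains the country name; the
# 'United States' substring check and the us_data name are illustrative
# assumptions, not part of the original script.
us_data = {
    key: value
    for key, value in data.items()
    if 'United States' in str(value.get('location', ''))
}
print('Businesses located in the United States:', len(us_data))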
from DataExtractor import DataExtractor

if __name__ == '__main__':
    Extractor = DataExtractor()
    R = Extractor.Extract("realDonaldTrump")
    print(R)

    Extractor = DataExtractor()
    R = Extractor.Extract("realDonaldTrump")
    print(R)
import os

from DataExtractor import DataExtractor
from StridesIdentifier import StridesIdentifier
from FeaturesExtractor import FeaturesExtractor
from ClassifiersEvaluator import ClassifiersEvaluator
from Train import Train
from InfluxCalculator import InfluxCalculator
from Classifier import Classifier

models_dir = '../models'

# If the trained-models folder is empty, run the train phase first.
if len(os.listdir(models_dir)) == 0:
    # Train phase
    # Extract frames from the videos and create txt files with the data of interest from each video.
    de = DataExtractor('train')
    videos_list = de.extractor()

    # Identify strides from the per-video data (in txt files) and store stride information in txt files.
    si = StridesIdentifier(videos_list)
    si.identifier()

    # Extract all the features to analyse for each person.
    fe = FeaturesExtractor(videos_list)
    people_list = fe.extractor()

    # Evaluate the classifiers.
    ce = ClassifiersEvaluator(people_list)
    ce.evaluator()

    # Train the classifiers.
    t = Train(people_list)
    t.training()

# Test phase
# Extract frames from the videos and create txt files with the data of interest from each video.
de = DataExtractor('test')
# Clock to time how long the program runs
start_time = time.time()

# Set up the directories
mypath = os.getcwd() + '/data/seminar_testdata/test_untagged/'
trainingPath = 'data/training'
directory = os.fsencode(mypath)

# Run the ontology classification
print('Running Ontology Classification: \n')
ontology = Ontology()
ontology.run(mypath)

# Begin tagging
print("\nTagging progress beginning. Get a brew, it'll take a while... \n")
extractor = DataExtractor()
# Train our model
extractor.train(trainingPath)
tagger = Tagger()
# Tag all emails in the given directory
tagger.tag_seminar(mypath, directory, extractor)

# Calculate how long the program took
seconds = time.time() - start_time
m, s = divmod(seconds, 60)
print("The program has been running for {0} minutes and {1} seconds \n".format(round(m), round(s)))

# Evaluates results
from sklearn.linear_model import SGDClassifier
from sklearn import svm, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from DataExtractor import DataExtractor

# Aggregate the feature dictionaries from extractors 5-9 into one training set.
trainExtractor = DataExtractor(5)
trainData = trainExtractor.featureDictionary
for i in range(6, 10):
    trainData.update(DataExtractor(i).featureDictionary)

trainInput = list()
trainOutput = list()
for value in trainData.values():
    inputData = value[0]
    # list() is needed so the two dict views can be concatenated.
    trainInput.append(list(inputData[0].values()) + list(inputData[1].values()))
    trainOutput.append(1 if value[1] == 1 else 0)

# clf = SGDClassifier(loss="log", penalty="elasticnet")
# clf = SGDClassifier(loss="hinge")
clf = svm.SVC()
# clf = GradientBoostingClassifier(n_estimators=30, max_depth=3, subsample=.7)
# clf = KNeighborsClassifier(n_neighbors=3)

scaledTrainInput = preprocessing.scale(trainInput)
clf.fit(scaledTrainInput, trainOutput)

testExtractor = DataExtractor(10)
testData = testExtractor.featureDictionary
for i in range(11, 13):
    testData.update(DataExtractor(i).featureDictionary)
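# A short sketch (not in the original script) of how the held-out data above
# might be turned into inputs and scored; the loop mirrors the training-set
# construction, and the scaling step plus the accuracy printout are assumptions.
# A StandardScaler fitted on the training set would be the more careful choice.
testInput = list()
testOutput = list()
for value in testData.values():
    inputData = value[0]
    testInput.append(list(inputData[0].values()) + list(inputData[1].values()))
    testOutput.append(1 if value[1] == 1 else 0)

scaledTestInput = preprocessing.scale(testInput)
print("Test accuracy:", clf.score(scaledTestInput, testOutput))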
def __init__(self, filepath):
    # self.x_train, self.y_train = DataExtractor(filepath).split_labels()
    self.x_train, self.x_test, self.y_train, self.y_test = DataExtractor(filepath).split_validation()
from DataExtractor import DataExtractor
import sys

username = sys.argv[1]
KEY = sys.argv[2]
SECRET = sys.argv[3]
filters = ""

Extractor = DataExtractor()
Extractor.Extract(username, filters, KEY, SECRET)
from DataExtractor import DataExtractor

Extractor = DataExtractor()
Extractor.Extract()