def run_etl(filename):
    logger.info("application ran")
    start = time.time()

    app = Extract()
    raw_data_list = app.get_data_from_bucket(filename)  # extract output
    end_extract = time.time()
    extract_time = round(end_extract - start, 4)
    print(f"Extract time: {extract_time}")
    logger.info(f"Extract time: {extract_time}")

    apple = Transform()
    # raw data into transform; returns transformed data and the drinks dict
    transformed_data, transformed_drink_menu_data = apple.transform_new_data(raw_data_list)
    end_transform = time.time()
    transform_time = round(end_transform - end_extract, 4)
    logger.info(f"Transform time: {transform_time}")
    print(f"Transform time: {transform_time}")

    appley = Load()
    appley.save_transaction(transformed_data)  # populate RDS instance with cleaned data
    appley.save_drink_menu(transformed_drink_menu_data)  # generate drinks menu
    end_load = time.time()
    load_time = round(end_load - end_transform, 4)
    logger.info(f"Loading time: {load_time}")

    total_time = extract_time + transform_time + load_time
    logger.info(f"total time: {total_time}")
    print(f"Load time: {load_time}\nTotal time: {total_time}")
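# Minimal, hypothetical invocation sketch for run_etl above; the bucket key is a
# placeholder, and logger/time/Extract/Transform/Load are assumed to be set up
# at module level as in the snippet.
if __name__ == "__main__":
    run_etl("example/transactions.csv")  # placeholder object key, not from the original project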
def __init__(self, infos, category):
    self.infos = infos
    self.category = category

    if not os.path.exists('data/category'):
        os.makedirs('data/category')
    with open('data/category/' + category + '.csv', 'a', newline='',
              encoding='utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        if os.path.getsize('data/category/' + category + '.csv') == 0:
            spamwriter.writerow([
                'product_page_url', 'universal_product_code', 'title',
                'price_including_tax', 'price_excluding_tax',
                'number_available', 'product_description', 'category',
                'review_rating', 'image_url'
            ])
        # write the scraped fields as one row (one field per header column)
        spamwriter.writerow(self.infos)

    if not os.path.exists('data/images/' + category):
        os.makedirs('data/images/' + category)
    # strip characters that are invalid in file names from the title
    image_name = infos[2].translate({ord(i): None for i in '<>:"“\\|?/*'})
    with open('data/images/' + category + '/' + image_name + '.jpg', 'wb') as f:
        a = Extract(infos[9])
        f.write(a.get_url_to_download())
def etl_fact_macro_details(source_engine, target_engine):
    """Main ETL function for fact_macro_details.

    ETL from the tag_detail table on 235 to the fact_macro_details table on 240.

    :param source_engine: source database engine
    :param target_engine: target database engine
    """
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record(table='fact_macro_detail', record_path='rec.cfg')

    start_params = record.get_record()
    divisions = extract.std_divisions()

    for i in range(start_params['rounds']):
        start_id = start_params['update_id'] + i * start_params['chunksize'] + 1
        end_id = start_params['update_id'] + (i + 1) * start_params['chunksize'] + 1

        tag_details = extract.tag_details(start_id, end_id)
        if len(tag_details) == 0:
            continue

        macro_details = transform.compile_datasets(tag_details, divisions)
        load.loading(macro_details)

        update_id = tag_details['id'].max() if tag_details['id'].max() else start_params['update_id']
        record.update_record(update_id)
def getPhrases():
    """Run the phrase extraction step."""
    from extract import Extract
    ex = Extract()
    ex.run()
def etl_fact_draw_main(engine_source, engine_target, chunksize=5000,
                       record_file='etl_fact_draw.record'):
    """ETL for the drawing fact table.

    :param engine_source: source database engine
    :param engine_target: target database engine
    """
    extract = Extract(engine_source, chunksize, record_file)
    transform = Transform()
    load = Load(engine_target)

    # Extract the datasets
    df_industry, df_draw_gen = extract.extract_main()
    logging.info('Extract datasets completed.')

    for k, df_draw in enumerate(df_draw_gen, 1):
        logging.info('Round %d, from obs. %d to obs. %d, start.'
                     % (k, (k - 1) * chunksize, k * chunksize))
        # Clean and transform the data
        df_clean = transform.transform_main(df_industry, df_draw)
        logging.info('Round %d, data cleaning completed.' % k)
        try:
            load.load_main(df_clean)
            logging.info('Round %d, loading %d obs. succeeded.' % (k, len(df_clean)))
            with open(record_file, 'w') as f:
                f.write(str(max(df_draw['id'])))
        except Exception as e:
            df_clean[['drawGuid', 'marketGuid']].to_csv('unsecceed_samples.csv',
                                                        mode='a', index=False)
            logging.error('Round %d, %s' % (k, e))
            raise
def test_extract_to_list_return_np_ndarray():
    image = np.zeros((300, 300, 3))
    extract = Extract()
    list_of_faces = extract.extract_face_to_list(image)
    # The extractor may return strings instead of face arrays; accept either here.
    if type(list_of_faces[0]) is str:
        assert isinstance(list_of_faces[0], str)
    else:
        assert isinstance(list_of_faces[0], np.ndarray)
def parseType(self, child, lastNode, isPrivate, nodes):
    if child.tag == 'private_type_declaration' and self.extractPriv:
        return None
    recNode = Extract.getRecordNode(child)
    if recNode is None:
        element = Extract.getType(child, self.sourcefile)
    else:
        element = Extract.getRecord(child)
        self.parseRecursive(recNode, element['components'], child, isPrivate)
    return element
def __init__(self, dataSource, dataSet):
    # creating Extract class object here, to fetch data using its generic
    # methods for API and CSV data sources
    extractObj = Extract()
    if dataSource == 'api':
        self.data = extractObj.getAPISData(dataSet)
        funcName = dataSource + dataSet
        # getattr takes the method name of this class and calls it
        getattr(self, funcName)()
    else:
        print('Unknown Data Source!')
def __init__(self):
    self.logCPF = None
    self.logNumber = None
    self.extract = Extract()
    Connection.__init__(self)

    sql = ('create table if not exists db_client ('
           'CPF integer primary key, '
           'Name varchar(45) not null, '
           'Surname varchar(45) not null)')
    self.execute(sql)
    self.commit()

    sql = ('create table if not exists db_account ('
           'Number integer primary key, '
           'Name varchar(45) not null, '
           'Surname varchar(45) not null, '
           'CPF integer, '
           'Balance real not null, '
           'Password varchar(45) not null, '
           'Limits real not null)')
    self.execute(sql)
    self.commit()
def etl_dimension_time(target_engine):
    """Main function for the time dimension table.

    :param target_engine: target database engine
    """
    extract = Extract()
    transform = Transform()
    load = Load(target_engine)

    full_time = extract.gen_full_time()
    time_table = transform.gen_date(full_time)
    load.loading(time_table)
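# Hedged invocation sketch for the dimension loaders above: it assumes the
# target engine is a SQLAlchemy engine and uses a placeholder connection string.
from sqlalchemy import create_engine

target_engine = create_engine("mysql+pymysql://user:password@localhost:3306/warehouse")  # placeholder DSN
etl_dimension_time(target_engine)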
def __init__(self, number, client, balance, password, limit=1000.0):
    '''
    DESCRIPTION: This class is used to register the account that the
    client will use at the bank.
    '''
    self._number = number
    self._holder = client
    self._balance = balance
    self._limit = limit
    self._password = password
    self._extract = Extract()
    Account._total_accounts += 1
def __init__(self, dataSource, dataSet):
    extractObj = Extract()
    if dataSource == 'api':
        self.data = extractObj.getApiData(dataSet)
        funcName = dataSource + dataSet
        # getattr takes the method name of this class and calls it
        getattr(self, funcName)()
    elif dataSource == 'csv':
        self.data = extractObj.getCsvData(dataSet)
        funcName = dataSource + dataSet
        getattr(self, funcName)()
def get_subreddits_links_to_build_task(self):
    base_ = Base()
    extract_ = Extract()
    list_subreddits_data = base_.get_data_list_subreddits()
    downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
    urls = extract_.get_urls_for_all_subreddits(subreddits=list_subreddits_data,
                                                start_date=self.st_dt,
                                                end_date=self.end_dt)
    if len(downloaded_subs) > 0:
        urls = list(set(urls) - set(downloaded_subs))
        print("Already downloaded {} sub-reddits, yet to download {} sub-reddits".format(
            len(downloaded_subs), len(urls)))
        print("Completed {:.2%}".format(
            len(downloaded_subs) / (len(downloaded_subs) + len(urls))))
    return urls
def act_extract(self):
    if self.filepath_in is None or not os.path.exists(self.filepath_in):
        self.act_open()

    # initialize
    self.combobox.clear()
    self.combobox.addItem('原件')          # "original file"
    self.combobox.addItem(r'提取的文件')    # "extracted file"
    self.combobox.setCurrentIndex(1)

    # extract
    extract = Extract(self.filepath_in)
    extract.process(output_filepath=self.filepath_extract)
    # db['project_info'].set_db(extract.extract_project_infos())

    self.refresh_left_preview(self.filepath_extract)
def etl_demension_division(target_engine):
    """Main ETL function for the division table.

    Extracts data from the standard CSV tables crawled from the Bureau of
    Statistics and loads it into the data warehouse.

    :param target_engine: target database engine
    """
    extract = Extract()
    transform = Transform()
    load = Load(target_engine)
    logging.info('Initialize three instances')

    division_datasets = extract.std_divisions()
    std_districts = transform.std_districts(division_datasets)
    load.loading(std_districts)
def __init__(self, datasource, dataset):
    self.csv_df = pd.DataFrame()
    # create the Extract object
    extract_obj = Extract()
    if datasource == 'api':
        self.data = extract_obj.get_api_data(dataset)
        func_name = datasource + "_" + dataset
        getattr(self, func_name)()
    elif datasource == 'csv':
        self.data = extract_obj.get_csv_data(dataset)
        func_name = datasource + "_" + dataset
        getattr(self, func_name)()
    else:
        print('Unknown data source!!! Please try again...')
def main():
    # ticker = 'GME'     # GameStop
    ticker = 'BTC-USD'   # Bitcoin
    # ticker = '^GSPC'   # S&P 500 index
    start = datetime.date(2021, 1, 1)  # start looking at the stock from this date

    ext = Extract(ticker=ticker, start=start)
    data = ext.read_data()
    ext.convert_csv(data)

    fst = Forest(ticker=ticker, start=start)
    fst.model_outliers()

    ans = Analysis(ticker=ticker, start=start)
    ans.plot_outliers()
def __init__(self, data_source, data_set):
    extract_obj = Extract()
    func_name = f"{data_source}_{data_set}"
    if data_source == 'api':
        self.data = extract_obj.get_api_data(data_set)
    elif data_source == 'csv':
        self.data = extract_obj.get_csv_data(data_set)
    else:
        print('Unknown Data Source')
        return
    # getattr takes the method name of this class and calls it
    getattr(self, func_name)()
def etl_fact_market(source_engine, target_engine, rec_path):
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record('rec.cfg')

    start_params = record.get_record()
    # NOTE: these collections start empty in this snippet, and `industry` and
    # `sample_tag_counts` below are presumably prepared elsewhere via extract.* calls.
    unique_marketguid = []
    done_market = []
    has_dealed = []

    for i, grandParentId in enumerate(unique_marketguid):
        if len(grandParentId) != 36:
            # check that grandParentId is valid
            logging.error('Round %d, %s is not valid.' % (i, grandParentId))
            continue
        elif grandParentId in done_market:
            # this market has already been through ETL
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        if grandParentId in has_dealed:
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        else:
            has_dealed.append(grandParentId)

        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue
        rent = extract.rent_details(grandParentId)
        industry_tmp = industry[industry['grandParentId'] == grandParentId]

        # transform the data
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # combine the data
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
def extract_descriptions(self, widget, device):
    print("extract xml descriptions", widget, device)
    from extract import Extract
    id = '@'.join((device.get_usn(), 'DeviceXMlExtract'))
    try:
        self.windows[id].show()
    except KeyError:
        ui = Extract(device)
        self.windows[id] = ui.window
def __init__(self, dataSource, dataSet):
    # creating Extract class object here, to fetch data using its generic
    # methods for API and CSV data sources
    extractObj = Extract()
    if dataSource == 'api':
        self.data = extractObj.getAPIsData(dataSet)
        funcName = dataSource + dataSet
        # getattr takes the method name of this class and calls it
        getattr(self, funcName)()
    else:
        print('Unknown Data Source!!! Please try again...')

# Economy Data Transformation
def apiEconomy(self):
    gdp_india = {}
    for record in self.data['records']:
        gdp = {}
        # taking out the yearly GDP value from the records
        gdp['GDP_in_rs_cr'] = int(
            record['gross_domestic_product_in_rs_cr_at_2004_05_prices'])
        gdp_india[record['financial_year']] = gdp

    gdp_india_yrs = list(gdp_india)
    for i in range(len(gdp_india_yrs)):
        if i == 0:
            pass
        else:
            key = 'GDP_Growth_' + gdp_india_yrs[i]
            # calculating GDP growth on a yearly basis
            gdp_india[gdp_india_yrs[i]][key] = round(
                ((gdp_india[gdp_india_yrs[i]]['GDP_in_rs_cr'] -
                  gdp_india[gdp_india_yrs[i - 1]]['GDP_in_rs_cr']) /
                 gdp_india[gdp_india_yrs[i - 1]]['GDP_in_rs_cr']) * 100, 2)

    # connection to MongoDB
    mongoDB_obj = MongoDB('GDP')
    # insert data into MongoDB
    mongoDB_obj.insert_into_db(gdp_india, 'India_GDP')
def read_mail(self, view, attachment=False):
    """Read the latest mail.

    @param view: the view (folder) to access
    @param attachment: boolean, whether to fetch the attachment
    @return: dict with the info of a mail
    """
    result = {}
    documents = self.get_documents(view)
    latest_document = documents[-1:][0]

    extra_obj = Extract(latest_document)
    result = extra_obj.extract()
    if attachment:
        extra_obj.get_attachment()
    return result
def parseRecursive(self, node, elements, parent=None, isPrivate=False):
    lastNode = None
    i = 0
    for child in node:
        element = None
        has_body_declarative_items_ql = False
        if child.tag in ['procedure_body_declaration', 'function_body_declaration']:
            element = Extract.getFunction(child, lastNode, self.sourcefile)
        elif child.tag in ['function_declaration', 'generic_function_declaration',
                           'procedure_declaration', 'generic_procedure_declaration',
                           'single_task_declaration', 'task_type_declaration',
                           'protected_type_declaration', 'single_protected_declaration']:
            element = Extract.getFunctionHead(child, lastNode, self.sourcefile, self.extractPriv)
        elif child.tag in ['ordinary_type_declaration', 'subtype_declaration',
                           'private_type_declaration']:
            element = self.parseType(child, lastNode, isPrivate, node)
        elif child.tag in ['component_declaration']:
            element = Extract.getRecordComponent(child, self.sourcefile)
        elif child.tag == 'package_body_declaration':
            element = Extract.getPackage(child, self.prefixClass, lastNode)
        elif child.tag in ['generic_package_declaration', 'package_declaration']:
            element = self.parsePackage(child, lastNode, isPrivate)
        elif child.tag == 'package_renaming_declaration':
            element = Extract.getRename(child)
        elif child.tag in ['attribute_definition_clause', 'record_representation_clause',
                           'enumeration_representation_clause'] and self.hideRepClause is False:
            element = Extract.getRepClause(child, self.prefixRepClause, self.sourcefile)
        elif child.tag in ['import_pragma']:
            self.imports.append(Extract.getImport(child, self.sourcefile))
        elif child.tag not in ['implementation_defined_pragma']:
            logging.info("Not parsed: " + child.tag)

        if element is not None:
            if parent is None:
                c = Extract.getUnitComment(self.root)
            else:
                c = Extract.getComment(node, i)
            element['comment'] = c
            element['is_private'] = isPrivate
            element['is_extract'] = c != '' or self.extractAll or parent is None
            if 'uri' in element:
                self.elementsByUris[element['uri']] = element
            elements.append(element)
            if element['has_childs'] and child.find('body_declarative_items_ql') is not None:
                isPrivateSubLevel = isPrivate
                if element['type'] == 'function':
                    isPrivateSubLevel = True
                self.parseRecursive(child.find('body_declarative_items_ql'),
                                    element['childs'], child, isPrivateSubLevel)
        i += 1
        lastNode = child
def main():
    logger.info("\t EXTRACT")
    # EXTRACT
    extraction = Extract(url, file_name, bucket_name, path)
    # download files
    extraction.download_url()
    # upload to S3 (mocking)
    s3_obj = S3mocking(file_name, bucket_name)
    s3_obj.load_save()

    # TRANSFORM
    logger.info("\t TRANSFORM")
    pfn = Transform(path, file_name).main()

    # LOAD
    s3_load = S3mocking(pfn, bucket_name)
    logger.info("\t LOAD")
    s3_load.load_save()
def parsePackage(self, child, lastNode, isPrivate):
    element = Extract.getPackage(child, self.prefixClass, lastNode)
    element['has_childs'] = False
    element['public'] = []
    element['private'] = []
    if child.find('visible_part_declarative_items_ql') is not None:
        self.parseRecursive(child.find('visible_part_declarative_items_ql'),
                            element['public'], child, isPrivate)
    if child.find('private_part_declarative_items_ql') is not None:
        self.parseRecursive(child.find('private_part_declarative_items_ql'),
                            element['private'], child, True)
    return element
def etl(game, extract_date, data_dir=DATA_DIR, db=load.DB_FILENAME):
    logger.info('Start ETL for game {0}'.format(game))
    load_date = datetime.today()
    data_dir = os.path.join(data_dir, game)

    if game == 'hb':
        trans_fun = Transform.hb_transform
    elif game == 'wwc':
        trans_fun = Transform.wwc_transform
    else:
        # guard against unknown games (otherwise trans_fun would be undefined)
        raise ValueError('Unknown game: {0}'.format(game))

    data = Extract.extract(data_dir, extract_date)
    data = Transform.transform(data, trans_fun, extract_date, load_date)
    Load.load(data, game, db=db)
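# Hedged call sketch for etl() above: 'hb' and 'wwc' are the games the branches
# handle; the extract date value (and its exact type) is an assumption.
from datetime import date

etl('hb', date(2021, 6, 1))  # illustrative extract date only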
def run_extraction(self):
    extract_ = Extract()
    base_ = Base()
    list_subreddits_data = base_.get_data_list_subreddits()
    downloaded_subs = base_.check_resume_file(file_path=self.resume_file)

    if len(downloaded_subs) > 0:
        remaining_list = list(set(list_subreddits_data) - set(downloaded_subs))
        print("Already downloaded {} sub-reddits, yet to download {} sub-reddits"
              .format(len(downloaded_subs), len(remaining_list)))
        print("Completed {:.2%}".format(
            len(downloaded_subs) / len(list_subreddits_data)))
        list_subreddits_data = remaining_list

    start_time = time.time()
    cost = 0
    for subreddit in list_subreddits_data:
        start_time, cost = extract_.start_extraction(
            subreddit=subreddit, start_date=self.st_dt, end_date=self.end_dt,
            base_path=self.sav_path, start_time=start_time, total_cost=cost)
        print(cost)
        print(start_time)
class BigGiant:

    def __init__(self, pkl_path, model_path):
        self.pkl_path = pkl_path
        self.model_path = model_path
        self.predict = Predict(self.pkl_path, self.model_path)
        self.extract = Extract()
        self.mtcnn = MTCNN()

    def extract_face_to_list(self, image):
        list_of_faces = self.extract.extract_face_to_list(image)
        return list_of_faces

    def predict_face(self, face):
        data = self.predict.predict_face(face)
        return data

    def read_image(self, image_path):
        image = self.extract.read_image(image_path)
        return image

    def aggregate_face_data(self, data, image_url):
        pass
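# Small usage sketch for the BigGiant facade above; the pickle, model and image
# paths are hypothetical placeholders, and only methods defined on the class are used.
giant = BigGiant("embeddings.pkl", "classifier.model")
image = giant.read_image("group_photo.jpg")
for face in giant.extract_face_to_list(image):
    print(giant.predict_face(face))  # prints whatever Predict.predict_face returns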
def etl_fact_market(*args):
    """Main function for the fact_market table.

    :param args: positional arguments engine_zone_macro, engine_draw, engine_target
    """
    # Unpack the positional engines (the original relied on these names being in scope)
    engine_zone_macro, engine_draw, engine_target = args

    # Initialize the extract, transform and load objects
    extract = Extract(engine_zone_macro, engine_draw, engine_target)
    transform = Transform()
    load = Load(engine_target)

    # Extract the markets that have already been through ETL
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    df_industry = extract.industry()
    has_dealed = []

    for i, sample_tag_counts in df_tag_counts.iterrows():
        grandParentId = sample_tag_counts['grandParentId']
        if len(grandParentId) != 36:
            # check that grandParentId is valid
            logging.warning('Round %d, %s is invalid, skipped.' % (i, grandParentId))
            continue
        elif grandParentId in done_market:
            # this market has already been through ETL
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        if grandParentId in has_dealed:
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        else:
            has_dealed.append(grandParentId)

        # Extract the data
        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue
        rent = extract.rent_details(grandParentId)
        industry_tmp = df_industry[df_industry['grandParentId'] == grandParentId]

        # Transform the data
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)

        # Combine the data
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)
        try:
            load.loading(clean)
            logging.info('Round %d, %s etl succeeded' % (i, grandParentId))
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
def market_to_api2(source, target, record_file='api2.record'):
    """Main ETL function for the api2 table in the anti_fraud database.

    :param source: source database engine
    :param target: target database engine
    :param record_file: file that records the last loaded id, defaults to 'api2.record'
    """
    # Initialize the objects
    extract = Extract(source, target, record_file)
    transform = Transform()
    load = Load(target, record_file)

    # Extract the data
    market_df = extract.market()
    draw_samples = extract.draw_samples()

    # Transform the data
    reshaped_market = transform.reshape_market(market_df)
    aggregated_samples = transform.aggregate_from_samples(draw_samples)
    api2_df = transform.compile_dfs(reshaped_market, aggregated_samples)

    # Load the data
    load.loading(api2_df)
def MatchAll(dDir, mDir, appDomains, pathPrefix=None, URL_CLUSTER=False):
    d = Extract("Desktop", dDir, "*.json", appDomains)
    m = Extract("Mobile", mDir, "*.json", appDomains)

    config = {"domains": appDomains, "prefix": pathPrefix, "URL_CLUSTER": URL_CLUSTER}
    ActionRecognizer(d, m).setConfig(config).run()

    if utils.verbose:
        print("*" * 80)
        print("Labelled Desktop Traces")
        d.printLabeledTraces()
        print("*" * 80)
        print("Labelled Mobile Traces")
        m.printLabeledTraces()

    if URL_CLUSTER:
        mapping = MatchURLTraces(d, m)
    else:
        mapping = MatchTraces(d, m)