Example #1
def run_etl(filename):
    logger.info("application ran")
    start = time.time()
    app = Extract()
    raw_data_list = app.get_data_from_bucket(filename) # extract output
    end_extract = time.time()
    extract_time = round(end_extract - start, 4)
    print(f"Extract time: {extract_time}")
    logger.info(f"Extract time: {extract_time}")
    apple = Transform()
    transformed_data, transformed_drink_menu_data = apple.transform_new_data(raw_data_list) # transform the raw data; returns the cleaned records and the drinks dict

    end_transform = time.time()
    transform_time = round(end_transform - end_extract, 4)
    logger.info(f"Transform time: {transform_time}")
    print(f"Transform time: {transform_time}")
    appley = Load()

    appley.save_transaction(transformed_data) # populate RDS instance with cleaned data.
    appley.save_drink_menu(transformed_drink_menu_data) # generate drinks menu
 
    end_load = time.time()
    load_time = round(end_load - end_transform, 4)
    logger.info(f"Loading time: {load_time}")
    total_time = extract_time + transform_time + load_time
    logger.info(f"total time: {total_time}")
    print(f"Load time: {load_time}\nTotal time: {total_time}")
Example #2
    def __init__(self, infos, category):

        self.infos = infos
        self.category = category
        if not os.path.exists('data/category'):
            os.makedirs('data/category')

        with open('data/category/' + category + '.csv',
                  'a',
                  newline='',
                  encoding='utf-8-sig') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    delimiter=' ',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            if os.path.getsize("data/category/" + category + '.csv') == 0:
                spamwriter.writerow([
                    'product_page_url', 'universal_product_code', 'title',
                    'price_including_tax', 'price_excluding_tax',
                    'number_available', 'product_description', 'category',
                    'review_rating', 'image_url'
                ])

            spamwriter.writerow(self.infos)  # write the fields themselves, not a one-element list

        if not os.path.exists('data/images/' + category):
            os.makedirs('data/images/' + category)

        # strip characters that are not allowed in file names before saving the image
        safe_title = infos[2].translate({ord(i): None for i in r'<>:"“\|?/*'})
        with open('data/images/' + category + "/" + safe_title + ".jpg",
                  "wb") as f:
            a = Extract(infos[9])
            f.write(a.get_url_to_download())
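The header-only-when-empty check above is a reusable pattern. A sketch of it as a standalone helper, assuming csv.writer defaults in place of the example's delimiter settings (append_row is a hypothetical name):

import csv
import os

def append_row(path, header, row):
    # write the header first only if the file is new or still empty
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        if need_header:
            writer.writerow(header)
        writer.writerow(row)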
Example #3
def etl_fact_macro_details(source_engine, target_engine):
    """fact_macro_details的etl主函数

    从235 tag_detail表etl到240 fact_macro_details表
    :param source_engine: 源数据库引擎
    :param target_engine: 目标数据库引擎
    """
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record(table='fact_macro_detail', record_path='rec.cfg')

    start_params = record.get_record()
    divisions = extract.std_divisions()

    for i in range(start_params['rounds']):
        start_id = start_params['update_id'] + i * start_params['chunksize'] + 1
        end_id = start_params['update_id'] + (
            i + 1) * start_params['chunksize'] + 1

        tag_details = extract.tag_details(start_id, end_id)
        if len(tag_details) == 0:
            continue
        macro_details = transform.compile_datasets(tag_details, divisions)
        load.loading(macro_details)
        max_id = tag_details['id'].max()
        update_id = max_id if max_id else start_params['update_id']
        record.update_record(update_id)
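The loop above derives each round's id window from update_id and chunksize. A small sketch of just that arithmetic (id_windows is a hypothetical helper), which makes it easy to verify that consecutive windows tile with no gaps or overlaps:

def id_windows(update_id, chunksize, rounds):
    # yield (start_id, end_id) pairs exactly as the loop above computes them
    for k in range(rounds):
        yield update_id + k * chunksize + 1, update_id + (k + 1) * chunksize + 1

# list(id_windows(0, 100, 2)) -> [(1, 101), (101, 201)]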
Example #4
def getPhrases():
    """
    """
    from extract import Extract

    ex = Extract()
    ex.run()
Example #5
def etl_fact_draw_main(engine_source, engine_target, chunksize=5000, record_file='etl_fact_draw.record'):
    """ETL for the drawing fact table.

    :param engine_source: source database engine
    :param engine_target: target database engine
    """
    extract = Extract(engine_source, chunksize, record_file)
    transform = Transform()
    load = Load(engine_target)
    # extract the datasets
    df_industry, df_draw_gen = extract.extract_main()
    logging.info('Extract datasets completed.')

    for k, df_draw in enumerate(df_draw_gen, 1):
        logging.info('Round %d, from obs. %d to obs. %d, start.' %
                     (k, (k - 1) * chunksize, k * chunksize))
        # clean and transform the data
        df_clean = transform.transform_main(df_industry, df_draw)
        logging.info('Round %d, data cleaning completed.' % k)

        try:
            load.load_main(df_clean)
            logging.info('Round %d, loading %d obs. succeeded.' % (k, len(df_clean)))
            with open(record_file,'w') as f:
                f.write(str(max(df_draw['id'])))
        except Exception as e:
            # record the keys of the failed chunk so it can be retried later
            df_clean[['drawGuid', 'marketGuid']].to_csv('unsucceeded_samples.csv', mode='a', index=False)
            logging.error('Round %d, %s' % (k, e))
            raise
Example #6
def test_extract_to_list_return_np_ndarray():
    image = np.zeros((300, 300, 3))
    extract = Extract()
    list_of_faces = extract.extract_face_to_list(image)

    # the test name promises an ndarray; the original if/else asserted whichever
    # type it happened to find, so it could never fail
    assert isinstance(list_of_faces[0], np.ndarray)
Example #7
	def parseType(self,child,lastNode,isPrivate,nodes):
		if child.tag == 'private_type_declaration' and self.extractPriv:
			return None
		recNode = Extract.getRecordNode(child)
		if recNode is None:
			element = Extract.getType(child,self.sourcefile)
		else:
			element = Extract.getRecord(child)
			self.parseRecursive(recNode,element['components'],child,isPrivate)
		return element
Example #8
    def __init__(self, dataSource, dataSet):
        # create an Extract object here to fetch data via its generic methods for API and CSV data sources
        extractObj = Extract()
        if dataSource == 'api':
            self.data = extractObj.getAPISData(dataSet)
            funcName = dataSource + dataSet

            # getattr function takes in function name of class and calls it.
            getattr(self, funcName)()
        else:
            print('Unknown Data Source!')
Example #9
	def __init__(self):
		self.logCPF = None
		self.logNumber = None
		self.extract = Extract()
		Connection.__init__(self)
		sql = 'create table if not exists db_client (CPF integer primary key, Name varchar(45) not null, Surname varchar(45) not null)'
		self.execute(sql)
		self.commit()
		sql = 'create table if not exists db_account (Number integer primary key, Name varchar(45) not null, Surname varchar(45) not null, CPF integer, Balance real not null, Password varchar(45) not null, Limits real not null)'
		self.execute(sql)
		self.commit()
Example #10
def etl_dimension_time(target_engine):
    """时间维度表主函数

    :param target_engine: 目标数据库引擎
    """
    extract = Extract()
    transform = Transform()
    load = Load(target_engine)

    full_time = extract.gen_full_time()
    time_table = transform.gen_date(full_time)
    load.loading(time_table)
Example #11
    def __init__(self, number, client, balance, password, limit=1000.0):
        '''
        DESCRIPTION:
            This class is used to register the account that the client will use at the bank.
        '''
        self._number = number
        self._holder = client
        self._balance = balance
        self._limit = limit
        self._password = password
        self._extract = Extract()
        Account._total_accounts += 1
Example #12
    def __init__(self, dataSource, dataSet):
        extractObj = Extract()

        if dataSource == 'api':
            self.data = extractObj.getApiData(dataSet)
            funcName = dataSource + dataSet
            # getattr looks up the method whose name matches and calls it.
            getattr(self, funcName)()

        elif dataSource == 'csv':
            self.data = extractObj.getCsvData(dataSet)
            funcName = dataSource + dataSet
            getattr(self, funcName)()
Example #13
    def get_subreddits_links_to_build_task(self):
        base_ = Base()
        extract_ = Extract()
        list_subreddits_data = base_.get_data_list_subreddits()
        downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
        urls = extract_.get_urls_for_all_subreddits(subreddits=list_subreddits_data,
                                                    start_date=self.st_dt, end_date=self.end_dt)
        if len(downloaded_subs) > 0:
            total = len(urls)  # measure progress against the full list, before the set difference
            urls = list(set(urls) - set(downloaded_subs))
            print("Already downloaded {} sub-reddits; yet to download {} sub-reddits".format(len(downloaded_subs), len(urls)))
            print("Completed {:.1f}%".format(100 * len(downloaded_subs) / total))

        return urls
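The resume logic above is easiest to get right as a pure function: compute the remaining work and the completion percentage in one place, with the percentage anchored to the full list rather than the shrunken one (remaining_urls is a hypothetical name):

def remaining_urls(urls, downloaded):
    # return the still-to-fetch URLs and the percentage already completed
    todo = list(set(urls) - set(downloaded))
    pct = 100.0 * (len(urls) - len(todo)) / len(urls) if urls else 100.0
    return todo, pct

# remaining_urls(['a', 'b', 'c', 'd'], ['b'])[1] -> 25.0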
Example #14
 def act_extract(self):
     if self.filepath_in is None or not os.path.exists(self.filepath_in):
         self.act_open()
     # initialize
     self.combobox.clear()
     self.combobox.addItem('原件')  # "original file"
     self.combobox.addItem('提取的文件')  # "extracted file"
     self.combobox.setCurrentIndex(1)
     #
     extract = Extract(self.filepath_in)
     extract.process(output_filepath=self.filepath_extract)
     #db['project_info'].set_db(extract.extract_project_infos())
     self.refresh_left_preview(self.filepath_extract)
Example #15
def etl_demension_division(target_engine):
    """division表的etl主函数

    从统计局爬取的标准csv表中抽取数据,载入到数据仓库
    :param target_engine:目标数据库引擎
    """
    extract = Extract()
    transform = Transform()
    load = Load(target_engine)
    logging.info('Initialize three instances')

    division_datasets = extract.std_divisions()
    std_districts = transform.std_districts(division_datasets)
    load.loading(std_districts)
Example #16
    def __init__(self, datasource, dataset):
        self.csv_df = pd.DataFrame()

        # create the Extract object
        extract_obj = Extract()

        if datasource == 'api':
            self.data = extract_obj.get_api_data(dataset)
            func_name = datasource + "_" + dataset
            getattr(self, func_name)()
        elif datasource == 'csv':
            self.data = extract_obj.get_csv_data(dataset)
            func_name = datasource + "_" + dataset
            getattr(self, func_name)()
        else:
            print('Unknown data source!!! Please try again...')
Example #17
def main():

    # ticker = 'GME' # Gamestop
    ticker = 'BTC-USD'  # Bitcoin
    # ticker = '^GSPC' # S&P 500 index

    start = datetime.date(2021, 1, 1)  # start looking at the stock
    ext = Extract(ticker=ticker, start=start)
    data = ext.read_data()
    ext.convert_csv(data)

    fst = Forest(ticker=ticker, start=start)
    fst.model_outliers()

    ans = Analysis(ticker=ticker, start=start)
    ans.plot_outliers()
Example #18
    def __init__(self, data_source, data_set):

        extract_obj = Extract()
        func_name = f"{data_source}_{data_set}"

        if data_source == 'api':
            self.data = extract_obj.get_api_data(data_set)

        elif data_source == 'csv':
            self.data = extract_obj.get_csv_data(data_set)

        else:
            print('Unknown Data Source')
            return

        # getattr function takes in function name of class and calls it.
        getattr(self, func_name)()
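Several of these constructors share the same trick: build a method name from the arguments and dispatch through getattr(self, func_name)(). A self-contained sketch of the pattern with a safe fallback for missing handlers (the Dispatcher class and its method names are illustrative):

class Dispatcher:
    def __init__(self, data_source, data_set):
        handler = getattr(self, f"{data_source}_{data_set}", None)
        if handler is None:
            raise ValueError(f"no handler for {data_source}/{data_set}")
        handler()  # call the method whose name matched

    def api_economy(self):
        print("transforming economy data fetched from the API")

# Dispatcher('api', 'economy') runs api_economy;
# Dispatcher('api', 'weather') raises ValueError instead of AttributeError.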
Example #19
def etl_fact_market(source_engine, target_engine, rec_path):

    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record(rec_path)  # use the path passed in rather than a hard-coded one

    start_params = record.get_record()
    # pull the worklists from the source; reconstructed to mirror Example #29,
    # since the original snippet iterated over an always-empty list
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    industry = extract.industry()
    has_dealed = []

    for i, sample_tag_counts in df_tag_counts.iterrows():

        grandParentId = sample_tag_counts['grandParentId']

        if len(grandParentId) != 36:  # check that grandParentId is a valid GUID
            logging.error('Round %d, %s is not valid.' % (i, grandParentId))
            continue

        elif grandParentId in done_market:  # skip market zones that have already been through ETL
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue

        if grandParentId in has_dealed:
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        else:
            has_dealed.append(grandParentId)

        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue

        rent = extract.rent_details(grandParentId)
        industry_tmp = industry[industry['grandParentId'] == grandParentId]
        # transform the data
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)
        # combine the data
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
Example #20
 def extract_descriptions(self, widget, device):
     print("extract xml descriptions", widget, device)
     from extract import Extract
     id = '@'.join((device.get_usn(), 'DeviceXMlExtract'))
     try:
         self.windows[id].show()
     except KeyError:
         # no cached window for this device yet; build one
         ui = Extract(device)
         self.windows[id] = ui.window
Example #21
    def __init__(self, dataSource, dataSet):

        # creating Extract class object here, to fetch data using its generic methods for APIS and CSV data sources
        extractObj = Extract()

        if dataSource == 'api':
            self.data = extractObj.getAPIsData(dataSet)
            funcName = dataSource + dataSet

            # getattr function takes in function name of class and calls it.
            getattr(self, funcName)()
        else:
            print('Unknown Data Source!!! Please try again...')

    # Economy Data Transformation
    def apiEconomy(self):  # defined at class level so getattr(self, funcName) can find it
        gdp_india = {}
        for record in self.data['records']:
            gdp = {}

            # taking out the yearly GDP value from the records
            gdp['GDP_in_rs_cr'] = int(
                record['gross_domestic_product_in_rs_cr_at_2004_05_prices']
            )
            gdp_india[record['financial_year']] = gdp
        gdp_india_yrs = list(gdp_india)

        for i in range(1, len(gdp_india_yrs)):
            key = 'GDP_Growth_' + gdp_india_yrs[i]
            # calculating GDP growth on a yearly basis
            gdp_india[gdp_india_yrs[i]][key] = round(
                ((gdp_india[gdp_india_yrs[i]]['GDP_in_rs_cr'] -
                  gdp_india[gdp_india_yrs[i - 1]]['GDP_in_rs_cr']) /
                 gdp_india[gdp_india_yrs[i - 1]]['GDP_in_rs_cr']) *
                100, 2)

        # connect to MongoDB and insert the data
        mongoDB_obj = MongoDB('GDP')
        mongoDB_obj.insert_into_db(gdp_india, 'India_GDP')
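The growth computation in apiEconomy is the standard year-over-year formula, growth = (gdp_t - gdp_{t-1}) / gdp_{t-1} * 100. The same arithmetic over a plain list, as a compact sketch (yoy_growth is a hypothetical helper):

def yoy_growth(values):
    # percent change between consecutive values, rounded to 2 decimals as above
    return [round((b - a) / a * 100, 2) for a, b in zip(values, values[1:])]

# yoy_growth([100, 110, 99]) -> [10.0, -10.0]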
Example #22
    def read_mail(self, view, attachment=False):
        """Read the latest mail
            @param view
             The view(fold) to access
            @param attachment
             Boolean, whether get attachment
            @return, dict
             Info of a mail
        """
        documents = self.get_documents(view)
        latest_document = documents[-1]
        extra_obj = Extract(latest_document)
        result = extra_obj.extract()
        if attachment:
            extra_obj.get_attachment()

        return result
Example #23
	def parseRecursive(self,node,elements,parent=None,isPrivate=False):
		lastNode = None
		
		i = 0
		for child in node:
			element = None
			has_body_declarative_items_ql = False
			if child.tag in ['procedure_body_declaration','function_body_declaration']:
				element = Extract.getFunction(child,lastNode,self.sourcefile)
			elif child.tag in ['function_declaration','generic_function_declaration','procedure_declaration','generic_procedure_declaration','single_task_declaration','task_type_declaration','protected_type_declaration','single_protected_declaration']:
				element = Extract.getFunctionHead(child,lastNode,self.sourcefile,self.extractPriv)
			elif child.tag in ['ordinary_type_declaration','subtype_declaration','private_type_declaration']:
				element = self.parseType(child,lastNode,isPrivate,node)
			elif child.tag in ['component_declaration']:
				element = Extract.getRecordComponent(child,self.sourcefile)
			elif child.tag == 'package_body_declaration':
				element = Extract.getPackage(child,self.prefixClass,lastNode)
			elif child.tag in ['generic_package_declaration','package_declaration']:
				element = self.parsePackage(child,lastNode,isPrivate)
			elif child.tag == 'package_renaming_declaration':
				element = Extract.getRename(child)
			elif child.tag in ['attribute_definition_clause','record_representation_clause','enumeration_representation_clause'] and self.hideRepClause is False:
				element = Extract.getRepClause(child,self.prefixRepClause,self.sourcefile)
			elif child.tag in ['import_pragma']:
				self.imports.append(Extract.getImport(child,self.sourcefile))
			elif child.tag not in ['implementation_defined_pragma']: 
				logging.info("Not parsed: "+child.tag)
				
			if element is not None:
				if parent is None: 
					c = Extract.getUnitComment(self.root)
				else: 
					c = Extract.getComment(node,i) 
				element['comment'] = c
				element['is_private'] = isPrivate
				element['is_extract'] = c != '' or self.extractAll or parent is None
				
				if 'uri' in element: self.elementsByUris[element['uri']] = element
				elements.append(element)
				if element['has_childs'] and child.find('body_declarative_items_ql') is not None:
					isPrivateSubLevel = isPrivate
					if element['type'] == 'function': 
						isPrivateSubLevel = True
					self.parseRecursive(child.find('body_declarative_items_ql'),element['childs'],child,isPrivateSubLevel)
	
			i+=1
			lastNode = child
Example #24
def main():
    logger.info("\t EXTRACT")
    # EXTRACT
    extraction = Extract(url, file_name, bucket_name, path)

    # download files
    extraction.download_url()

    # Upload to S3 (mocking)
    s3_obj = S3mocking(file_name, bucket_name)
    s3_obj.load_save()

    # TRANSFORM
    logger.info("\t TRANSFORM")
    pfn = Transform(path, file_name).main()

    # LOAD
    s3_load = S3mocking(pfn, bucket_name)
    logger.info("\t LOAD")
    s3_load.load_save()
Example #25
	def parsePackage(self,child,lastNode,isPrivate):
		
		element = Extract.getPackage(child,self.prefixClass,lastNode)
		element['has_childs'] = False
		element['public'] = []
		element['private'] = []
		if child.find('visible_part_declarative_items_ql') is not None:
			self.parseRecursive(child.find('visible_part_declarative_items_ql'),element['public'],child,isPrivate)
		if child.find('private_part_declarative_items_ql') is not None:
			self.parseRecursive(child.find('private_part_declarative_items_ql'),element['private'],child,True)
		return element
Example #26
def etl(game, extract_date, data_dir=DATA_DIR, db=load.DB_FILENAME):
    logger.info('Start ETL for game {0}'.format(game))
    load_date = datetime.today()
    data_dir = os.path.join(data_dir, game)
    if game == 'hb':
        trans_fun = Transform.hb_transform
    elif game == 'wwc':
        trans_fun = Transform.wwc_transform
    else:
        # fail fast instead of hitting a NameError on trans_fun below
        raise ValueError('unknown game: {0}'.format(game))

    data = Extract.extract(data_dir, extract_date)
    data = Transform.transform(data, trans_fun, extract_date, load_date)
    Load.load(data, game, db=db)
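The if/elif above selects the transform function; a table-driven alternative is a dict lookup, which fails fast on an unknown game and grows by one entry per game instead of one branch. A sketch with stand-in functions in place of the Transform methods:

def hb_transform(data):   # stand-in for Transform.hb_transform
    return data

def wwc_transform(data):  # stand-in for Transform.wwc_transform
    return data

TRANSFORM_FUNS = {'hb': hb_transform, 'wwc': wwc_transform}

def pick_transform(game):
    try:
        return TRANSFORM_FUNS[game]
    except KeyError:
        raise ValueError('unknown game: {0}'.format(game))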
Example #27
 def run_extraction(self):
     extract_ = Extract()
     base_ = Base()
     list_subreddits_data = base_.get_data_list_subreddits()
     downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
     if len(downloaded_subs) > 0:
         remaining_list = list(
             set(list_subreddits_data) - set(downloaded_subs))
         print(
             "Already downloaded {} sub-reddits; yet to download {} sub-reddits"
             .format(len(downloaded_subs), len(remaining_list)))
         print("Completed {:.1f}%".format(
             100 * len(downloaded_subs) / len(list_subreddits_data)))
         list_subreddits_data = remaining_list
     start_time = time.time()
     cost = 0
     for subreddit in list_subreddits_data:
         start_time, cost = extract_.start_extraction(subreddit=subreddit, start_date=self.st_dt, end_date=self.end_dt,
                                                      base_path=self.sav_path, start_time=start_time, total_cost=cost)
         print(cost)
         print(start_time)
Example #28
class BigGiant:
    def __init__(self, pkl_path, model_path):
        self.pkl_path = pkl_path
        self.model_path = model_path
        self.predict = Predict(self.pkl_path, self.model_path)
        self.extract = Extract()
        self.mtcnn = MTCNN()

    def extract_face_to_list(self, image):
        list_of_faces = self.extract.extract_face_to_list(image)
        return list_of_faces

    def predict_face(self, face):
        data = self.predict.predict_face(face)
        return data

    def read_image(self, image_path):
        image = self.extract.read_image(image_path)
        return image

    def aggregate_face_data(self, data, image_url):
        pass
Example #29
def etl_fact_market(*args):
    """Main function for the fact_market table.

    :param args: positional args engine_zone_macro, engine_draw, engine_target
    """
    # unpack the positional engines named in the docstring; the original body
    # referenced them without ever unpacking args
    engine_zone_macro, engine_draw, engine_target = args
    # initialize the extract, transform, and load objects
    extract = Extract(engine_zone_macro, engine_draw, engine_target)
    transform = Transform()
    load = Load(engine_target)

    # extract the market zones that have already been through ETL
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    df_industry = extract.industry()
    has_dealed = []

    for i, sample_tag_counts in df_tag_counts.iterrows():

        grandParentId = sample_tag_counts['grandParentId']
        if len(grandParentId) != 36:  # check that grandParentId is a valid GUID
            logging.warning('Round %d, %s is invalid, skipped.' %
                            (i, grandParentId))
            continue

        elif grandParentId in done_market:  # skip market zones that have already been through ETL
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue

        if grandParentId in has_dealed:
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        else:
            has_dealed.append(grandParentId)

        # extract the data
        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue
        rent = extract.rent_details(grandParentId)
        industry_tmp = df_industry[df_industry['grandParentId'] ==
                                   grandParentId]
        # transform the data
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)
        # combine the data
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)
        try:
            load.loading(clean)
            logging.info('Round %d, %s etl succeeded' % (i, grandParentId))
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
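Examples #19 and #29 both keep two bookkeeping collections (done_market and has_dealed) and probe them with "in", which is linear on a list. The same filtering collapses into one pass over a single set, where membership tests are O(1) (iter_new_markets is a hypothetical name):

def iter_new_markets(candidates, done_market):
    # skip malformed GUIDs and anything already processed, in one place
    seen = set(done_market)
    for guid in candidates:
        if len(guid) != 36 or guid in seen:
            continue
        seen.add(guid)
        yield guid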
Example #30
def market_to_api2(source, target, record_file='api2.record'):
    """Main ETL function for the api2 table of the anti_fraud database.

    :param source: source database engine
    :param target: target database engine
    :param record_file: file recording the last loaded id, defaults to api2.record
    """
    # initialize objects
    extract = Extract(source, target, record_file)
    transform = Transform()
    load = Load(target, record_file)

    # extract the data
    market_df = extract.market()
    draw_samples = extract.draw_samples()

    # transform the data
    reshaped_market = transform.reshape_market(market_df)
    aggregated_samples = transform.aggregate_from_samples(draw_samples)
    api2_df = transform.compile_dfs(reshaped_market, aggregated_samples)

    # load the data
    load.loading(api2_df)
Example #31
def MatchAll(dDir, mDir, appDomains, pathPrefix=None, URL_CLUSTER=False):
    d = Extract("Desktop", dDir, "*.json", appDomains)
    m = Extract("Mobile", mDir, "*.json", appDomains)

    config = {"domains": appDomains, "prefix":pathPrefix, "URL_CLUSTER":URL_CLUSTER}
    ActionRecognizer(d, m).setConfig(config).run()

    if utils.verbose:
        print("*" * 80)
        print("Labelled Desktop Traces")
        d.printLabeledTraces()
        print("*" * 80)
        print("Labelled Mobile Traces")
        m.printLabeledTraces()

    if URL_CLUSTER:
        mapping = MatchURLTraces(d, m)
    else:
        mapping = MatchTraces(d, m)
    return mapping  # the original dropped the mapping; hand it back to the caller