Example No. 1
def run_etl(filename):
    logger.info("ETL run started")
    start = time.time()
    extract = Extract()
    raw_data_list = extract.get_data_from_bucket(filename)  # extract raw data from the bucket
    end_extract = time.time()
    extract_time = round(end_extract - start, 4)
    print(f"Extract time: {extract_time}")
    logger.info(f"Extract time: {extract_time}")
    transform = Transform()
    transformed_data, transformed_drink_menu_data = transform.transform_new_data(raw_data_list)  # returns cleaned transactions and the drinks-menu dict

    end_transform = time.time()
    transform_time = round(end_transform - end_extract, 4)
    logger.info(f"Transform time: {transform_time}")
    print(f"Transform time: {transform_time}")
    load = Load()

    load.save_transaction(transformed_data)  # populate the RDS instance with cleaned data
    load.save_drink_menu(transformed_drink_menu_data)  # generate the drinks menu

    end_load = time.time()
    load_time = round(end_load - end_transform, 4)
    logger.info(f"Loading time: {load_time}")
    total_time = extract_time + transform_time + load_time
    logger.info(f"total time: {total_time}")
    print(f"Load time: {load_time}\nTotal time: {total_time}")
Example No. 2
    def __init__(self, infos, category):

        self.infos = infos
        self.category = category
        if not os.path.exists('data/category'):
            os.makedirs('data/category')

        with open('data/category/' + category + '.csv',
                  'a',
                  newline='',
                  encoding='utf-8-sig') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    delimiter=' ',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            if os.path.getsize("data/category/" + category + '.csv') == 0:
                spamwriter.writerow([
                    'product_page_url', 'universal_product_code', 'title',
                    'price_including_tax', 'price_excluding_tax',
                    'number_available', 'product_description', 'category',
                    'review_rating', 'image_url'
                ])

            spamwriter.writerow(self.infos)  # one cell per field, not the whole list in a single cell

        if not os.path.exists('data/images/' + category):
            os.makedirs('data/images/' + category)

        with open(
                'data/images/' + category + "/" +
                infos[2].translate({ord(i): None
                                    for i in '<>:"“\|?/*'}) + ".jpg",
                "wb") as f:
            a = Extract(infos[9])
            f.write(a.get_url_to_download())
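Note the fix on the data row above: csv.writer.writerow expects an iterable of cell values, so wrapping the already-list infos in another list would emit the whole record as one quoted cell. A standalone illustration of the difference (values are made up):

import csv
import io

infos = ['http://example.com/p1', '1234567890123', 'Some Title']
buf = io.StringIO()
writer = csv.writer(buf, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
writer.writerow([infos])  # one cell holding the list's repr, quoted
writer.writerow(infos)    # three cells, as intended
print(buf.getvalue())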
Example No. 3
def etl_fact_macro_details(source_engine, target_engine):
    """fact_macro_details的etl主函数

    从235 tag_detail表etl到240 fact_macro_details表
    :param source_engine: 源数据库引擎
    :param target_engine: 目标数据库引擎
    """
    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record(table='fact_macro_detail', record_path='rec.cfg')

    start_params = record.get_record()
    divisions = extract.std_divisions()

    for i in range(start_params['rounds']):
        start_id = start_params['update_id'] + i * start_params['chunksize'] + 1
        end_id = start_params['update_id'] + (i + 1) * start_params['chunksize'] + 1

        tag_details = extract.tag_details(start_id, end_id)
        if len(tag_details) == 0:
            continue
        macro_details = transform.compile_datasets(tag_details, divisions)
        load.loading(macro_details)
        max_id = tag_details['id'].max()
        update_id = max_id if max_id else start_params['update_id']
        record.update_record(update_id)
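The loop above walks the id space in fixed windows: round i covers ids from update_id + i*chunksize + 1 up to update_id + (i+1)*chunksize + 1. Consecutive windows share their boundary id, so tag_details presumably treats the range as half-open. A standalone check of the window arithmetic with made-up numbers:

update_id, chunksize, rounds = 1000, 500, 3
for i in range(rounds):
    start_id = update_id + i * chunksize + 1
    end_id = update_id + (i + 1) * chunksize + 1
    print(start_id, end_id)  # 1001 1501 / 1501 2001 / 2001 2501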
Example No. 4
def getPhrases():
    """
    """
    from extract import Extract

    ex = Extract()
    ex.run()
Example No. 5
def etl_fact_draw_main(engine_source, engine_target, chunksize=5000, record_file='etl_fact_draw.record'):
    """ETL for the draw fact table

    :param engine_source: source database engine
    :param engine_target: target database engine
    :param chunksize: number of rows extracted per chunk
    :param record_file: file recording the id of the last loaded row
    """
    extract = Extract(engine_source, chunksize, record_file)
    transform = Transform()
    load = Load(engine_target)
    # extract the datasets
    df_industry, df_draw_gen = extract.extract_main()
    logging.info('Extract datasets completed.')

    for k, df_draw in enumerate(df_draw_gen, 1):
        logging.info('Round %d, from obs. %d to obs. %d, start.' % \
                     (k, (k - 1) * chunksize, k * chunksize))
        # clean and transform the data
        df_clean = transform.transform_main(df_industry, df_draw)
        logging.info('Round %d, data cleaning completed.' % k)

        try:
            load.load_main(df_clean)
            logging.info('Round %d, loading %d obs. succeeded.' % (k, len(df_clean)))
            with open(record_file, 'w') as f:
                f.write(str(max(df_draw['id'])))
        except Exception as e:
            df_clean[['drawGuid', 'marketGuid']].to_csv('unsecceed_samples.csv', mode='a', index=False)
            logging.error('Round %d, %s' % (k, e))
            raise
Example No. 6
def test_extract_to_list_return_np_ndarray():
    image = np.zeros((300, 300, 3))
    extract = Extract()
    list_of_faces = extract.extract_face_to_list(image)

    # The original if/else asserted whichever type it happened to find,
    # so the test could never fail; assert the documented return type.
    assert isinstance(list_of_faces[0], np.ndarray)
Example No. 7
	def parseType(self,child,lastNode,isPrivate,nodes):
		if child.tag == 'private_type_declaration' and self.extractPriv:
			return None
		recNode = Extract.getRecordNode(child)
		if recNode is None:
			element = Extract.getType(child,self.sourcefile)
		else:
			element = Extract.getRecord(child)
			self.parseRecursive(recNode,element['components'],child,isPrivate)
		return element
Example No. 8
    def __init__(self, dataSource, dataSet):
        # create an Extract object to fetch data via its generic methods for API and CSV data sources
        extractObj = Extract()
        if dataSource == 'api':
            self.data = extractObj.getAPISData(dataSet)
            funcName = dataSource + dataSet

            # getattr looks the method up by name and calls it.
            getattr(self, funcName)()
        else:
            print('Unknown Data Source!')
Example No. 9
	def __init__(self):
		self.logCPF = None
		self.logNumber = None
		self.extract = Extract()
		Connection.__init__(self)
		sql = 'create table if not exists db_client (CPF integer primary key, Name varchar(45) not null, Surname varchar(45) not null)'
		self.execute(sql)
		self.commit()
		sql = 'create table if not exists db_account (Number integer primary key, Name varchar(45) not null, Surname varchar(45) not null, CPF integer, Balance real not null, Password varchar(45) not null, Limits real not null)'
		self.execute(sql)
		self.commit()
Example No. 10
def etl_dimension_time(target_engine):
    """时间维度表主函数

    :param target_engine: 目标数据库引擎
    """
    extract = Extract()
    transform = Transform()
    load = Load(target_engine)

    full_time = extract.gen_full_time()
    time_table = transform.gen_date(full_time)
    load.loading(time_table)
Example No. 11
    def __init__(self, number, client, balance, password, limit=1000.0):
        '''
        DESCRIPTION:
            This class is used to register the account that the client will use at the bank.
        '''
        self._number = number
        self._holder = client
        self._balance = balance
        self._limit = limit
        self._password = password
        self._extract = Extract()
        Account._total_accounts += 1
Example No. 12
    def __init__(self, dataSource, dataSet):
        extractObj = Extract()

        if dataSource == 'api':
            self.data = extractObj.getApiData(dataSet)
            funcName = dataSource + dataSet
            # getattr looks the method up by name and calls it.
            getattr(self, funcName)()

        elif dataSource == 'csv':
            self.data = extractObj.getCsvData(dataSet)
            funcName = dataSource + dataSet
            getattr(self, funcName)()
Example No. 13
    def get_subreddits_links_to_build_task(self):
        base_ = Base()
        extract_ = Extract()
        list_subreddits_data = base_.get_data_list_subreddits()
        downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
        urls = extract_.get_urls_for_all_subreddits(subreddits=list_subreddits_data,
                                                    start_date=self.st_dt, end_date=self.end_dt)
        if len(downloaded_subs) > 0:
            urls = list(set(urls) - set(downloaded_subs))
            print("Already downloaded {} sub-reddits, yet to download {} sub-reddits".format(len(downloaded_subs), len(urls)))
            # percent done is measured against the original total, not the remaining list
            print("Completed {:.1f}%".format(100 * len(downloaded_subs) / (len(downloaded_subs) + len(urls))))

        return urls
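Since urls has already been reduced to the remaining set by that point, the completion percentage has to be computed against the original total, downloaded + remaining, and scaled by 100. A quick check with made-up counts:

downloaded, remaining = 30, 70
total = downloaded + remaining
print("Completed {:.1f}%".format(100 * downloaded / total))  # Completed 30.0%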
Example No. 14
 def act_extract(self):
     if self.filepath_in is None or not os.path.exists(self.filepath_in):
         self.act_open()
     # initialize
     self.combobox.clear()
     self.combobox.addItem('原件')        # '原件' = "original file"
     self.combobox.addItem(r'提取的文件')  # '提取的文件' = "extracted file"
     self.combobox.setCurrentIndex(1)
     #
     extract = Extract(self.filepath_in)
     extract.process(output_filepath=self.filepath_extract)
     #db['project_info'].set_db(extract.extract_project_infos())
     self.refresh_left_preview(self.filepath_extract)
Example No. 15
def etl_demension_division(target_engine):
    """division表的etl主函数

    从统计局爬取的标准csv表中抽取数据,载入到数据仓库
    :param target_engine:目标数据库引擎
    """
    extract = Extract()
    transform = Transform()
    load = Load(target_engine)
    logging.info('Initialize three instances')

    division_datasets = extract.std_divisions()
    std_districts = transform.std_districts(division_datasets)
    load.loading(std_districts)
Example No. 16
    def __init__(self, datasource, dataset):
        self.csv_df = pd.DataFrame()

        # create the Extract object
        extract_obj = Extract()

        if datasource == 'api':
            self.data = extract_obj.get_api_data(dataset)
            func_name = datasource + "_" + dataset
            getattr(self, func_name)()
        elif datasource == 'csv':
            self.data = extract_obj.get_csv_data(dataset)
            func_name = datasource + "_" + dataset
            getattr(self, func_name)()
        else:
            print('Unknown data source!!! Please try again...')
Example No. 17
def main():

    # ticker = 'GME' # Gamestop
    ticker = 'BTC-USD'  # Bitcoin
    # ticker = '^GSPC' # S&P 500 index

    start = datetime.date(2021, 1, 1)  # start looking at the stock
    ext = Extract(ticker=ticker, start=start)
    data = ext.read_data()
    ext.convert_csv(data)

    fst = Forest(ticker=ticker, start=start)
    fst.model_outliers()

    ans = Analysis(ticker=ticker, start=start)
    ans.plot_outliers()
Example No. 18
    def __init__(self, data_source, data_set):

        extract_obj = Extract()
        func_name = f"{data_source}_{data_set}"

        if data_source == 'api':
            self.data = extract_obj.get_api_data(data_set)

        elif data_source == 'csv':
            self.data = extract_obj.get_csv_data(data_set)

        else:
            print('Unknown Data Source')
            return

        # getattr looks the method up by name and calls it.
        getattr(self, func_name)()
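Several of these constructors share the same dispatch idiom: build a method name from the (data_source, data_set) pair and call it via getattr, so supporting a new dataset only means defining a method with the matching name. A minimal self-contained sketch of the pattern (class and method names are illustrative):

class Pipeline:
    def __init__(self, data_source, data_set):
        func_name = f"{data_source}_{data_set}"
        # look the handler up by name and call it; raises
        # AttributeError if no matching method exists
        getattr(self, func_name)()

    def api_economy(self):
        print("transforming economy data from the API")

Pipeline('api', 'economy')  # prints: transforming economy data from the API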
Example No. 19
def etl_fact_market(source_engine, target_engine, rec_path):

    extract = Extract(source_engine, target_engine)
    transform = Transform()
    load = Load(target_engine)
    record = Record(rec_path)

    start_params = record.get_record()
    done_market = extract.done_market()   # markets already ETL'd (cf. Example No. 29)
    df_tag_counts = extract.tag_counts()
    industry = extract.industry()
    has_dealed = []

    for i, sample_tag_counts in df_tag_counts.iterrows():

        grandParentId = sample_tag_counts['grandParentId']
        if len(grandParentId) != 36:  # validate grandParentId
            logging.error('Round %d, %s is not valid.' % (i, grandParentId))
            continue

        elif grandParentId in done_market:  # skip market zones already ETL'd
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue

        if grandParentId in has_dealed:
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        else:
            has_dealed.append(grandParentId)

        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue

        rent = extract.rent_details(grandParentId)
        industry_tmp = industry[industry['grandParentId'] == grandParentId]
        # transform the data
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)
        # combine the data
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)
        try:
            load.loading(clean)
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
Example No. 20
 def extract_descriptions(self, widget, device):
     print "extract xml descriptions", widget, device
     from extract import Extract
     id = '@'.join((device.get_usn(), 'DeviceXMlExtract'))
     try:
         self.windows[id].show()
     except KeyError:  # window not created yet
         ui = Extract(device)
         self.windows[id] = ui.window
Example No. 21
    def __init__(self, dataSource, dataSet):

        # create an Extract object to fetch data via its generic methods for API and CSV data sources
        extractObj = Extract()

        if dataSource == 'api':
            self.data = extractObj.getAPIsData(dataSet)
            funcName = dataSource + dataSet

            # getattr looks the method up by name and calls it.
            getattr(self, funcName)()
        else:
            print('Unknown Data Source!!! Please try again...')

    # Economy Data Transformation

    def apiEconomy(self):
        gdp_india = {}
        for record in self.data['records']:
            gdp = {}

            # take the yearly GDP value out of the record
            gdp['GDP_in_rs_cr'] = int(
                record['gross_domestic_product_in_rs_cr_at_2004_05_prices']
            )
            gdp_india[record['financial_year']] = gdp
        gdp_india_yrs = list(gdp_india)

        for i in range(1, len(gdp_india_yrs)):
            key = 'GDP_Growth_' + gdp_india_yrs[i]
            # calculate year-over-year GDP growth in percent
            gdp_india[gdp_india_yrs[i]][key] = round(
                ((gdp_india[gdp_india_yrs[i]]['GDP_in_rs_cr'] -
                  gdp_india[gdp_india_yrs[i - 1]]['GDP_in_rs_cr']) /
                 gdp_india[gdp_india_yrs[i - 1]]['GDP_in_rs_cr']) *
                100, 2)

        # connect to MongoDB and insert the data
        mongoDB_obj = MongoDB('GDP')
        mongoDB_obj.insert_into_db(gdp_india, 'India_GDP')
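The growth figure stored by apiEconomy is plain year-over-year percent change, growth = (GDP_t - GDP_t-1) / GDP_t-1 * 100, rounded to two decimals. A quick numeric check with made-up figures:

gdp_prev, gdp_curr = 5_000_000, 5_400_000  # Rs cr, illustrative values
growth = round((gdp_curr - gdp_prev) / gdp_prev * 100, 2)
print(growth)  # 8.0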
Example No. 22
    def read_mail(self, view, attachment=False):
        """Read the latest mail
            @param view
             The view (folder) to access
            @param attachment
             Boolean, whether to fetch the attachment
            @return dict
             Info of a mail
        """
        documents = self.get_documents(view)
        latest_document = documents[-1]
        extra_obj = Extract(latest_document)
        result = extra_obj.extract()
        if attachment:
            extra_obj.get_attachment()

        return result
Example No. 23
	def parseRecursive(self,node,elements,parent=None,isPrivate=False):
		lastNode = None
		
		i = 0
		for child in node:
			element = None
			has_body_declarative_items_ql = False
			if child.tag in ['procedure_body_declaration','function_body_declaration']:
				element = Extract.getFunction(child,lastNode,self.sourcefile)
			elif child.tag in ['function_declaration','generic_function_declaration','procedure_declaration','generic_procedure_declaration','single_task_declaration','task_type_declaration','protected_type_declaration','single_protected_declaration']:
				element = Extract.getFunctionHead(child,lastNode,self.sourcefile,self.extractPriv)
			elif child.tag in ['ordinary_type_declaration','subtype_declaration','private_type_declaration']:
				element = self.parseType(child,lastNode,isPrivate,node)
			elif child.tag in ['component_declaration']:
				element = Extract.getRecordComponent(child,self.sourcefile)
			elif child.tag == 'package_body_declaration':
				element = Extract.getPackage(child,self.prefixClass,lastNode)
			elif child.tag in ['generic_package_declaration','package_declaration']:
				element = self.parsePackage(child,lastNode,isPrivate)
			elif child.tag == 'package_renaming_declaration':
				element = Extract.getRename(child)
			elif child.tag in ['attribute_definition_clause','record_representation_clause','enumeration_representation_clause'] and self.hideRepClause is False:
				element = Extract.getRepClause(child,self.prefixRepClause,self.sourcefile)
			elif child.tag in ['import_pragma']:
				self.imports.append(Extract.getImport(child,self.sourcefile))
			elif child.tag not in ['implementation_defined_pragma']: 
				logging.info("Not parsed: "+child.tag)
				
			if element is not None:
				if parent is None: 
					c = Extract.getUnitComment(self.root)
				else: 
					c = Extract.getComment(node,i) 
				element['comment'] = c
				element['is_private'] = isPrivate
				element['is_extract'] = c != '' or self.extractAll or parent is None
				
				if 'uri' in element: self.elementsByUris[element['uri']] = element
				elements.append(element)
				if element['has_childs'] and child.find('body_declarative_items_ql') is not None:
					isPrivateSubLevel = isPrivate
					if element['type'] == 'function': 
						isPrivateSubLevel = True
					self.parseRecursive(child.find('body_declarative_items_ql'),element['childs'],child,isPrivateSubLevel)
	
			i+=1
			lastNode = child
Example No. 24
def main():
    logger.info("\t EXTRACT")
    # EXTRACT
    extraction = Extract(url, file_name, bucket_name, path)

    # download files
    extraction.download_url()

    # Upload to S3 (mocking)
    s3_obj = S3mocking(file_name, bucket_name)
    s3_obj.load_save()

    # TRANSFORM
    logger.info("\t TRANSFORM")
    pfn = Transform(path, file_name).main()

    # LOAD
    s3_load = S3mocking(pfn, bucket_name)
    logger.info("\t LOAD")
    s3_load.load_save()
Example No. 25
	def parsePackage(self,child,lastNode,isPrivate):
		
		element = Extract.getPackage(child,self.prefixClass,lastNode)
		element['has_childs'] = False
		element['public'] = []
		element['private'] = []
		if child.find('visible_part_declarative_items_ql') is not None:
			self.parseRecursive(child.find('visible_part_declarative_items_ql'),element['public'],child,isPrivate)
		if child.find('private_part_declarative_items_ql') is not None:
			self.parseRecursive(child.find('private_part_declarative_items_ql'),element['private'],child,True)
		return element
Example No. 26
def etl(game, extract_date, data_dir=DATA_DIR, db=load.DB_FILENAME):
    logger.info('Start ETL for game {0}'.format(game))
    load_date = datetime.today()
    data_dir = os.path.join(data_dir, game)
    if game == 'hb':
        trans_fun = Transform.hb_transform
    elif game == 'wwc':
        trans_fun = Transform.wwc_transform
    else:
        # without this branch trans_fun would be unbound for unknown games
        raise ValueError('Unknown game: {0}'.format(game))

    data = Extract.extract(data_dir, extract_date)
    data = Transform.transform(data, trans_fun, extract_date, load_date)
    Load.load(data, game, db=db)
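The if/elif selection above passes a function as a value into Transform.transform. A dict lookup expresses the same game-to-transform mapping more compactly and fails loudly on unknown keys; a sketch with stand-in transform functions (the real ones live on the Transform class above):

def hb_transform(data):
    return data  # stand-in for Transform.hb_transform

def wwc_transform(data):
    return data  # stand-in for Transform.wwc_transform

TRANSFORMS = {'hb': hb_transform, 'wwc': wwc_transform}

def pick_transform(game):
    try:
        return TRANSFORMS[game]
    except KeyError:
        raise ValueError('Unknown game: {!r}'.format(game))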
Example No. 27
 def run_extraction(self):
     extract_ = Extract()
     base_ = Base()
     list_subreddits_data = base_.get_data_list_subreddits()
     downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
     if len(downloaded_subs) > 0:
         remaining_list = list(
             set(list_subreddits_data) - set(downloaded_subs))
         print(
             "Already downloaded {} sub-reddits, yet to download {} sub-reddits"
             .format(len(downloaded_subs), len(remaining_list)))
         print("Completed {:.1f}%".format(
             100 * len(downloaded_subs) / len(list_subreddits_data)))
         list_subreddits_data = remaining_list
     start_time = time.time()
     cost = 0
     for subreddit in list_subreddits_data:
         start_time, cost = extract_.start_extraction(subreddit=subreddit, start_date=self.st_dt, end_date=self.end_dt, \
             base_path=self.sav_path, start_time=start_time, total_cost=cost)
         print(cost)
         print(start_time)
Example No. 28
class BigGiant:
    def __init__(self, pkl_path, model_path):
        self.pkl_path = pkl_path
        self.model_path = model_path
        self.predict = Predict(self.pkl_path, self.model_path)
        self.extract = Extract()
        self.mtcnn = MTCNN()

    def extract_face_to_list(self, image):
        list_of_faces = self.extract.extract_face_to_list(image)
        return list_of_faces

    def predict_face(self, face):
        data = self.predict.predict_face(face)
        return data

    def read_image(self, image_path):
        image = self.extract.read_image(image_path)
        return image

    def aggregate_face_data(self, data, image_url):
        pass
Example No. 29
def etl_fact_market(engine_zone_macro, engine_draw, engine_target):
    """Main function for the fact_market table

    :param engine_zone_macro: zone/macro source database engine
    :param engine_draw: draw source database engine
    :param engine_target: target database engine
    """
    # initialize the extract, transform and load objects
    extract = Extract(engine_zone_macro, engine_draw, engine_target)
    transform = Transform()
    load = Load(engine_target)

    # fetch the market zones that have already been ETL'd
    done_market = extract.done_market()
    df_tag_counts = extract.tag_counts()
    df_industry = extract.industry()
    has_dealed = []

    for i, sample_tag_counts in df_tag_counts.iterrows():

        grandParentId = sample_tag_counts['grandParentId']
        if len(grandParentId) != 36:  # validate grandParentId
            logging.warning('Round %d, %s is invalid, skipped.' %
                            (i, grandParentId))
            continue

        elif grandParentId in done_market:  # skip market zones already ETL'd
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue

        if grandParentId in has_dealed:
            logging.warning('Round %d, %s etl before' % (i, grandParentId))
            continue
        else:
            has_dealed.append(grandParentId)

        # extract the data
        zone_grandparent = extract.zone_grandparent(grandParentId)
        if len(zone_grandparent) == 0:
            logging.warning('Round %d, has no draw samples' % i)
            continue
        rent = extract.rent_details(grandParentId)
        industry_tmp = df_industry[df_industry['grandParentId'] ==
                                   grandParentId]
        # transform the data
        rent = transform.rent_calculate(rent)
        industry_dict = transform.reshape_industry(industry_tmp)
        # combine the data
        clean = transform.compile_dfs(sample_tag_counts, rent, industry_dict,
                                      zone_grandparent)
        try:
            load.loading(clean)
            logging.info('Round %d, %s etl succeeded' % (i, grandParentId))
        except Exception as e:
            logging.error('Round %d, %s' % (i, e))
Example No. 30
def market_to_api2(source, target, record_file='api2.record'):
    """anti_fraud数据库api2表的etl主函数
    
    :param source: 源数据库引擎
    :param target: 目标数据库引擎
    :param record_file: 负责记录装载id的文件名,默认为 app2.record
    """
    # initialize the objects
    extract = Extract(source, target, record_file)
    transform = Transform()
    load = Load(target, record_file)

    # extract the data
    market_df = extract.market()
    draw_samples = extract.draw_samples()

    # transform the data
    reshaped_market = transform.reshape_market(market_df)
    aggregated_samples = transform.aggregate_from_samples(draw_samples)
    api2_df = transform.compile_dfs(reshaped_market, aggregated_samples)

    # load the data
    load.loading(api2_df)
Example No. 31
def MatchAll(dDir, mDir, appDomains, pathPrefix=None, URL_CLUSTER=False):
    d = Extract("Desktop", dDir, "*.json", appDomains)
    m = Extract("Mobile", mDir, "*.json", appDomains)

    config = {"domains": appDomains, "prefix":pathPrefix, "URL_CLUSTER":URL_CLUSTER}
    ActionRecognizer(d, m).setConfig(config).run()

    if utils.verbose:
        print("*" * 80)
        print("Labelled Desktop Traces")
        d.printLabeledTraces()
        print("*" * 80)
        print("Labelled Mobile Traces")
        m.printLabeledTraces()

    if URL_CLUSTER:
        mapping = MatchURLTraces(d, m)
    else:
        mapping = MatchTraces(d, m)
    return mapping