def setUp(self): """ Start Spark, define config and path to test data """ self.config = json.loads("""{"steps_per_floor": 21}""") self.spark, *_ = start_spark(app_name='nlp_clause_test') self.test_data_path = 'tests/test_data/segment_test'
def setUp(self): """Start Spark, define config and path to test data """ self.config = json.loads("""{"Max_Temp_": 21}""") self.spark, *_ = start_spark() self.test_data_path = ( '/Users/LRK/project-folder/GreenFlag/sparkjob/test_data/')
def main(): """Main ETL script definition. """ # start Spark application and get Spark session, logger and config spark, log, config, sc = start_spark( app_name='analysis', files=['configs/etl_config.json']) log.warn('***analysis is up-and-running***') # load data df = load(spark, config["start_date"], config["stop_date"], config["folder"]) log.warn('***data loaded***') # daily tasks if config["daily"]: df_visit_per_hour = visit_per_hour(df, config["stop_date"]) save(df_visit_per_hour, 'out/visit_per_hour', config["stop_date"]) df_visitor_per_hour = visitor_per_hour(df, config["stop_date"]) save(df_visitor_per_hour, 'out/visitor_per_hour', config["stop_date"]) df_referral_path = referral_path(df, sc, config["stop_date"]) save_json(df_referral_path, 'out/referral_path', config["stop_date"]) # monthly tasks if config["monthly"]: df_hourly_visit_pattern = hourly_visit_pattern(df, config["stop_date"]) save(df_hourly_visit_pattern, 'out/hourly_visit_pattern', config["stop_date"]) df_popular_os = popular_os(df, config["stop_date"]) save(df_popular_os, 'out/popular_os', config["stop_date"]) df_popular_browser = popular_browser(df, config["stop_date"]) save(df_popular_browser, 'out/popular_browser', config["stop_date"]) df_country_dist = country_dist(df, config["stop_date"]) save(df_country_dist, 'out/country_dist', config["stop_date"]) df_average_visit_duration = average_visit_duration(df, config["stop_date"]) save(df_average_visit_duration, 'out/average_visit_duration', config["stop_date"]) df_popular_page = popular_page(df, config["stop_date"]) save(df_popular_page, 'out/popular_page', config["stop_date"])
def main(): """Main ETL script definition. :return: None """ # start Spark application and get Spark session, logger and config spark, log, config, environment = start_spark( app_name='my_etl_job', files=['configs/etl_config.json', 'configs/transformation.sql']) # log that main ETL job is starting log.warn('etl_job is up-and-running') # Create ETL Components try: tasks = [ Extract(config['extract']), Transform(config['transform']), Load(config['load']), Impala(config['impala']) ] except KeyError as e: print("Some component missing: " + repr(e)) Executor(spark, log, tasks, environment).run() # log the success and terminate Spark application log.warn('etl_job is finished') spark.stop() return None
def main(): """Main ETL script definition. :return: None """ job_name = sys.argv[1] # start Spark application and get Spark session, logger and config spark, log, config = start_spark(app_name=job_name) # log that main ETL job is starting log.warn('%s is up-and-running' % job_name) # execute ETL pipeline data = extract_data(spark, config['data_source']) #dynamically load transformations from settings data = transform_data(data, config['transformations'], log) #data_transformed = transform_data(data, config['steps_per_floor']) load_data(data, config['data_output']) # log the success and terminate Spark application log.warn('%s is finished' % job_name) spark.stop() return None
def main():
    input_path = 'E:\\tmp\\game_csv'
    output_path = 'E:\\tmp\\output'

    spark_session, log, config = start_spark(
        app_name='nlp_tokenization',
        files=['./configs/file_list_config.json'])

    data_frame = load_data(spark_session, input_path)
    writer_csv(transform_data(data_frame), output_path)
def main(begin_date, end_date):
    """Spark job for Brand Xuzhang Shoubai (mobile Baidu) user analysis."""
    spark, log, config = start_spark(
        app_name="brand_xuzhang_gen_pv_data_%s_%s" % (begin_date, end_date),
        master='yarn',
        spark_config={
            "spark.yarn.queue": "brand",
            "spark.shuffle.dce.enable": "true",
            "spark.executor.memory": "8g",
            "spark.executor.cores": 1,
            "spark.executor.instances": 500,
            "spark.default.parallelism": 1000,
            "spark.sql.shuffle.partitions": 1000,
        })

    output_path = ("/app/ecom/brand/majian06/moirai/gen_pv_data/%s-%s"
                   % (begin_date, end_date))

    # execute ETL pipeline
    log.warn('job etl is up-and-running')
    data = extract_data(spark)
    data_transformed = transform_data(spark, data)
    load_data(data_transformed, output_path)

    # log the success and terminate Spark application
    log.warn('job etl is finished')
    spark.stop()
    return None
def main():
    spark, config = start_spark(app_name='my_etl_job',
                                files=['Config/etl_config.json'])

    path = config['file']['load']['path']
    data = extract(spark, path)
    transform_data = transform(data)
    load(transform_data)
def main(): """Main analysis script definition. :return: None """ # start Spark application and get Spark session, logger and config spark, log, config = start_spark( app_name='bcg_case_study', files=['configs/case_study_config.json']) # log that main Analysis job is starting log.warn('bcg_case_study_job started running') # Execute config queries primary_person_path = "..\\..\\Data\\Primary_Person_use.csv" # primary_person_path = config['primary_person_csv_path'] primary_person_df = extract(spark, primary_person_path, log) units_path = "..\\..\\Data\\Units_use.csv" # units_path = config['units_csv_path'] units_df = extract(spark, units_path, log) damages_path = "..\\..\\Data\\Damages_use.csv" # damages_path = config['damages_csv_path'] damages_df = extract(spark, damages_path, log) charges_path = "..\\..\\Data\\Charges_use.csv" # charges_path = config['charges_csv_path'] charges_df = extract(spark, charges_path, log) # ANALYSIS - 1 analysis_1(primary_person_df, log) # ANALYSIS - 2 analysis_2(units_df, log) # ANALYSIS - 3 analysis_3(units_df, primary_person_df, log) # ANALYSIS - 4 analysis_4(units_df, log) # ANALYSIS - 5 analysis_5(units_df, primary_person_df, log) # ANALYSIS - 6 analysis_6(units_df, primary_person_df, log) # ANALYSIS - 7 analysis_7(units_df, damages_df, log) # ANALYSIS - 8 analysis_8(units_df, charges_df, primary_person_df, log) # Log the success and terminate Spark application log.warn('bcg_case_study job is finished') spark.stop() return None
def main(): """Main ETL script definition. :return: None """ # start Spark application and get Spark session, logger and config spark, log, config = start_spark(app_name='my_etl_job', files=['configs/etl_config.json']) # log that main ETL job is starting log.warn('etl_job is up-and-running') # execute Today_Load ETL url = 'tests/test_data/energy/NOP_LOAD_FORECAST_20180214_04_input.csv' df_NOP_0214_04 = extract_data_csv(spark, url) groupbyList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"] targetColumn = "NOP" resultColumnName = "TODAY_LOAD" df_NOP_0214_04_GB = groupby_data(df_NOP_0214_04, groupbyList, targetColumn, resultColumnName) #df_NOP_0214_04_GB.show() #execute Prev_Day_Load ETL url = 'tests/test_data/energy/NOP_LOAD_FORECAST_20180213_11_input.csv' df_NOP_0213_11 = extract_data_csv(spark, url) groupbyList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"] targetColumn = "NOP" resultColumnName = "PREV_DAY_LOAD" df_NOP_0213_11_GB = groupby_data(df_NOP_0213_11, groupbyList, targetColumn, resultColumnName) #execute Hour_Load ETL url = 'tests/test_data/energy/LFG_ST_Hourly_20180213_input.csv' df_LFG_0213 = extract_data_csv(spark, url) groupbyList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"] sumList = ["UNADJ_LOAD", "DISTRIB_LOSS_LOAD", "TRANSMISSION_LOSS_LOAD"] resultColumnName = "ADJ_LOAD" df_LFG_0213_GB = groupby_agg_data(df_LFG_0213, groupbyList, sumList, resultColumnName) #Join three DataFrames joinList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"] df_join = join_data(df_NOP_0214_04_GB, df_NOP_0213_11_GB, joinList, 'left') df_join_three = join_data(df_join, df_LFG_0213_GB, joinList, 'left') output = order_data(df_join_three, joinList) #Write output to output.csv load_data(output) # log the success and terminate Spark application log.warn('test_etl_job is finished') spark.stop() return None
def setUp(self): """Start Spark, define config and path to test data """ print(os.getcwd()) print(os.listdir()) self.config = json.loads("""{"steps_per_floor": 21}""") self.spark, *_ = start_spark() self.test_data_path = 'tests/test_data/'
def run_test():
    """Running test function

    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name='my_etl_test_job',
                                     files=['configs/etl_config.json'])

    create_test_data(spark)
    spark.stop()
    return None
def main():
    spark_session, log, config = start_spark(
        app_name='nlp_clause',
        files=['./configs/sentence_spilt_config.json'])

    # for local testing:
    # input_file = 'E:/tmp/review_csv/output'
    # output_file = 'E:/tmp/output_review'
    # data = load_data(spark_session, input_file)
    # data_transform = transform_data(data)

    data = load_data(spark_session, config['input_path'])
    data_transform = transform_data(data)
    writer_csv(data_transform, config['output_path'])
def setUp(self): """Start Spark, define config and path to test data """ self.config = json.loads("""{ "start_date": "20160801", "stop_date": "20160804", "daily": true, "monthly": true, "folder": "tests/test_data/" }""") self.spark, _, _, self.sc = start_spark() self.test_data_path = self.config["folder"] self.input_data = load(self.spark, self.config["start_date"], self.config["stop_date"], self.test_data_path + "ga/")
def main(): """Main ETL script definition. :return: None """ parser = argparse.ArgumentParser() parser.add_argument('--ftp_user', dest='ftp_user', help='FTP user name') parser.add_argument('--ftp_password', dest='ftp_password', help='FTP Password') parser.add_argument('--api_key', dest='api_key', help='Google Maps API Key') known_args = parser.parse_args() # start Spark application and get Spark session, logger and config spark, log, config = start_spark(app_name='my_etl_job', files=['configs/etl_config.json']) # log that main ETL job is starting log.warn('etl_job is up-and-running') # execute ETL pipeline data = extract_data_module(spark) # extract_data(spark) customer_data = read_from_postgres(spark, "localhost", "golang_user", "go", "customer_price_list") print(customer_data.show()) print(data.show()) data_transformed = transform_data(data, config['steps_per_floor']) load_data(data_transformed) """Start the geocoding portion """ address1 = Row(id='123456', address='14 Maryland Blvd Toronto') address2 = Row(id='789012', address='Suite 2300 100 Wellington St West Toronto') address3 = Row(id='345678', address='10 Bay Street Toronto') address4 = Row(id='901234', address='373 Glebeholme Blvd Toronto') addresses = [address1, address2, address3, address4] address_df = spark.createDataFrame(addresses) geo_enriched_data = address_df.withColumn( "PlaceID", geocode_address_udf(col("address"), lit(known_args.api_key))) print(geo_enriched_data.show()) file_name = get_chd_file(known_args.ftp_user, known_args.ftp_password) print(file_name) # log the success and terminate Spark application log.warn('test_etl_job is finished') spark.stop() return None
def main(): """Main ETL script definition. :return: None """ parser = argparse.ArgumentParser() parser.add_argument('--sf_user', dest='sf_user', help='SF user') parser.add_argument('--sf_password', dest='sf_password', help='SF password') parser.add_argument('--sf_token', dest='sf_token', help='SF token') known_args = parser.parse_args() # start Spark application and get Spark session, logger and config spark, log, config = start_spark(app_name='my_etl_job', files=['configs/etl_config.json']) # log that main ETL job is starting log.warn('etl_job is up-and-running') sf_user = known_args.sf_user sf_password = known_args.sf_password salesforce = Salesforce(username=sf_user, password=sf_password, security_token='') # query = "select id, name, annual_sales__c,average_check__c, chain_id__c, chain_name__c, chd_id__c, confidence_level__c, county__c,credit_rating__c,dayparts__c ,dma_name__c ,group_health_system__c ,hours__c ,location_name__c ,menu_items__c ,msa_name__c ,number_of_employees__c ,number_of_rooms__c ,number_of_seats__c ,operation_type__c ,parent_id__c,phone ,units__c ,website,years_in_business__c,yelp_url__c,chd_chain_id__c,Google_Place_ID__c,Qualification_Status__c,current_month_match__c,CustomerStatus__c, ShippingCity,ShippingLatitude,ShippingLongitude,ShippingPostalCode,ShippingState,ShippingStreet,market_segment_list__c,Current_Chd_Name__c, Data_Update_Case__c, exclude_from_chd_match__c, Current_Chd_Shipping_Street__c, Current_Chd_Shipping_City__c, Current_Chd_Shipping_State__c,Current_Chd_Shipping_Postal_Code__c from Account" query = "SELECT id, chd_id__c, Google_Place_ID__c from Account" #Google_Place_ID__c # accounts_spark = get_sf_df(query, salesforce, spark) query_to_geocode = "SELECT id, Name, ShippingAddress from Account where chd_id__c = null AND google_place_id__c = null" accounts_to_geocode_list = salesforce.query_all(query_to_geocode) accounts_to_geocode_records = accounts_to_geocode_list['records'] accounts_to_geocode_pdf = pd.DataFrame(accounts_to_geocode_records) # accounts_to_geocode_pdf = convert_simple_salesforce_ordered_dictionary_to_pandas_dataframe(accounts_to_geocode_records) accounts_to_geocode_pdf['parsed_address'] = accounts_to_geocode_pdf[ 'ShippingAddress'].apply(lambda x: json.dumps(x)) accounts_to_geocode_pdf = accounts_to_geocode_pdf.drop( ["attributes", "ShippingAddress"], axis=1) accounts_to_geocode_spark = spark.createDataFrame(accounts_to_geocode_pdf) accounts_to_geocode_spark.printSchema() print(accounts_to_geocode_spark.count()) accounts_to_geocode_spark.write.parquet('tests/chd/sf_accounts_to_geocode', mode='overwrite') accounts = salesforce.query_all(query) accounts_pandas = pd.DataFrame(accounts['records']) accounts_spark = spark.createDataFrame(accounts_pandas) accounts_spark.printSchema() accounts_spark.write.parquet('tests/chd/sf_accounts', mode='overwrite')
def main():
    # gameid = ['69698', '5151', '60187', '47330', '54928', '10497', '12492',
    #           '55307', '2301', '70056', '50500', '74870', '34768', '35141',
    #           '91972', '6922', '69383', '85118', '85452', '69411', '85552',
    #           '31074', '69405', '70215', '59520', '66187', '10056', '85846',
    #           '33973', '71417']
    # input_path = 'E:\\tmp\\review_csv'
    # input_path = 'E:\\tmp\\csv_test'
    # output_path = 'E:\\tmp\\output_review'
    spark_session, log, config = start_spark(
        app_name='nlp_tokenization',
        files=['./configs/file_list_config.json'])

    gameid = ('69698', '5151', '60187', '47330', '54928', '10497', '12492',
              '55307', '2301', '70056', '50500', '74870', '34768', '35141',
              '91972', '6922', '69383', '85118', '85452', '69411', '62422',
              '31074', '69405', '70215', '59520', '66187', '10056', '85846',
              '33973', '71417')

    data_frame = load_data(spark_session, config[''], str(gameid))
    writer_csv(data_frame, config[''], id.strip())
def main():
    spark, log, config = start_spark(app_name='my_spark_app',
                                     files=['configs/etl_config.json'])

    log.warn('etl job is up and running')

    # execute ETL pipeline
    data = extract_data(spark)
    data_transformed = transform_data(data, config['steps_per_floor_'])
    load_data(data_transformed)

    log.warn('test etl job finished')
    spark.stop()
    return None
def main(): """ :return: """ user_dict = 'user_dict.txt' user_dict_path = resolve_filename(get_module_res(user_dict)) print(get_module_res(user_dict).name) jieba.load_userdict(user_dict_path) stop_path = resolve_filename(get_module_res('stop_word.txt')) spark_session, log, config = start_spark( app_name='nlp_tokenization', files=['configs/file_list_config.json']) stop_words = stop_word(spark_session, stop_path) # input_path=config['file_input'], out_put=config['file_output'] participle(session=spark_session, stop_words=stop_words)
def main(): spark, sc = start_spark(app_name="PySpark - AMRDEF", config='localhost') # job-translate-amrdef data = translator( "file:////home/ivan/Documents/Primestone/Esquemas/AMRDEF_sample_modif_flags_actualdates.xml", spark) print("\n" * 10, "data translation done", "\n" * 10) # job-enrich data = enricher(data, spark) print("\n" * 10, "data enrichment done", "\n" * 10) # job-clean data = cleaner(data, spark) print("\n" * 10, "data cleaning done", "\n" * 10)
def main(): """Main ETL script definition. :return: None """ # start Spark application and get Spark session, logger, config and audit spark, logger, config_dict, audit = start_spark( app_name='PHM_alinity_i_205_results' ) # log that main ETL job is starting logger.info('etl_job is up-and-running') # execute ETL pipeline data = extract_data(spark, config_dict) data_transformed = transform_data(data) partition_list = data_transformed.agg(collect_set('transaction_date')).collect()[0][0] logger.debug("Patitions to dedup: ") logger.debug(str(partition_list)) cleansed_bucket = config_dict['etl_cleansed_bucket'] cleansed_key_prefix = config_dict['etl_cleansed_key'] + "/transaction_date=" s3_utils = S3Utils(spark, config_dict) paths_list = [] for partition_suffix in partition_list: if s3_utils.is_key_prefix_empty(cleansed_bucket, cleansed_key_prefix + str(partition_suffix)): paths_list.extend(config_dict['s3a_prefix'] + cleansed_bucket + cleansed_key_prefix + str(partition_suffix) + "/*") logger.debug("Patitions to dedup: " + str(paths_list)) if paths_list: data_cleansed = spark.read.format("parquet").load(paths_list) data_deduped = deduplicate_data(data_transformed, data_cleansed) load_data(data_deduped, config_dict['s3a_prefix'] + config_dict['etl_cleansed_bucket'] + "/" + config_dict['etl_cleansed_key']) else: load_data(data_transformed, config_dict['s3a_prefix'] + config_dict['etl_cleansed_bucket'] + "/" + config_dict['etl_cleansed_key']) # log the success and terminate Spark application logger.info('test_etl_job is finished') stop_spark(spark, config_dict, audit) return None
def main():
    spark, sql_context, log, config = start_spark(
        app_name='radiography_analysis',
        files=['configs/radiography_analysis_config.json'])

    log.warn('Running radiography analysis...')

    # extracting and transforming the dataset
    [data_normal, data_covid19, data_lung_opacity, data_viral_pneumonia] = \
        extract_data(spark)
    data_initial = transform_data(data_normal, data_covid19,
                                  data_lung_opacity, data_viral_pneumonia,
                                  sql_context)

    # percentage of samples (different categories)
    data_transformed = transform_percentage_of_samples(data_initial)
    load_data(data_transformed, "percentage_of_samples")

    # take one sample of each group
    data_transformed = transform_take_samples(data_initial)
    load_data(data_transformed, "take_samples")

    # colour distribution
    data_transformed = transform_colour_distribution(data_initial)
    load_data(data_transformed, "colour_distribution")

    # ML classification (distributed)
    data_transformed = transform_ml_classification(data_initial, spark)
    load_data(data_transformed, "ml_classification")
    # The trained model is available in -> /keras_model

    # DL model compiling/training (not distributed)
    # [data_transformed_matrix, data_transformed_acc] = transform_dl_classification(data_initial, spark)
    # load_data(data_transformed_matrix, "dl_classification_matrix")
    # load_data(data_transformed_acc, "dl_classification_accuracy")

    # DL model inference (distributed)
    data_transformed = transform_dl_model_inference(data_initial)
    load_data(data_transformed, "dl_inference")

    log.warn('Terminating radiography analysis...')
    spark.stop()
    return None
def setUp(self): """Start Spark, define config and path to test data """ self.spark, self.log, *_ = start_spark(app_name='unittest_etl_job') self.config = json.loads("""{ "extract" : {"uri": "tests/test_data/udf_test_data/recipes_negative.json", "clean": "True"}, "transform": {"udfs_required":["tominutes"], "ingredient": "beef", "ingredient": 30, "ingredient": 60, "ingredient": 60 }, "load" : { "load_path": "output/report.csv" } } """)
def setUp(self): """Start Spark, define config and path to test data """ self.spark, *_ = start_spark(app_name='my_etl_job') self.config = json.loads("""{ "extract" : {"uri": "tests/test_data/udf_test_data/recipes_positive.json", "clean": "True", "temptable": "recipes"}, "transform": {"sql_path": "configs/transformation.sql", "udfs_required":["tominutes"]}, "load" : {"database": "hellofresh", "tablename": "recipes", "load_path": "user/hive/warehouse/hellofresh.db/recipes", "partition_cols": {"difficulty": "string"} }, "impala" : {"impala_host": "localhost"} } """)
def main(): """Main ETL script definition. :return: None """ # start Spark application and get Spark session, logger and config spark, log, config = start_spark( app_name='my_etl_job', files=['configs/etl_config.json']) # log that main ETL job is starting log.warn('etl_job is up-and-running') # execute ETL pipeline data = extract_data(spark) data_transformed = transform_data(data, config['steps_per_floor']) load_data(data_transformed) # log the success and terminate Spark application log.warn('test_etl_job is finished') spark.stop() return None
def main(): """ Main ETL script definition. :return: None """ platform = sys.argv[1] if len(sys.argv) > 1 else "local" if platform not in ["local", "emr"]: platform = "local" config_path = "./configs/etl_config.json" # start Spark application and get Spark session, logger and config spark, log, config = start_spark(app_name="spark-app", files=[config_path]) # log that main ETL job is starting log.warn("spark-app is up-and-running") if platform == "local": spark.sparkContext.addPyFile("jobs/common.py") spark.conf.set("spark.sql.crossJoin.enabled", "true") # read config config = config[platform] # execute ETL pipeline # extract data_frames = extract_data(spark, log, config) # transform data_frames = transform_data(spark, log, config, data_frames) # load load_data(spark, log, config, data_frames) # log the success and terminate Spark application spark.stop() return None
def main(): """ :return: """ pattern = u'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+' stop_words_file = './dependencies/stop_word.txt' user_dict = './dependencies/user_dict.txt' stop_words = stop_word(stop_words_file) file_path = 'E:\\workspaces_learn\\taptap-spider' spark_session, log, config = start_spark(app_name='nlp_tokenization', files=['./configs/file_list_config.json']) file_list = os.listdir(file_path) for file in file_list: if file.startswith('app_review'): if file.endswith('.csv'): file_split = file.split('.')[0] file = file_path + '/' + file sentences_list = load_data(spark_session, file) word_split(sentences_list=sentences_list, stop_words=stop_words, user_dict=user_dict, pattern=pattern, session=spark_session, file=file_split)
def main(): """Main ETL script definition. :return: None """ # start Spark application and get Spark session, logger and config spark, log, config = start_spark( app_name='process_text_job', files=['configs/process_text_config.json']) # log that main ETL job is starting log.warn('process_text_job is up-and-running') # execute ETL pipeline data = extract_data(spark, config['input_path']) data_transformed = transform_data(data) load_data(data_transformed, config['output_path']) # log the success and terminate Spark application log.info('process_text_job is finished') spark.stop() return None
def main(): """Main ETL script definition. :return: None """ # start Spark application and get Spark session, logger and config spark, log, config = start_spark(app_name='dpl_ecf', files=['configs/etl_config.json']) # log that main ETL job is starting log.warn('dpl_ecf is up-and-running') # execute ETL pipeline data_frames = extract_data(spark, config) data_frames_drop, data_frames_transpose = transform_data( data_frames, config) load_data(data_frames_drop, data_frames_transpose, config) # log the success and terminate Spark application log.warn('dpl_ecf is finished') spark.stop() return None
def __init__(self, app_name, files, jar_packages, dependencies = "packages.zip"): # start Spark application and get Spark session, logger and config self.app_name = app_name spark, log, config = start_spark( app_name=app_name, files=[files], master="192.168.122.3:7077", jar_packages=jar_packages, dependencies=dependencies ) self.kafka_server = config["kafka_server"] self.es_server = config["es_server"] self.log = log self.spark = spark self.config = config self.es_reader = (spark.read .format("org.elasticsearch.spark.sql") .option("es.nodes", self.es_server) .option("es.net.http.auth.user","elastic") .option("es.read.field.as.array.include", "winlog.keywords,etl_pipeline") .option("es.read.field.exclude", "tags,user,z_original_message,z_logstash_pipeline") )
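# Hypothetical usage sketch (an assumption, not part of the original class):
# the stored DataFrameReader is typically pointed at a concrete index with
# DataFrameReader.load(); the index pattern below is illustrative only.
#
#     events_df = self.es_reader.load("winlogbeat-*")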