def compile_translate(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    from_str = op.from_str.op().value
    to_str = op.to_str.op().value
    return F.translate(src_column, from_str, to_str)
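For reference, a minimal standalone sketch (the session, DataFrame, and column are made up for illustration) of the character-level semantics that F.translate implements:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("a-b_c",)], ["raw"])
# Each character in the "from" string maps to the character at the same position
# in the "to" string; characters with no counterpart are simply removed.
demo.select(F.translate("raw", "-_", ".").alias("cleaned")).show()
# 'a-b_c' -> 'a.bc' ('-' becomes '.', '_' is dropped)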
def write(tsdf, spark, tabName, optimizationCols=None):
    """
    param: tsdf: input TSDF object to write
    param: tabName Delta output table name
    param: optimizationCols list of columns to optimize on (time)
    """
    # Hilbert curves spread data skipping more evenly when querying multiple columns of a Delta table
    spark.conf.set("spark.databricks.io.skipping.mdc.curve", "hilbert")

    df = tsdf.df
    ts_col = tsdf.ts_col
    partitionCols = tsdf.partitionCols
    if optimizationCols:
        optimizationCols = optimizationCols + ['event_time']
    else:
        optimizationCols = ['event_time']

    import os
    useDeltaOpt = (os.getenv('DATABRICKS_RUNTIME_VERSION') is not None)

    view_df = df.withColumn("event_dt", f.to_date(f.col(ts_col))) \
        .withColumn("event_time",
                    f.translate(f.split(f.col(ts_col).cast("string"), ' ')[1], ':', '').cast("double"))
    # rotate the column list so the last column (event_time) comes first
    # (the rotation is illustrated just after this function)
    view_cols = deque(view_df.columns)
    view_cols.rotate(1)
    view_df = view_df.select(*list(view_cols))

    view_df.write.mode("overwrite").partitionBy("event_dt").format('delta').saveAsTable(tabName)

    if useDeltaOpt:
        try:
            spark.sql("optimize {} zorder by {}".format(
                tabName, "(" + ",".join(partitionCols + optimizationCols) + ")"))
        except Exception as e:
            print("Delta optimizations attempted, but were not successful.\nError: {}".format(e))
    else:
        print("Delta optimizations attempted on a non-Databricks platform. "
              "Switch to use Databricks Runtime to get optimization advantages.")
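The deque rotation above is easy to misread; a tiny standalone illustration (column names are hypothetical) of what rotate(1) does to the column order:

from collections import deque

cols = deque(["symbol", "trade_pr", "event_ts", "event_dt", "event_time"])
cols.rotate(1)     # shift every element one slot to the right
print(list(cols))  # ['event_time', 'symbol', 'trade_pr', 'event_ts', 'event_dt']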
def remove_illegal_chars(self, dataframe: DataFrame, source_column: str, target_column: str):
    # translate matches individual characters, so the matching set is built by joining
    # self.chars (the original f-string produced a literal regex-style '[...]' text,
    # which translate does not interpret as a pattern).
    df2 = dataframe.select(
        col('id'),
        translate(col(source_column), "".join(self.chars), self.replacament).alias(target_column))
    return df2.select('id', target_column)
def underscores_to_spaces(col):
    """
    Replace underscores with spaces

    :param col: Union[str, Column]
        A column or a name of a column
    """
    return F.translate(col, "_", " ")
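A hypothetical one-line usage of the helper above (the session and DataFrame are illustrative only):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("hello_spark_world",)], ["label"])
df.select(underscores_to_spaces("label").alias("label")).show(truncate=False)
# 'hello_spark_world' -> 'hello spark world'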
def read_and_clean_json(spark, input_json_path):
    '''
    Reads json file with products data
    Explodes nested columns
    Selects main columns to dataframe
    Replaces wrong values of character '&' (ampersand)
    Removes the '$' character so the price column can be cast to double
    Replaces wrong null values with None
    Casts price column to double
    '''
    df = spark.read.json(input_json_path)
    # '&amp;' is assumed to be the wrongly escaped form of '&' described in the docstring;
    # regexp_replace is used for it because the escaped entity is a multi-character token,
    # which character-wise translate cannot rewrite.
    df = df.withColumn("tmp", arrays_zip("category", "description", "image")) \
        .withColumn("tmp", explode("tmp")) \
        .select("asin", col("tmp.category"), col("tmp.description"), col("tmp.image"),
                "title", "brand", "main_cat", "price") \
        .withColumn('brand', regexp_replace('brand', '&amp;', '&')) \
        .withColumn('category', regexp_replace('category', '&amp;', '&')) \
        .withColumn('main_cat', regexp_replace('main_cat', '&amp;', '&')) \
        .withColumn('price', regexp_replace('price', r'\$', '')) \
        .replace(['null', '', 'None'], None) \
        .withColumn('price', col('price').cast("double"))
    return df
def write(tsdf, spark, tabName, optimizationCols=None):
    """
    param: tsdf: input TSDF object to write
    param: tabName Delta output table name
    param: optimizationCols list of columns to optimize on (time)
    """
    # hilbert curves more evenly distribute performance for querying multiple columns for Delta tables
    spark.conf.set("spark.databricks.io.skipping.mdc.curve", "hilbert")

    df = tsdf.df
    ts_col = tsdf.ts_col
    partitionCols = tsdf.partitionCols
    if optimizationCols:
        optimizationCols = optimizationCols + ['event_time']
    else:
        optimizationCols = ['event_time']

    # dbutils is only available on Databricks; fall back to local mode otherwise
    useDeltaOpt = True
    try:
        dbutils.fs.ls("/")
    except Exception:
        print('Running in local mode')
        useDeltaOpt = False

    view_df = df.withColumn("event_dt", f.to_date(f.col(ts_col))) \
        .withColumn("event_time",
                    f.translate(f.split(f.col(ts_col).cast("string"), ' ')[1], ':', '').cast("double"))

    view_df.write.mode("overwrite").partitionBy("event_dt").format('delta').saveAsTable(tabName)

    if useDeltaOpt:
        try:
            spark.sql("optimize {} zorder by {}".format(
                tabName, "(" + ",".join(partitionCols + optimizationCols) + ")"))
        except Exception as e:
            print("Delta optimizations attempted, but were not successful.\nError: {}".format(e))
def clean(self):
    catalog_df = self.spark.read.csv(self.source_path, inferSchema=True,
                                     header=True, mode="DROPMALFORMED")
    non_duplicated_content = catalog_df.dropDuplicates(
        ['title', 'director']).orderBy(fc.desc('title'))
    df_netflix_catalog = non_duplicated_content.dropna(
        'any', subset=['title', 'director']).orderBy(fc.asc('title'))
    df_netflix_catalog = df_netflix_catalog.withColumn(
        'title', fc.translate('title', '"', ''))
    df_netflix_catalog = df_netflix_catalog.withColumn(
        'show_id', col('show_id').cast(tp.LongType()))
    df_netflix_catalog = df_netflix_catalog.withColumn(
        'release_year', col('release_year').cast(tp.IntegerType()))
    # 'MMMM' parses full month names such as "August 14, 2020"
    df_netflix_catalog = df_netflix_catalog.withColumn(
        'date_added', fc.to_date('date_added', 'MMMM dd, yyyy'))
    # note: bucketBy is only supported together with saveAsTable; combining it with a
    # path-based parquet() write raises an AnalysisException
    df_netflix_catalog.write.partitionBy(['title', 'director']).bucketBy(
        2, "release_year").parquet(self.destination_path, mode='overwrite')
    print("Clean Catalog Executed")
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # ETL TBHV ## Phonetic dyf_phonemic = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="phonemic") dyf_phonemic = dyf_phonemic.select_fields(['id', 'phonemic']) df1 = dyf_phonemic.toDF() df1.cache() df1 = df1.select('phonemic') # myArr = np.array(df1.select('phonemic').collect()) arrPhonetic = [row.phonemic for row in df1.collect()] # print('ARR:', arrPhonetic) # print('ARR1 :', (u'i:' in arrPhonetic)) # Custom function def doSplitWord(word): rs = [] if word is not None: i = 0 size = len(word) while i < size: s = word[i:i + 2] i += 2 if s in arrPhonetic: rs.append(s) if s not in arrPhonetic: i -= 2 s = word[i:i + 1] i += 1 if s in arrPhonetic: rs.append(s) return rs # print('test:', doSplitWord('abcacd')) splitWord = udf(lambda x: doSplitWord(x)) knowledge = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'], ['P02', 'Basic'], ['P03', 'sbasic'], ['P03', 'basic'], ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None], ['L02', None], ['L03', None], ['L04', None], ['L05', None], [None, 'DICTATION'], [None, 'LISTENING']] comprehension = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'], ['P02', 'basic'], ['P03', None], ['P03', 'basic'], ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None], ['L02', None], ['L03', None], ['L04', None], ['L05', None], [None, 'DICTATION'], [None, 'LISTENING']] application = [['L04', None], ['L04', None], ['L05', None], [None, 'LISTENING']] analysis = [] synthesis = [] evaluation = [] state_gradedright = 'gradedright' def doAddScore(name, parName, state, type): arr = [] score = 0 if type == 'knowledge': arr = knowledge if type == 'comprehension': arr = comprehension if type == 'application': arr = application if type == 'analysis': arr = analysis if type == 'synthesis': arr = synthesis if type == 'evaluation': arr = evaluation if state == state_gradedright: score = 2 if state != state_gradedright: score = -1 for x in arr: if x[0] is None and x[1] == parName: return score if x[0] == name and x[1] is None: return score if x[0] == name and x[1] is not None and x[1].lower( ) in parName.lower(): return score return 0 addScore = udf(doAddScore, IntegerType()) # print('CHECK:', checkContains('ABCD EFHFF')) # chuoi ky tu can replace special_str = '["].' 
######### top_question_attempt_steps dyf_top_question_attempt_steps = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_attempt_steps_092019") dyf_top_question_attempt_steps = dyf_top_question_attempt_steps.select_fields( ['_key', 'id', 'questionattemptid', 'state', 'userid']).rename_field('id', 'steps_id') try: # # doc moc flag tu s3 # df_flag = spark.read.parquet("s3://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts") # start_read = df_flag.collect()[0]['flag'] # print('read from index: ', start_read) start_read = 22000000 end_read = 24000000 # so sanh _key datasource voi flag, lay nhung gia tri co key > flag dyf_top_question_attempt_steps = Filter.apply( frame=dyf_top_question_attempt_steps, f=lambda x: x['_key'] >= start_read and x['_key'] < end_read) except: print('read flag file error ') df_temp = dyf_top_question_attempt_steps.toDF() df_temp.cache() print('COUNT df_temp:', df_temp.count()) dyf_top_question_attempt_steps = DynamicFrame.fromDF( df_temp, glueContext, "dyf_right") # print('number of dyf_top_question_attempt_steps: ', dyf_top_question_attempt_steps.count()) if dyf_top_question_attempt_steps.count() > 0: ########## dyf_top_user dyf_top_user = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="do_top_user") dyf_top_user = dyf_top_user.select_fields(['id', 'student_id']).rename_field( 'id', 'top_user_id') ########## top_question_attempts dyf_top_question_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_attempts_092019") dyf_top_question_attempts = dyf_top_question_attempts.select_fields( ['id', 'rightanswer', 'questionid', 'timemodified']) # dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_question dyf_top_question = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question") dyf_top_question = dyf_top_question.select_fields( ['id', 'name', 'category']).rename_field('id', 'quest_id') # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_question_categories dyf_top_question_categories = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_categories") dyf_top_question_categories = dyf_top_question_categories.select_fields( ['id', 'name', 'parent']).rename_field('id', 'quest_cat_id') ######### dyf_top_question_categories_parent dyf_top_question_categories_parent = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_categories") dyf_top_question_categories_parent = dyf_top_question_categories_parent.select_fields( ['id', 'name']).rename_field('id', 'par_id').rename_field('name', 'par_name') # print("COUNT dyf_top_question_attempts:", dyf_top_question_attempts.count()) # print("COUNT dyf_top_question:", dyf_top_question.count()) # print("COUNT dyf_top_question_attempt_steps:", dyf_top_question_attempt_steps.count()) # print("COUNT dyf_top_question_categories:", dyf_top_question_categories.count()) # dyf_top_question_attempt_steps = Filter.apply(frame=dyf_top_question_attempt_steps, f=lambda x: x["steps_id"]) # JOIN va FILTER cac bang theo dieu kien dyf_join01 = Join.apply(dyf_top_question_attempt_steps, dyf_top_question_attempts, 'questionattemptid', 'id') # print("COUNT 1:", dyf_join01.count()) # dyf_join01.printSchema() dyf_join02 = Join.apply(dyf_join01, dyf_top_question, 'questionid', 'quest_id') # print("COUNT 2:", dyf_join02.count()) # 
dyf_join02.printSchema() dyf_join03 = Join.apply(dyf_join02, dyf_top_question_categories, 'category', 'quest_cat_id') dyf_join03 = Join.apply(dyf_join03, dyf_top_question_categories_parent, 'parent', 'par_id') # print("COUNT dyf_join03 1:", dyf_join03.count()) # print("COUNT dyf_top_user:"******"COUNT dyf_join03 2:", dyf_join03.count()) # dyf_join03.printSchema() dyf_join03 = dyf_join03.select_fields([ 'student_id', 'rightanswer', 'timemodified', 'state', 'name', 'parent', 'par_name' ]) arrName = [ 'V01', 'V02', 'V03', 'V04', 'V05', 'G01', 'G02', 'G03', 'G04', 'G05', 'P01', 'P02', 'P03', 'P04', 'P05' ] arrParName = ['CONVERSATIONAL_EXPRESSION', 'VOCABULARY', 'READING'] dyf_join03 = Filter.apply( frame=dyf_join03, f=lambda x: x["name"] in arrName or x["par_name"] in arrParName) # dyf_join03.printSchema() # dyf_join03.show() # dyf_right = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] == state_gradedright) # dyf_wrong = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] != state_gradedright) # dyf_join02.show() df_right = dyf_join03.toDF() # df_right.cache() if (df_right.count() > 0): try: # print("COUNT 1:", df_right.count()) # Loc cac ky tu dac biet [ ] ", # Tach cau thanh array tu: # house, her => [house, her] df_right = df_right.withColumn( "right_str", f.translate(df_right.rightanswer, special_str, '')) df_right = df_right.withColumn( "right_arr", f.split(df_right.right_str, ' ')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_right = df_right.withColumn("right", f.explode(df_right.right_arr)) # print("COUNT 2:", df_right.count()) df_right.printSchema() dyf_right = DynamicFrame.fromDF(df_right, glueContext, "dyf_right") ## Learning Object dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="nvn_knowledge_learning_object") dyf_learning_object = dyf_learning_object.select_fields([ 'learning_object_id', 'learning_object_name', 'phone_tic' ]) df_learning_object = dyf_learning_object.toDF() # replace cac ky tu df_learning_object = df_learning_object.withColumn( "phone_tic_new", f.translate(df_learning_object.phone_tic, '\',', '')) df_learning_object = df_learning_object.withColumn( "phone_tic_tmp", splitWord(df_learning_object.phone_tic_new)) df_learning_object = df_learning_object.withColumn( "phone_tic_tmp_01", f.translate(df_learning_object.phone_tic_tmp, '[]', '')) df_learning_object = df_learning_object.withColumn( "phone_tic_arr", f.split(df_learning_object.phone_tic_tmp_01, ',')) df_learning_object = df_learning_object.select( 'learning_object_id', 'learning_object_name', 'phone_tic_arr') dyf_learning_object = DynamicFrame.fromDF( df_learning_object, glueContext, "dyf_learning_object") dyf_knowledge_right = Join.apply(dyf_right, dyf_learning_object, 'right', 'learning_object_name') dyf_knowledge_right = dyf_knowledge_right.select_fields([ 'student_id', 'learning_object_id', 'name', 'parent', 'timemodified', 'par_name', 'state', 'phone_tic_arr' ]) # print("COUNT 3:", dyf_knowledge_right.count()) # dyf_knowledge_right.printSchema() # dyf_knowledge_right.show() # # print("COUNT 4:", dyf_knowledge_wrong.count()) # # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung df_knowledge_right = dyf_knowledge_right.toDF() df_knowledge_right.cache() df_knowledge_right = df_knowledge_right.withColumn( "right_phonetic", f.explode(df_knowledge_right.phone_tic_arr)) df_knowledge_right = df_knowledge_right.select( 'student_id', 'name', 'timemodified', 'par_name', 'state', 'right_phonetic') 
dyf_study_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_study_right") dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'right_phonetic', 'phonemic') df_knowledge_right = dyf_phonemic_right.toDF() df_knowledge_right = df_knowledge_right.withColumn("knowledge", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit("knowledge"))) \ .withColumn("comprehension", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('synthesis'))) \ .withColumn("evaluation", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('evaluation'))) \ .withColumn("date_id", from_unixtime(df_knowledge_right['timemodified'], 'yyyyMMdd')) \ .withColumn("lo_type", f.lit(2)) # df_knowledge_right.printSchema() # df_knowledge_right.show() dyf_knowledge_right = DynamicFrame.fromDF( df_knowledge_right, glueContext, "dyf_knowledge_right") dyf_knowledge_right = dyf_knowledge_right.resolveChoice( specs=[('lo_type', 'cast:byte')]) # df_knowledge_right = dyf_knowledge_right.toDF() # chon cac truong va kieu du lieu day vao db applymapping = ApplyMapping.apply( frame=dyf_knowledge_right, mappings=[("timemodified", "long", "timestart", "long"), ("name", "string", "name", "string"), ("par_name", "string", "par_name", "string"), ("student_id", 'int', 'student_id', 'long'), ("id", "int", "learning_object_id", "int"), ("date_id", "string", "date_id", "long"), ("knowledge", "int", "knowledge", "long"), ("comprehension", "int", "comprehension", "long"), ("application", "int", "application", "long"), ("analysis", "int", "analysis", "long"), ("synthesis", "int", "synthesis", "long"), ("evaluation", "int", "evaluation", "long"), ("phone_tic", "string", "phone_tic", "long"), ("lo_type", "byte", "lo_type", "int")]) resolvechoice = ResolveChoice.apply( frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply( frame=resolvechoice, transformation_ctx="dropnullfields") datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "t_temp_right_learning_object_02", "database": "dts_odin", "postactions": """ call proc_knowledge_ngu_am_top_question_attempts () """ }, redshift_tmp_dir="s3n://dts-odin/temp1/", transformation_ctx="datasink5") # xoa cache # df_right.unpersist() # df_knowledge_right.unpersist() # df_knowledge_right.unpersist() # lay max _key tren datasource df_temp = dyf_top_question_attempt_steps.toDF() flag = df_temp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de flag moi vao s3 df.write.parquet( "s3a://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts", mode="overwrite") except Exception as e: print( "###################### Exception ##########################" ) print(e)
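For readers of the Glue job above, here is a standalone restatement in plain Python (with a hypothetical phoneme inventory) of the greedy split that doSplitWord performs: try a two-character phonetic symbol first, fall back to a one-character symbol, and skip anything that matches neither. This is an illustration of the logic, not the job's actual UDF.

def split_phonemes(word, inventory):
    result, i = [], 0
    while i < len(word):
        two, one = word[i:i + 2], word[i:i + 1]
        if two in inventory:
            result.append(two)
            i += 2
        elif one in inventory:
            result.append(one)
            i += 1
        else:
            i += 1  # unknown character: skip it and move on
    return result

print(split_phonemes("shaep", {"sh", "ae", "p"}))  # ['sh', 'ae', 'p']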
def tr(self, m, r):
    f = lambda c: F.translate(c, m, r)
    return self.apply(f)
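The snippet above depends on a surrounding class that supplies apply(); a hypothetical minimal context (the class name, constructor, and column iteration are all assumptions) showing how such a wrapper might hang together:

import pyspark.sql.functions as F


class ColumnTransformer:
    """Hypothetical wrapper: apply() maps a column function over every column."""

    def __init__(self, df):
        self.df = df

    def apply(self, f):
        return self.df.select(*[f(F.col(c)).alias(c) for c in self.df.columns])

    def tr(self, m, r):
        f = lambda c: F.translate(c, m, r)
        return self.apply(f)

# e.g. ColumnTransformer(df).tr("-_", "  ") maps '-' and '_' to spaces in every column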
def process_biomarkers( self, biomarkers_df: DataFrame, source_df: DataFrame, disease_df: DataFrame, drugs_df: DataFrame ) -> DataFrame: """The diverse steps to prepare and enrich the input table""" biomarkers_enriched = ( biomarkers_df .select( 'Biomarker', 'IndividualMutation', array_distinct(split(col('Alteration'), ';')).alias('alterations'), array_distinct(split(col('Gene'), ';')).alias('gene'), split(col('AlterationType'), ';').alias('alteration_types'), array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'), array_distinct(split(col('Drug'), ';|,')).alias('drug'), 'DrugFullName', 'Association', 'gDNA', array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'), array_distinct(split(col('Source'), ';')).alias('source') ) .withColumn('confidence', explode(col('confidence'))) .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name'))) .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', '')) .withColumn('drug', explode(col('drug'))) .withColumn('drug', translate(col('drug'), '[]', '')) .withColumn('gene', explode(col('gene'))) .replace(to_replace=GENENAMESOVERRIDE, subset=['gene']) .withColumn('gene', upper(col('gene'))) # At this stage alterations and alteration_types are both arrays # Disambiguation when the biomarker consists of multiple alterations is needed # This is solved by: # 1. Zipping both fields - tmp consists of a list of alteration/type tuples # 2. tmp is exploded - tmp consists of the alteration/type tuple # 3. alteration & alteration_type columns are overwritten with the elements in the tuple .withColumn( 'tmp', self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types'))) .withColumn('tmp', explode(col('tmp'))) .withColumn('alteration_type', element_at(col('tmp'), 2)) .withColumn( 'alteration', when( ~col('IndividualMutation').isNull(), col('IndividualMutation') ) .otherwise(element_at(col('tmp'), 1)) ) .drop('tmp') # Clean special cases on the alteration string .withColumn( 'alteration', when( col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.', col('Biomarker') # 'NRAS (12,13,59,61,117,146)' ) .when( # Cleans strings like 'ARAF:.' col('alteration').contains(':.'), translate(col('alteration'), ':.', '') ) .when( # Fusion genes are described with '__' # biomarker is a cleaner representation when there's one alteration (col('alteration').contains('__')) & (~col('Biomarker').contains('+')), col('Biomarker') ) .otherwise(col('alteration')) ) # Split source into literature and urls # literature contains PMIDs # urls are enriched from the source table if not a CT .withColumn('source', explode(col('source'))) .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source'))) .join(source_df, on='source', how='left') .withColumn( 'literature', when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2)) ) .withColumn( 'urls', when( col('source').startswith('NCT'), struct( lit('Clinical Trials').alias('niceName'), concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url') ) ) .when( (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')), struct(col('niceName'), col('url')) ) ) # The previous conditional clause creates a struct regardless of # whether any condition is met. 
The empty struct is replaced with null .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls'))) # Enrich data .withColumn('functionalConsequenceId', col('alteration_type')) .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId']) .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association']) .join(disease_df, on='tumor_type', how='left') .withColumn('drug', upper(col('drug'))) .withColumn( # drug class is coalesced when the precise name of the medicine is not provided 'drug', when(col('drug') == '', col('DrugFullName')).otherwise(col('drug'))) .join(drugs_df, on='drug', how='left') .withColumn('drug', initcap(col('drug'))) # Translate variantId .withColumn( 'variantId', when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA'))) ) # Assign a GO ID when a gene expression data is reported .withColumn( 'geneExpressionId', when( (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')), 'GO_0010628' ) .when( (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')), 'GO_0010629' ) .when( (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')), 'GO_0010467' ) ) # Create variant struct .withColumn( 'variant', when( col('alteration_type') != 'EXPR', struct( col('alteration').alias('name'), col('variantId').alias('id'), col('functionalConsequenceId') ) ) ) # Create geneExpression struct .withColumn( 'geneExpression', when( col('alteration_type') == 'EXPR', struct( col('alteration').alias('name'), col('geneExpressionId').alias('id')) ) ) ) pre_evidence = ( biomarkers_enriched .withColumn('datasourceId', lit('cancer_biomarkers')) .withColumn('datatypeId', lit('affected_pathway')) .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource') .withColumnRenamed('drug', 'drugFromSource') # diseaseFromSourceMappedId, drugId populated above .withColumnRenamed('Association', 'drugResponse') # confidence, literature and urls populated above .withColumnRenamed('gene', 'targetFromSourceId') .withColumnRenamed('Biomarker', 'biomarkerName') # variant, geneExpression populated above .drop( 'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation', 'geneExpressionId', 'gDNA', 'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url') ) # Group evidence self.evidence = ( pre_evidence .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId', 'drugResponse', 'targetFromSourceId', 'diseaseFromSource', 'diseaseFromSourceMappedId', 'confidence', 'biomarkerName') .agg( collect_set('literature').alias('literature'), collect_set('urls').alias('urls'), collect_set('variant').alias('variant'), collect_set('geneExpression').alias('geneExpression'), ) # Replace empty lists with null values .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature'))) .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls'))) .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant'))) .withColumn( 'geneExpression', when(size(col('geneExpression')) == 0, lit(None)) .otherwise(col('geneExpression'))) # Collect variant info into biomarkers struct .withColumn( 'biomarkers', struct( 'variant', 'geneExpression' )) .drop('variant', 'geneExpression') .distinct() ) return self.evidence
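The pipeline above relies on self.zip_alterations_with_type_udf, which is defined elsewhere. Below is a hedged sketch of what such a zipping UDF could look like, assuming a single alteration type should be reused when several alterations share it; it is an illustration consistent with the element_at(tmp, 1)/element_at(tmp, 2) usage above, not the project's actual implementation.

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

@udf(ArrayType(ArrayType(StringType())))
def zip_alterations_with_type(alterations, alteration_types):
    # Pair each alteration with an alteration type so that exploding the result
    # yields one [alteration, type] tuple per row.
    if not alterations or not alteration_types:
        return []
    if len(alteration_types) == 1:
        alteration_types = alteration_types * len(alterations)
    return [list(pair) for pair in zip(alterations, alteration_types)]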
# MAGIC
# MAGIC Spark performs predictive analytics using machine learning algorithms.
# MAGIC
# MAGIC The example below trains a linear regression model using past flight data to predict delays based on the hour of the day.

# COMMAND ----------

from pyspark.sql.functions import col, floor, translate, round
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.regression import LinearRegression

inputDF = (spark.read.table("AirlineFlight")
           .withColumn("HourOfDay", floor(col("CRSDepTime") / 100))
           .withColumn("DepDelay", translate(col("DepDelay"), "NA", "0").cast("integer")))

(trainingDF, testDF) = inputDF.randomSplit([0.80, 0.20], seed=999)

pipeline = Pipeline(stages=[
    OneHotEncoder(inputCol="HourOfDay", outputCol="HourVector"),
    VectorAssembler(inputCols=["HourVector"], outputCol="Features"),
    LinearRegression(featuresCol="Features", labelCol="DepDelay",
                     predictionCol="DepDelayPredicted", regParam=0.0)
])

model = pipeline.fit(trainingDF)
resultDF = model.transform(testDF)
df = df.withColumn('publish_time_2', regexp_replace(df.publish_time_2, 'Z', ''))
df = df.withColumn("publish_time_3", to_timestamp(df.publish_time_2, 'yyyy-MM-dd HH:mm:ss.SSS'))
print(df.printSchema())
df.select("publish_time", "publish_time_2", "publish_time_3").show(5, False)

# Notice the .000 on the end of publish_time_2 as opposed to publish_time_3

# **Translate Function**
#
# You could also use the translate function here, where the first set of characters is what you are
# looking for and the second set is what you want to replace those characters with, respectively
# (see the short comparison example after this cell).

# In[40]:

import pyspark.sql.functions as f
df.select("publish_time",
          f.translate(f.col("publish_time"), "TZ", " ").alias("translate_func")).show(5, False)

# **Trim**
#
# One common function you've probably seen in almost any data processing tool, including Excel,
# is "trim", which removes leading and trailing white space from a cell in various ways.
# Let's go ahead and do that with the title field.

# In[41]:

# Trim
# pyspark.sql.functions.trim(col) - Trim the spaces from both ends of the specified string column.
from pyspark.sql.functions import *

df = df.withColumn('title', trim(df.title))  # or rtrim/ltrim
df.select("title").show(5, False)
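A short, self-contained comparison (with a hypothetical timestamp value) of translate's character-by-character substitution versus regexp_replace's pattern rewriting, to make the difference described above concrete:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("2019-01-01T12:30:00Z",)], ["ts"])
demo.select(
    # translate maps 'T' to ' ' and deletes 'Z' (no counterpart in the replacement set)
    f.translate("ts", "TZ", " ").alias("translate_func"),
    # regexp_replace rewrites every character matching the pattern [TZ] with a space
    f.regexp_replace("ts", "[TZ]", " ").alias("regexp_func")
).show(truncate=False)
# translate_func: '2019-01-01 12:30:00'    regexp_func: '2019-01-01 12:30:00 '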
######### load vertices
verticesText0 = spark.read.csv(File_verticesTextRDD, header='false', inferSchema='false', sep='\t')
verticesText1 = verticesText0.select("_c1", "_c0", "_c2", "_c3", "_c4")\
    .withColumnRenamed("_c0", "nodeType").withColumnRenamed("_c1", "id")\
    .withColumnRenamed("_c2", "attr1").withColumnRenamed("_c3", "attr2")\
    .withColumnRenamed("_c4", "attr3")

######### load edges
edgesText0 = spark.read.csv(File_edgesTextRDD)
# translate strips characters, not substrings (see the short note after this snippet)
edgesText0 = edgesText0.select(
    f.translate(f.col("_c0"), "Edge(", "").alias("src"),
    "_c1",
    f.translate(f.col("_c2"), ")", "").alias("label"))
edgesText1 = edgesText0.select("*").withColumnRenamed("_c1", "dst")

verticesText1J = verticesText1
edgesText2 = edgesText1.join(
    verticesText1.select("id", "nodeType"),
    edgesText1.src == verticesText1.select("id", "nodeType").id,
    "inner")
edgesText2 = edgesText2.withColumnRenamed("id", "src_id").withColumnRenamed(
    "nodeType", "src_nodeType")
edgesText3 = edgesText2.join(
    verticesText1.select("id", "nodeType"),
    edgesText2.dst == verticesText1.select("id", "nodeType").id,
    "inner")
edgesText3 = edgesText3.withColumnRenamed("id", "dst_id").withColumnRenamed(
    "nodeType", "dst_nodeType")
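A note on the translate calls above, shown on hypothetical values: translate works on individual characters, not substrings, so "Edge(" as the matching set strips every 'E', 'd', 'g', 'e' and '(' wherever they appear. That only gives the intended result when the remaining payload does not itself contain those characters (numeric IDs are safe).

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("Edge(42",), ("Edge(edge7",)], ["_c0"])
demo.select(f.translate(f.col("_c0"), "Edge(", "").alias("src")).show()
# 'Edge(42'    -> '42'   (intended)
# 'Edge(edge7' -> '7'    (the letters of 'edge' in the payload are stripped too)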
    .builder \
    .getOrCreate()

# create a dataframe out of it by using the first row as field names and trying to infer a schema based on contents
df = spark.read.option("header", "true").option(
    "inferSchema", "true").csv('noaa-weather-data-jfk-airport/jfk_weather.csv')

# register a corresponding query table. we do this to save the data in memory and run our operations on it.
df.createOrReplaceTempView('df')

# cleaning the data as it contains trailing characters. Double is a data type like float.
# columns with no trailing characters were converted straight to double type; the rest were cleaned first.
df_cleaned = df \
    .withColumn("HOURLYWindSpeed", df.HOURLYWindSpeed.cast('double')) \
    .withColumn("HOURLYWindDirection", df.HOURLYWindDirection.cast('double')) \
    .withColumn("HOURLYStationPressure", translate(col("HOURLYStationPressure"), "s,", "")) \
    .withColumn("HOURLYPrecip", translate(col("HOURLYPrecip"), "s,", "")) \
    .withColumn("HOURLYRelativeHumidity", translate(col("HOURLYRelativeHumidity"), "*", "")) \
    .withColumn("HOURLYDRYBULBTEMPC", translate(col("HOURLYDRYBULBTEMPC"), "*", ""))

# the cleaned columns were then changed to double types
df_cleaned = df_cleaned \
    .withColumn("HOURLYStationPressure", df_cleaned.HOURLYStationPressure.cast('double')) \
    .withColumn("HOURLYPrecip", df_cleaned.HOURLYPrecip.cast('double')) \
    .withColumn("HOURLYRelativeHumidity", df_cleaned.HOURLYRelativeHumidity.cast('double')) \
    .withColumn("HOURLYDRYBULBTEMPC", df_cleaned.HOURLYDRYBULBTEMPC.cast('double'))

# Filtering for a clean data set with no nulls and wind speed not 0
df_filtered = df_cleaned.filter("""
    HOURLYWindSpeed <> 0
    and HOURLYWindSpeed IS NOT NULL
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # thoi gian tu 01/10/2019 timestamp = 1569888000 # ETL TBHV # Custom function def doSplitWord(word): size = len(word) rs = [word[i:i + 2] for i in range(0, size, 1)] rs1 = [word[i:i + 1] for i in range(0, size, 1)] rs.extend(rs1) return rs state_right = 'state_right' state_wrong = 'state_wrong' # mac dinh duoc cong knowledge # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2 knowledge = '' # cong diem comprehension: # Can list cac name duoc cong diem comprehension: # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 comprehension = [ 'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2' ] # cong diem application: # Can list cac name duoc cong diem application: # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 application = [ 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2' ] # cong diem analysis: # Can list cac name duoc cong diem analysis # P2_D3; P3_D2; P4_D1; P4_D2 analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem synthesis: # Can list cac name duoc cong diem synthesis # P4_D1; P4_D2 synthesis = ['P4_D1', 'P4_D2'] # cong diem evaluation: # Can list cac name duoc cong diem evaluation evaluation = '' def doAddScore(name, state, type): arr = [''] score = 0 if type == 'comprehension': arr = comprehension if type == 'application': arr = application if type == 'analysis': arr = analysis if type == 'synthesis': arr = synthesis name = name.lower() if state == state_right: score = 10 if state == state_wrong: score = -5 if name is not None: for x in arr: if x.lower() in name: return score return 0 addScore = udf(doAddScore, IntegerType()) def doAddScoreAll(plus, minus): if plus is None and minus is not None: return minus if minus is None and plus is not None: return plus if minus is not None and plus is not None: return plus + minus return 0 addScoreAll = udf(doAddScoreAll, IntegerType()) def do_check_null(val1, val2): if val1 is None and val2 is not None: return val2 if val2 is None and val1 is not None: return val1 if val1 is not None and val2 is not None: return val1 return 0 check_data_null = udf(do_check_null, StringType()) # chuoi ky tu can replace special_str = '["] ;' splitWord = udf(lambda x: doSplitWord(x)) ########## top_quiz_attempts dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_quiz_attempts") dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields( ['_key', 'id', 'timestart', 'quiz']) dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice( specs=[('_key', 'cast:long')]) print dyf_top_quiz_attempts.count() dyf_top_quiz_attempts.show(2) # try: # # # doc moc flag tu s3 # df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet") # start_read = df_flag.collect()[0]['flag'] # print('read from index: ', start_read) # # # so sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read) # except: # print('read flag file error ') dyf_top_quiz_attempts = Filter.apply( frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp) print dyf_top_quiz_attempts.count() dyf_top_quiz_attempts.show() if dyf_top_quiz_attempts.count() > 0: ########## dyf_top_user dyf_top_user = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="do_top_user") dyf_top_user = dyf_top_user.select_fields(['id', 
'student_id']).rename_field( 'id', 'top_user_id') ######### top_question dyf_top_question = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question") dyf_top_question = dyf_top_question.select_fields( ['id', 'name']).rename_field('id', 'quest_id') # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_result_ai dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_result_ai") dyf_top_result_ai = dyf_top_result_ai.select_fields([ 'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word', 'wrong_word' ]) # JOIN va FILTER cac bang theo dieu kien dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question, 'question_id', 'quest_id') dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts, 'attempt_id', 'id') dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["quiz"] not in [7, 9, 918]) dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id', 'top_user_id') # dyf_join02.show() df_study = dyf_join02.toDF() df_study.cache() if (df_study.count() > 0): try: # print("COUNT 1:", df_study.count()) # Loc cac ky tu dac biet [ ] ", # Hien data co dang nhu sau: ["house","her","to","how","get","long"] hoac "environmental", ... # df_study = df_study.select( # 'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word, # special_str, ''), f.translate(df_study.wrong_word, # special_str, '')) df_study = df_study.select('quiz', 'name', 'student_id', 'timestart', 'right_word', 'wrong_word') df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \ .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, '')) # Tach cau thanh array tu: # house, her => [house, her] # PHan tich tu dung df_study_right = df_study.withColumn( "right_word_list", f.split(df_study.right_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_right = df_study_right.withColumn( "right", f.explode(df_study_right.right_word_list)) df_study_right = df_study_right.select('quiz', 'name', 'student_id', 'timestart', 'right') df_study_right = df_study_right.withColumn( "right", f.lower(f.col("right"))) # print("COUNT 2:", df_study_right.count()) # df_study_right.printSchema() # df_study_right.show() dyf_study_right = DynamicFrame.fromDF(df_study_right, glueContext, "dyf_study_right") ## Learning Object dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="learning_object") dyf_learning_object = dyf_learning_object.select_fields( ['learning_object_id', 'learning_object_name']) df_learning_object = dyf_learning_object.toDF() # convert to lowercase df_learning_object = df_learning_object.withColumn( "learning_object_name", f.lower(f.col("learning_object_name"))) dyf_learning_object = DynamicFrame.fromDF( df_learning_object, glueContext, "dyf_learning_object") dyf_knowledge_right = Join.apply(dyf_study_right, dyf_learning_object, 'right', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_right.count()) # dyf_knowledge_right.printSchema() # print("COUNT 4:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung df_knowledge_right = dyf_knowledge_right.toDF() df_knowledge_right.cache() df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \ .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), 
f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd')) df_knowledge_right = df_knowledge_right.groupby( 'student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_plus"), f.sum('knowledge').alias("knowledge_plus"), f.sum('comprehension').alias("comprehension_plus"), f.sum('application').alias("application_plus"), f.sum('analysis').alias("analysis_plus"), f.sum('synthesis').alias("synthesis_plus"), f.sum('evaluation').alias("evaluation_plus")) df_knowledge_right = df_knowledge_right.where( 'student_id is not null') # df_knowledge_right.printSchema() # df_knowledge_right.show() # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right, # mappings=[("timestart", "long", "timestart", "long"), # ("student_id", 'int', 'student_id', 'long'), # ("learning_object_id", "int", "learning_object_id", "int"), # ("date_id", "string", "date_id", "int"), # ("knowledge", "int", "knowledge", "int"), # ("comprehension", "int", "comprehension", "int"), # ("application", "int", "application", "int"), # ("analysis", "int", "analysis", "int"), # ("synthesis", "int", "synthesis", "int"), # ("evaluation", "int", "evaluation", "int")]) # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", # transformation_ctx="resolvechoice2") # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") # # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "temp_right_wrong_learning_object", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") # END Cong diem cac tu dung ################################################# # Tru diem cac tu sai: Xu lu tuong tu tu dung. 
# rule tru diem la -5 diem neu sai df_study_wrong = df_study.withColumn( "wrong_word_list", f.split(df_study.wrong_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_wrong = df_study_wrong.withColumn( "wrong", f.explode(df_study_wrong.wrong_word_list)) #convert to lowercase df_study_wrong = df_study_wrong.withColumn( "wrong", f.lower(f.col("wrong"))) df_study_wrong = df_study_wrong.select('quiz', 'name', 'student_id', 'timestart', 'wrong') # print("COUNT 2:", df_study_wrong.count()) # df_study_wrong.printSchema() # df_study_wrong.show() dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong, glueContext, "dyf_study_wrong") ## Learning Object dyf_knowledge_wrong = Join.apply(dyf_study_wrong, dyf_learning_object, 'wrong', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # print("COUNT 4:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung df_knowledge_wrong = dyf_knowledge_wrong.toDF() df_knowledge_wrong.cache() df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \ .withColumn("comprehension", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd')) df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_minus"), f.sum('knowledge').alias("knowledge_minus"), f.sum('comprehension').alias("comprehension_minus"), f.sum('application').alias("application_minus"), f.sum('analysis').alias("analysis_minus"), f.sum('synthesis').alias("synthesis_minus"), f.sum('evaluation').alias("evaluation_minus"))\ .withColumnRenamed('student_id', 'student_id_wrong') \ .withColumnRenamed('date_id', 'date_id_wrong') \ .withColumnRenamed('learning_object_id', 'learning_object_id_wrong') df_knowledge_wrong = df_knowledge_wrong.where( 'student_id_wrong is not null') # df_study_all = df_study.select('student_id').withColumnRenamed('student_id', 'student_id_all') # df_knowledge_right.printSchema() # df_knowledge_right.show() df_knowledge = df_knowledge_right.join( df_knowledge_wrong, (df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong']) & (df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong']) & (df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']), 'outer') df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \ .withColumn("learning_object_id", check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \ .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \ .withColumn("source_system", f.lit('top_result_ai')) \ .withColumn("lu_id", f.lit(0)) dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext, "df_knowledge") applymapping2 = ApplyMapping.apply( frame=dyf_knowledge, mappings=[ ("user_id", 'string', 'student_id', 'long'), ("learning_object_id", "string", "learning_object_id", "long"), # 
("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), ("knowledge_plus", "long", "knowledge_plus", "long"), ("comprehension_plus", "long", "comprehension_plus", "long"), ("application_plus", "long", "application_plus", "long"), ("analysis_plus", "long", "analysis_plus", "long"), ("synthesis_plus", "long", "synthesis_plus", "long"), ("evaluation_plus", "long", "evaluation_plus", "long"), ("knowledge_minus", "long", "knowledge_minus", "long"), ("comprehension_minus", "long", "comprehension_minus", "long"), ("application_minus", "long", "application_minus", "long"), ("analysis_minus", "long", "analysis_minus", "long"), ("synthesis_minus", "long", "synthesis_minus", "long"), ("evaluation_minus", "long", "evaluation_minus", "long"), ("count_plus", "long", "plus_number", "long"), ("count_minus", "long", "minus_number", "long"), # ("lo_type", "string", "lo_type", "long"), ("source_system", "string", "source_system", "string"), ("created_date_id", "string", "created_date_id", "long"), ("lu_id", "int", "lu_type", "long") # ("student_level", "string", "student_level", "string"), # ("advisor_id", "string", "advisor_id", "long"), # ("package_code", "string", "package_code", "string") ]) applymapping2.printSchema() applymapping2.show(20) resolvechoice2 = ResolveChoice.apply( frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice3") dropnullfields2 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields2") print('COUNT df_knowledge: ', dropnullfields2.count()) dropnullfields2.printSchema() dropnullfields2.show(2) print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options( frame=dropnullfields2, connection_type="s3", connection_options={ "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/", "partitionKeys": ["created_date_id", "source_system"] }, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_result_ai/", # transformation_ctx="datasink5") # END Tru diem cac tu sai # xoa cache df_study.unpersist() df_knowledge_right.unpersist() df_knowledge_wrong.unpersist() # df_knowledge_right.unpersist() except Exception as e: print( "###################### Exception ##########################" ) print(e) # ghi flag # lay max key trong data source mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF() flag = mdl_dyf_top_quiz_attempts.agg({ "_key": "max" }).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet( "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet", mode="overwrite")
.alias("color_cleaned"), col("Description"))\ .show(2) """ = SELECT regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as color_cleaned, Description FROM dfTable """ #2 replace characters with diferent characters print("2") df.select( translate(col("Description"), "LEET", "1327"), col("Description") )\ .show(2) """ SELECT translate(Description, 'LEET', '1327'), Description FROM dfTable """ #3 pulling out the first mentioned color print("3")
"rowTag", "inproceedings").option('charset', "UTF-8").schema(schema).load(unescaped_src.name) articles_df = spark.read.format('com.databricks.spark.xml').option("rowTag", "article").option('charset', "UTF-8").schema(schema).load( unescaped_src.name) #Almacenamos los jsons incollections_df.write.option("charset", "UTF-8").json('./json/incollections') inproceedings_df.write.option("charset", "UTF-8").json('./json/inproceedings') articles_df.write.option("charset", "UTF-8").json('./json/articles') #Agrupamos y almacenamos los csv publications_df = incollections_df.withColumn('LABEL',lit('Incollection')).union( inproceedings_df.withColumn('LABEL',lit('Inproceeding'))).union( articles_df.withColumn('LABEL', lit('Article'))) publications_df = publications_df.filter(publications_df._key.isNotNull()) publications_df.withColumn('id', translate('_key', '/', '_')).select('id', 'title', 'year', 'LABEL').write.option('escape', '"').csv( './csv/publications') publications_df.withColumn('_author', explode('author._VALUE')).select( '_author').write.option('escape', '"').csv('./csv/authors') publications_df.withColumn('start', explode('author._VALUE')).withColumn( 'end', translate('_key', '/', '_')).select('start', 'end').write.option( 'escape', '"').csv('./csv/rels') sc.stop()
# COMMAND ----------

# MAGIC %md
# MAGIC ## Fixing Data Types
# MAGIC
# MAGIC Take a look at the schema above. You'll notice that the `price` field got picked up as string. For our task, we need it to be a numeric (double type) field.
# MAGIC
# MAGIC Let's fix that.

# COMMAND ----------

from pyspark.sql.functions import col, translate

fixedPriceDF = baseDF.withColumn(
    "price",
    translate(col("price"), "$,", "").cast("double"))

display(fixedPriceDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary statistics
# MAGIC
# MAGIC Two options:
# MAGIC * describe
# MAGIC * summary (describe + IQR)

# COMMAND ----------

display(fixedPriceDF.describe())
def main(sc): """ Main processing function Read in data from PostgreSQL transaction table Perform reverse lookup for vin transactions and return input Bitcoin values and addresses Perform disjoint set (i.e., union find) algorithm using GraphFrames Write out address clustering results to PostgreSQL """ # ---READ IN TRANSACTION DATA AND PERFORM REVERSE TX LOOKUP USING JOINS--- # create initial SQL query # tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {} WHERE height <= 400000 LIMIT 5000000"\ tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {}"\ .format(config.SPARK_CONFIG['PG_TX_TABLE']) # read in data from PostgreSQL tx_df = spark.read \ .format(config.SPARK_CONFIG['PG_FORMAT']) \ .option("url", config.SPARK_CONFIG['PG_URL'] + config.SPARK_CONFIG['PG_PORT'] + "/" + config.SPARK_CONFIG['PG_DB']) \ .option("user", config.SPARK_CONFIG['PG_USER']) \ .option("password", config.SPARK_CONFIG['PG_PASSWORD'])\ .option("query", tx_query) \ .option("numPartitions", '10000') \ .load() # display_df(tx_df) # select priority columns, convert array columns, and zip vin and vout fields clean_df = tx_df.withColumn("vin_txid_arr", split(col("vin_txid"), ",\s*")) \ .withColumn("vin_vout_arr", split(col("vin_vout"), ",\s*")) \ .withColumn("vin_txid_vout_zip", arrays_zip("vin_txid_arr", "vin_vout_arr")) \ .withColumn("vout_value_arr", split(col("vout_value"), ",\s*")) \ .withColumn("vout_n_arr", split(col("vout_n"), ",\s*")) \ .withColumn("vout_addresses_arr", split(col("vout_addresses"), ",\s*")) \ .withColumn("vout_value_n_addr_zip", arrays_zip("vout_value_arr", "vout_n_arr", "vout_addresses_arr")) # display_df(clean_df) # # create left side DataFrame vin_cols = [ 'txid', 'height', 'time', 'ntx', 'vin_coinbase', 'vin_txid_vout_zip' ] vin_df = clean_df.select(vin_cols) \ .withColumn("vin_txid_vout_tup", explode("vin_txid_vout_zip")) \ .withColumn("vin_txid", col("vin_txid_vout_tup").vin_txid_arr) \ .withColumn("vin_vout", col("vin_txid_vout_tup").vin_vout_arr) \ .drop("vin_txid_vout_zip") \ .drop("vin_txid_vout_tup") \ .withColumn("left_key", concat(col("vin_txid"), lit("-"), col("vin_vout"))) # display_df(vin_df) # create right side DataFrame vout_cols = ['txid', 'vout_value_n_addr_zip'] vout_df = clean_df.select(vout_cols) \ .withColumn("vout_value_n_addr_tup", explode("vout_value_n_addr_zip")) \ .withColumn("vout_value", col("vout_value_n_addr_tup").vout_value_arr) \ .withColumn("vout_n", col("vout_value_n_addr_tup").vout_n_arr) \ .withColumn("vout_addr_pre", col("vout_value_n_addr_tup").vout_addresses_arr) \ .withColumn("vout_addr", translate(col("vout_addr_pre"), '[]', '')) \ .drop("vout_value_n_addr_zip") \ .drop("vout_value_n_addr_tup") \ .drop("vout_addr_pre") \ .withColumnRenamed("txid", "txid2") \ .withColumn("right_key", concat(col("txid2"), lit("-"), col("vout_n"))) \ .drop("txid2") # display_df(vout_df) # join DataFrames join_df = vin_df.join(vout_df, vin_df.left_key == vout_df.right_key, 'left') \ .drop("left_key") \ .drop("right_key") # display_df(join_df) # create temporary table for GraphFrames join_df.registerTempTable("join_result") # ---CREATING GRAPHFRAME FOR CONNECTED COMPONENTS ALGORITHM--- # create vertices DataFrame vertices = spark.sql( "SELECT DISTINCT(vout_addr) FROM join_result").withColumnRenamed( "vout_addr", "id") # generate DataFrame with single address connection for all addresses in a given txid group w = 
Window.partitionBy("txid").orderBy("vout_addr") first_by_txid_df = join_df.withColumn("rn", row_number().over(w)).where(col("rn") == 1) \ .withColumnRenamed("txid", "txid2") \ .withColumnRenamed("vout_addr", "vout_addr_first") \ .drop("rn") \ .drop("height") # first_by_txid_df.show(100) # join DataFrames interim_df = join_df.join(first_by_txid_df, join_df.txid == first_by_txid_df.txid2, 'left') # create edges DataFrame edges = interim_df.select("vout_addr", "vout_addr_first") \ .withColumnRenamed("vout_addr", "src") \ .withColumnRenamed("vout_addr_first", "dst") \ .na.drop() # create GraphFrame g = GraphFrame(vertices, edges) # set checkpoint directory in S3 sc.setCheckpointDir(config.SPARK_CONFIG['S3_CHECKPOINT']) # run connected components clst_result = g.connectedComponents() clst_result.show(100, truncate=False) # # ---FOR TESTING ONLY--- show result DataFrame for a specific block to verify correct results # clst_result.registerTempTable("clst_table") # view_df = spark.sql("SELECT * FROM clst_table ORDER BY clst_table.component") # view_df.show(1000, truncate=False) # write out to PostgreSQL write_clst_to_pg(clst_result)
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # thoi gian tu 01/10/2019 timestamp = 1569888000 # ETL TBHV ## Phonetic dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="learning_object" ) dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic') dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name']) # Lay ra ngu am df1 = dyf_phonemic.toDF() df1 = df1.select('learning_object_id', 'learning_object_name') # myArr = np.array(df1.select('phonemic').collect()) arrPhonetic = [row.learning_object_name for row in df1.collect()] arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()] # print('ARR:', arrPhonetic) # print('ARR1 :', (u'i:' in arrPhonetic)) lu_type = [] # check value for lu_id: valid = 1, invalid = 0 def doAddLuId(code): code = str(code) if code is None: return 0 if code not in lu_type: return 0 else: return 1 add_lu_id = udf(doAddLuId, IntegerType()) def doCheckLyType(plus, minus): if plus == 1: return plus if minus == 1: return minus return 0 check_lu_type = udf(doCheckLyType, IntegerType()) # Custom function def doAddScoreAll(plus, minus): if plus is None and minus is not None: return minus if minus is None and plus is not None: return plus if minus is not None and plus is not None: return plus + minus return 0 addScoreAll = udf(doAddScoreAll, IntegerType()) def do_get_phone_tic_id(phonetic): phonetic = phonetic.encode('utf-8', 'replace').strip() for x in arrPhoneticId: p = x[0].encode('utf-8', 'replace').strip() if p == phonetic: return x[1] get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType()) def do_check_null(val1, val2): if val1 is None and val2 is not None: return val2 if val2 is None and val1 is not None: return val1 if val1 is not None and val2 is not None: return val1 return 0 check_data_null = udf(do_check_null, StringType()) def doSplitWord(word): rs = [] if word is not None: i = 0 size = len(word) while i < size: s = word[i:i + 2] i += 2 if s in arrPhonetic: rs.append(s) if s not in arrPhonetic: i -= 2 s = word[i:i + 1] i += 1 if s in arrPhonetic: rs.append(s) return rs # print('test:', doSplitWord('abcacd')) splitWord = udf(lambda x: doSplitWord(x)) knowledge = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'], ['P02', 'Basic'], ['P03', 'sbasic'], ['P03', 'basic'], ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None], ['L02', None], ['L03', None], ['L04', None], ['L05', None]] comprehension = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'], ['P02', 'basic'], ['P03', None], ['P03', 'basic'], ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None], ['L02', None], ['L03', None], ['L04', None], ['L05', None]] application = [['L04', None], ['L04', None], ['L05', None]] analysis = [] synthesis = [] evaluation = [] state_gradedright = 'gradedright' def doAddScore(name, parName, state, type): arr = [] score = 0 if type == 'knowledge': arr = knowledge if type == 'comprehension': arr = comprehension if type == 'application': arr = application if type == 'analysis': arr = analysis if type == 'synthesis': arr = synthesis if type == 'evaluation': arr = evaluation if state is not None and state == state_gradedright: score = 2 else: score = -1 for x in arr: if x[0] is None and x[1] == parName: return score if x[0] == name and x[1] is None: return score if x[0] == name and x[1] is not None and x[1].lower() in parName.lower(): return score return 
0 addScore = udf(doAddScore, IntegerType()) # print('CHECK:', checkContains('ABCD EFHFF')) # chuoi ky tu can replace special_str = '["].' ########## top_question_attempts dyf_top_question_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_attempts" ) dyf_top_question_attempts = dyf_top_question_attempts.select_fields( ['_key', 'id', 'rightanswer', 'questionid', 'questionusageid', 'timemodified']) dyf_top_question_attempts = dyf_top_question_attempts.resolveChoice(specs=[('_key', 'cast:long')]) # try: # # # doc moc flag tu s3 # df_flag = spark.read.parquet("s3://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts") # start_read = df_flag.collect()[0]['flag'] # print('read from index: ', start_read) # # # so sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_top_question_attempts = Filter.apply(frame=dyf_top_question_attempts, f=lambda x: x['_key'] > start_read) # except: # print('read flag file error ') print('number of dyf_top_question_attempts: ', dyf_top_question_attempts.count()) if dyf_top_question_attempts.count() > 0: ########## dyf_top_user dyf_top_user = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="do_top_user" ) dyf_top_user = dyf_top_user.select_fields( ['id', 'student_id']).rename_field('id', 'top_user_id') ######### top_quiz_attempts dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_quiz_attempts" ) dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields( ['userid', 'uniqueid']) ######### top_question_attempt_steps dyf_top_question_attempt_steps = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_attempt_steps" ) dyf_top_question_attempt_steps = dyf_top_question_attempt_steps.select_fields( ['id', 'questionattemptid', 'state']).rename_field('id', 'steps_id') print dyf_top_question_attempts.count() dyf_top_question_attempts.show(2) dyf_top_question_attempts = Filter.apply(frame=dyf_top_question_attempts, f=lambda x: x["timemodified"] >= timestamp) print dyf_top_question_attempts.count() dyf_top_question_attempts.show() ######### top_question dyf_top_question = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question" ) dyf_top_question = dyf_top_question.select_fields( ['id', 'name', 'category']).rename_field('id', 'quest_id') # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_question_categories dyf_top_question_categories = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_categories" ) dyf_top_question_categories = dyf_top_question_categories.select_fields( ['id', 'name', 'parent']).rename_field('id', 'quest_cat_id') ######### dyf_top_question_categories_parent dyf_top_question_categories_parent = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question_categories" ) dyf_top_question_categories_parent = dyf_top_question_categories_parent.select_fields( ['id', 'name']).rename_field('id', 'par_id').rename_field('name', 'par_name') # print("COUNT dyf_top_question_attempts:", dyf_top_question_attempts.count()) # print("COUNT dyf_top_question:", dyf_top_question.count()) # print("COUNT dyf_top_question_attempt_steps:", dyf_top_question_attempt_steps.count()) # print("COUNT dyf_top_question_categories:", dyf_top_question_categories.count()) # dyf_top_question_attempt_steps = Filter.apply(frame=dyf_top_question_attempt_steps, 
f=lambda x: x["steps_id"]) # JOIN va FILTER cac bang theo dieu kien dyf_join = Join.apply(dyf_top_question_attempts, dyf_top_quiz_attempts, 'questionusageid', 'uniqueid') dyf_top_question_attempt_steps = Filter.apply(frame=dyf_top_question_attempt_steps, f=lambda x: x["state"] == state_gradedright) df_top_question_attempt_steps = dyf_top_question_attempt_steps.toDF() df_join = dyf_join.toDF() df_join01 = df_join.join(df_top_question_attempt_steps, (df_join['id'] == df_top_question_attempt_steps['questionattemptid']), 'left') dyf_join01 = DynamicFrame.fromDF(df_join01, glueContext, "dyf_join01") # dyf_join01 = Join.apply(dyf_top_question_attempt_steps, dyf_top_question_attempts, 'questionattemptid', 'id') # print("COUNT 1:", dyf_join01.count()) # dyf_join01.printSchema() dyf_join02 = Join.apply(dyf_join01, dyf_top_question, 'questionid', 'quest_id') # print("COUNT 2:", dyf_join02.count()) # dyf_join02.printSchema() dyf_join03 = Join.apply(dyf_join02, dyf_top_question_categories, 'category', 'quest_cat_id') dyf_join03 = Join.apply(dyf_join03, dyf_top_question_categories_parent, 'parent', 'par_id') dyf_join03 = Join.apply(dyf_join03, dyf_top_user, 'userid', 'top_user_id') # print("COUNT 3:", dyf_join03.count()) dyf_join03.printSchema() dyf_join03 = dyf_join03.select_fields( ['student_id', 'rightanswer', 'timemodified', 'state', 'name', 'parent', 'par_name']) # dyf_join03.printSchema() # dyf_join03.show() # dyf_right = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] == state_gradedright) # dyf_wrong = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] != state_gradedright) # dyf_join02.show() df_right = dyf_join03.toDF() df_right.cache() if (df_right.count() > 0): try: # print("COUNT 1:", df_right.count()) # Loc cac ky tu dac biet [ ] ", # Tach cau thanh array tu: # house, her => [house, her] df_right = df_right.withColumn("right_str", f.translate(df_right.rightanswer, special_str, '')) df_right = df_right.withColumn("right_arr", f.split(df_right.right_str, ' ')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_right = df_right.withColumn("right", f.explode(df_right.right_arr)) # print("COUNT 2:", df_right.count()) # df_right.printSchema() dyf_right = DynamicFrame.fromDF(df_right, glueContext, "dyf_right") ## Learning Object: loc lay dang tu vung de doc ngu am dyf_learning_object = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'vocabulary') dyf_learning_object = dyf_learning_object.select_fields( ['learning_object_id', 'learning_object_name', 'transcription']) df_learning_object = dyf_learning_object.toDF() # replace cac ky tu df_learning_object = df_learning_object.withColumn("phone_tic_new", f.translate(df_learning_object.transcription, '\',', '')) df_learning_object = df_learning_object.withColumn("phone_tic_tmp", splitWord(df_learning_object.phone_tic_new)) df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01", f.translate(df_learning_object.phone_tic_tmp, '[]', '')) df_learning_object = df_learning_object.withColumn("phone_tic_arr", f.split(df_learning_object.phone_tic_tmp_01, ',')) df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name', 'phone_tic_arr') dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object") dyf_knowledge_right = Join.apply(dyf_right, dyf_learning_object, 'right', 'learning_object_name') dyf_knowledge_right = dyf_knowledge_right.select_fields( ['student_id', 'learning_object_id', 'name', 'parent', 
'timemodified', 'par_name', 'state', 'phone_tic_arr']) # print("COUNT 3:", dyf_knowledge_right.count()) # dyf_knowledge_right.printSchema() # dyf_knowledge_right.show() # # print("COUNT 4:", dyf_knowledge_wrong.count()) # # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung df_knowledge_right = dyf_knowledge_right.toDF() df_knowledge_right = df_knowledge_right.withColumn("right_phonetic", f.explode(df_knowledge_right.phone_tic_arr)) df_knowledge_right = df_knowledge_right.select('student_id', 'name', 'timemodified', 'par_name', 'state', 'right_phonetic') df_knowledge_right = df_knowledge_right.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_right.right_phonetic)) # dyf_study_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_study_right") # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'right_phonetic', 'learning_object_name') # df_knowledge_right = dyf_phonemic_right.toDF() df_knowledge_right = df_knowledge_right.withColumn("knowledge", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit("knowledge"))) \ .withColumn("comprehension", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('synthesis'))) \ .withColumn("evaluation", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'], df_knowledge_right['state'], f.lit('evaluation'))) \ .withColumn("date_id", from_unixtime(df_knowledge_right['timemodified'], 'yyyyMMdd')) \ .withColumn("lo_type", f.lit(2)) # df_knowledge_right.printSchema() # df_knowledge_right.show() df_knowledge_right.cache() # History # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # dyf_knowledge_right = dyf_knowledge_right.resolveChoice(specs=[('lo_type', 'cast:int')]) # df_knowledge_right = dyf_knowledge_right.toDF() # chon cac truong va kieu du lieu day vao db # dyf_ai_history_plus = Filter.apply(frame=dyf_knowledge_right, f=lambda x: x['knowledge'] > 0) # # dyf_ai_history_minus = Filter.apply(frame=dyf_knowledge_right, f=lambda x: x['knowledge'] < 0) df_ai_history_plus = df_knowledge_right.where('knowledge > 0') df_ai_history_plus = df_ai_history_plus.groupby('student_id', 'learning_object_id', 'date_id').agg( f.count("student_id").alias("count_plus"), f.sum("knowledge").alias("knowledge_plus"), f.sum("comprehension").alias("comprehension_plus"), f.sum("application").alias("application_plus"), f.sum("analysis").alias("analysis_plus"), f.sum("synthesis").alias("synthesis_plus"), f.sum("evaluation").alias("evaluation_plus")) df_ai_history_plus = df_ai_history_plus.withColumnRenamed('student_id', 'student_id_plus') \ .withColumnRenamed('learning_object_id', 'learning_object_id_plus') \ .withColumnRenamed('date_id', 'date_id_plus') # .withColumnRenamed('lu_type', 'lu_type_plus') df_ai_history_plus = df_ai_history_plus.where('student_id_plus is not null') # dyf_ai_history_plus = DynamicFrame.fromDF(df_ai_history_plus, glueContext, "dyf_ai_history_plus") # # dyf_ai_history_plus = dyf_ai_history_plus.select_fields( # 
['date_id', 'student_id', 'learning_object_id', 'lo_type', 'knowledge_plus', 'comprehension_plus', # 'application_plus', 'analysis_plus', # 'synthesis_plus', 'evaluation_plus', 'count_plus']).rename_field('student_id', # 'student_id_plus').rename_field( # 'date_id', 'date_id_plus').rename_field('lo_type', 'lo_type_plus').rename_field('id', # 'learning_object_id_plus') df_ai_history_minus = df_knowledge_right.where('knowledge < 0') df_ai_history_minus = df_ai_history_minus.groupby('student_id', 'learning_object_id', 'date_id').agg( f.count("student_id").alias("count_minus"), f.sum("knowledge").alias("knowledge_minus"), f.sum("comprehension").alias("comprehension_minus"), f.sum("application").alias("application_minus"), f.sum("analysis").alias("analysis_minus"), f.sum("synthesis").alias("synthesis_minus"), f.sum("evaluation").alias("evaluation_minus")) df_ai_history_minus = df_ai_history_minus.withColumnRenamed('student_id', 'student_id_minus') \ .withColumnRenamed('learning_object_id', 'learning_object_id_minus') \ .withColumnRenamed('date_id', 'date_id_minus') # .withColumnRenamed('lu_type', 'lu_type_minus') df_ai_history_minus = df_ai_history_minus.where('student_id_minus is not null') print("AAAAAAAAAAAAAAAAAAAAAAAA") # dyf_ai_history_minus = DynamicFrame.fromDF(df_ai_history_minus, glueContext, "dyf_ai_history_plus") # dyf_ai_history_minus = dyf_ai_history_minus.select_fields( # ['date_id', 'student_id', 'id', 'lo_type', 'knowledge_minus', 'comprehension_minus', # 'application_minus', 'analysis_minus', 'synthesis_minus', 'evaluation_minus', # 'count_minus']).rename_field('student_id', 'user_id_minus').rename_field( # 'date_id', 'date_id_minus').rename_field('lo_type', 'lo_type_minus').rename_field('id', # 'learning_object_id_minus') # dyf_ai_history_minus.printSchema() # dyf_ai_history_minus.show(2) # dyf_ai_history_plus.printSchema() # dyf_ai_history_plus.show(2) print ("###########################################") # df_ai_history_minus = dyf_ai_history_minus.toDF() # df_ai_history_plus = dyf_ai_history_plus.toDF() df_join_history = df_ai_history_plus.join(df_ai_history_minus, ( df_ai_history_plus['student_id_plus'] == df_ai_history_minus['student_id_minus']) & (df_ai_history_plus['date_id_plus'] == df_ai_history_minus[ 'date_id_minus']) & (df_ai_history_plus['learning_object_id_plus'] == df_ai_history_minus['learning_object_id_minus']), 'outer') df_join_history = df_join_history.withColumn("created_date_id", check_data_null(df_join_history.date_id_plus, df_join_history.date_id_minus)) \ .withColumn("user_id", check_data_null(df_join_history.student_id_plus, df_join_history.student_id_minus)) \ .withColumn("source_system", f.lit("top_question_attempt_phonetic")) \ .withColumn("learning_object_id", check_data_null(df_join_history.learning_object_id_plus, df_join_history.learning_object_id_minus)) \ .withColumn("lu_id", f.lit(0)) # .withColumn("lu_id", check_lu_type(df_join_history.lu_type_plus, df_join_history.lu_type_minus)) join_history = DynamicFrame.fromDF(df_join_history, glueContext, 'join_history') # join_history.printSchema() # join_history.printSchema() ################ applymapping1 = ApplyMapping.apply(frame=join_history, mappings=[("user_id", 'string', 'student_id', 'long'), ("learning_object_id", "string", "learning_object_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # 
("evaluation", "int", "evaluation", "long"), ("knowledge_plus", "long", "knowledge_plus", "long"), ("comprehension_plus", "long", "comprehension_plus", "long"), ("application_plus", "long", "application_plus", "long"), ("analysis_plus", "long", "analysis_plus", "long"), ("synthesis_plus", "long", "synthesis_plus", "long"), ("evaluation_plus", "long", "evaluation_plus", "long"), ("knowledge_minus", "long", "knowledge_minus", "long"), ("comprehension_minus", "long", "comprehension_minus", "long"), ("application_minus", "long", "application_minus", "long"), ("analysis_minus", "long", "analysis_minus", "long"), ("synthesis_minus", "long", "synthesis_minus", "long"), ("evaluation_minus", "long", "evaluation_minus", "long"), ("count_plus", "long", "plus_number", "long"), ("count_minus", "long", "minus_number", "long"), # ("lo_type", "string", "lo_type", "long"), ("source_system", "string", "source_system", "string"), ("created_date_id", "string", "created_date_id", "long"), ("lu_id", "int", "lu_type", "long") # ("student_level", "string", "student_level", "string"), # ("advisor_id", "string", "advisor_id", "long"), # ("package_code", "string", "package_code", "string") ]) resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice1") dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields") print(dropnullfields1.count()) dropnullfields1.show(5) print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3", connection_options={ "path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/"}, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, catalog_connection="glue_redshift", connection_options={ "dbtable": "mapping_lo_student_history", "database": "dts_odin" }, redshift_tmp_dir="s3n://dts-odin/temp1/mapping_lo_student_history/", transformation_ctx="datasink5") # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # dyf_knowledge_right = dyf_knowledge_right.resolveChoice(specs=[('lo_type', 'cast:byte')]) # # df_knowledge_right = dyf_knowledge_right.toDF() # # chon cac truong va kieu du lieu day vao db # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right, # mappings=[("timemodified", "long", "timestart", "long"), # ("name", "string", "name", "string"), # ("par_name", "string", "par_name", "string"), # ("student_id", 'int', 'student_id', 'long'), # ("learning_object_id", "long", "learning_object_id", "int"), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), # ("phone_tic", "string", "phone_tic", "long"), # ("lo_type", "byte", "lo_type", "int")]) # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", # transformation_ctx="resolvechoice2") # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") # xoa cache df_right.unpersist() df_knowledge_right.unpersist() # df_knowledge_right.unpersist() # lay max _key tren datasource df_temp = dyf_top_question_attempts.toDF() flag = df_temp.agg({"_key": 
"max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de flag moi vao s3 df.write.parquet("s3a://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts", mode="overwrite") except Exception as e: print("###################### Exception ##########################") print(e)
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # thoi gian tu 01/10/2019 timestamp = 1569888000 ## Phonetic dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="learning_object" ) dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic') dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name']) # df_phonemic = dyf_phonemic.toDF() # df_phonemic = df_phonemic.withColumn('lo_name', convertedudf(df_phonemic.learning_object_name)) # df_phonemic.show() # Lay ra ngu am df1 = dyf_phonemic.toDF() df1 = df1.select('learning_object_id', 'learning_object_name') # myArr = np.array(df1.select('phonemic').collect()) arrPhonetic = [row.learning_object_name for row in df1.collect()] arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()] # print(unicode(arrPhonetic[2])) # print('ARR:', arrPhonetic) # print('ARR:', arrPhonetic[2].encode('utf-8', 'replace')) # print('ARR1 :', (u'i:' in arrPhonetic)) # ETL TBHV # Custom function def doAddScoreAll(plus, minus): if plus is None and minus is not None: return minus if minus is None and plus is not None: return plus if minus is not None and plus is not None: return plus + minus return 0 addScoreAll = udf(doAddScoreAll, IntegerType()) def do_get_phone_tic_id(phonetic): phonetic = phonetic.encode('utf-8', 'replace').strip() for x in arrPhoneticId: p = x[0].encode('utf-8', 'replace').strip() if p == phonetic: return x[1] get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType()) def do_check_null(val1, val2): if val1 is None and val2 is not None: return val2 if val2 is None and val1 is not None: return val1 if val1 is not None and val2 is not None: return val1 return 0 check_data_null = udf(do_check_null, StringType()) def doSplitWord(word): rs = [] if word is not None: i = 0 size = len(word) while i < size: s = word[i:i + 2] i += 2 if s in arrPhonetic: rs.append(s) if s not in arrPhonetic: i -= 2 s = word[i:i + 1] i += 1 if s in arrPhonetic: rs.append(s) return rs splitWord = udf(lambda x: doSplitWord(x)) state_right = 'state_right' state_wrong = 'state_wrong' # mac dinh duoc cong knowledge # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2 # knowledge = [] # cong diem comprehension: # Can list cac name duoc cong diem comprehension: # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 comprehension = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem application: # Can list cac name duoc cong diem application: # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 application = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem analysis: # Can list cac name duoc cong diem analysis # P2_D3; P3_D2; P4_D1; P4_D2 analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem synthesis: # Can list cac name duoc cong diem synthesis # P4_D1; P4_D2 synthesis = [] # cong diem evaluation: # Can list cac name duoc cong diem evaluation evaluation = [] def doAddScore(name, state, type): arr = [''] score = 0 if type == 'comprehension': arr = comprehension if type == 'application': arr = application if type == 'analysis': arr = analysis if type == 'synthesis': arr = synthesis name = name.lower() if state == state_right: score = 2 if state == state_wrong: score = -1 if name is not None: for x in arr: if x.lower() in name: return score 
return 0 addScore = udf(doAddScore, IntegerType()) # chuoi ky tu can replace special_str = '["] ;' ########## top_quiz_attempts dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_quiz_attempts" ) dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(['_key', 'id', 'timestart', 'quiz']) dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 'cast:long')]) # print dyf_top_quiz_attempts.count() # dyf_top_quiz_attempts.show(2) dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp) # print dyf_top_quiz_attempts.count() # dyf_top_quiz_attempts.show() # xu ly truong hop start_read is null # try: # # # doc moc flag tu s3 # df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai") # start_read = df_flag.collect()[0]['flag'] # print('read from index: ', start_read) # # # so sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read) # except: # print('read flag file error ') # print('the number of new contacts: ', dyf_top_quiz_attempts.count()) if dyf_top_quiz_attempts.count() > 0: ########## dyf_top_user dyf_top_user = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="do_top_user" ) dyf_top_user = dyf_top_user.select_fields( ['id', 'student_id']).rename_field('id', 'top_user_id') ######### top_question dyf_top_question = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question" ) dyf_top_question = dyf_top_question.select_fields( ['id', 'name']) # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_result_ai dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_result_ai" ) dyf_top_result_ai = dyf_top_result_ai.select_fields( ['question_id', 'attempt_id', 'user_id', 'ratio', 'right_word', 'wrong_word']) # JOIN va FILTER cac bang theo dieu kien dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question, 'question_id', 'id') dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts, 'attempt_id', 'id') dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["quiz"] not in [7, 9, 918]) dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id', 'top_user_id') # dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["student_id"] == 259442) # dyf_join02.show() df_study = dyf_join02.toDF() df_study.cache() if (df_study.count() > 0): try: # print("COUNT 1:", df_study.count()) # Loc cac ky tu dac biet [ ] " # Hien data co dang nhu sau: ["house","her","to","how","get","long"] hoac "environmental", ... 
# df_study = df_study.select( # 'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word, # special_str, ''), f.translate(df_study.wrong_word, # special_str, '')) df_study = df_study.select( 'quiz', 'name', 'student_id', 'timestart', 'right_word', 'wrong_word') df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \ .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, '')) # Tach cau thanh array tu: # house, her => [house, her] # PHan tich tu dung df_study_right = df_study.withColumn("right_word_list", f.split( df_study.right_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_right = df_study_right.withColumn("right", f.explode(df_study_right.right_word_list)) # convert to lowercase df_study_right = df_study_right.withColumn("right", f.lower(f.col("right"))) df_study_right = df_study_right.select('quiz', 'name', 'student_id', 'timestart', 'right') # print("COUNT 2:", df_study_right.count()) # df_study_right.printSchema() # df_study_right.show() dyf_study_right = DynamicFrame.fromDF(df_study_right, glueContext, "dyf_study_right") ## Learning Object # dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( # database="nvn_knowledge", # table_name="nvn_knowledge_learning_object" # ) dyf_learning_object = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'vocabulary') dyf_learning_object = dyf_learning_object.select_fields( ['learning_object_id', 'learning_object_name', 'transcription']) df_learning_object = dyf_learning_object.toDF() # convert to lowercase df_learning_object = df_learning_object.withColumn("learning_object_name", f.lower(f.col("learning_object_name"))) # replace cac ky tu df_learning_object = df_learning_object.withColumn("phone_tic_new", f.translate(df_learning_object.transcription, '\',', '')) df_learning_object = df_learning_object.withColumn("phone_tic_tmp", splitWord(df_learning_object.phone_tic_new)) df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01", f.translate(df_learning_object.phone_tic_tmp, '[]', '')) df_learning_object = df_learning_object.withColumn("phone_tic_arr", f.split(df_learning_object.phone_tic_tmp_01, ',')) df_learning_object = df_learning_object.withColumn("split_phonetic", f.explode(df_learning_object.phone_tic_arr)) df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name', 'split_phonetic') dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object") dyf_knowledge_right = Join.apply(dyf_study_right, dyf_learning_object, 'right', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_right.count()) # dyf_knowledge_right.printSchema() # 1 df_knowledge_right = dyf_knowledge_right.toDF() # df_knowledge_right = df_knowledge_right.withColumn("right_phonetic", # f.explode(df_knowledge_right.phone_tic_arr)) df_knowledge_right = df_knowledge_right.select('timestart', 'name', 'student_id', 'split_phonetic') df_knowledge_right = df_knowledge_right.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_right.split_phonetic)) # dyf_phonemic_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_phonemic_right") # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'split_phonetic', 'learning_object_name') # # dropnullfields = DropNullFields.apply(frame=dyf_phonemic_right, transformation_ctx="dropnullfields") # datasink6 = 
glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history_v06", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink6") # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung # df_knowledge_right = dyf_phonemic_right.toDF() # print("COUNT 4:") # df_knowledge_right.printSchema() df_knowledge_right.cache() df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(2)) \ .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd')) \ .withColumn("lo_type", f.lit(2)) dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # dropnullfields = DropNullFields.apply(frame=dyf_knowledge_right, transformation_ctx="dropnullfields") # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history_v02", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink6") # print("COUNT 444444444444444:", df_knowledge_right.count()) # df_knowledge_right.printSchema() # df_knowledge_right.show() # # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # # chon cac truong va kieu du lieu day vao db # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right, # mappings=[("timestart", "long", "timestart", "long"), # ("student_id", 'int', 'student_id', 'long'), # ("name", 'string', 'name', 'string'), # ("learning_object_id", "long", "learning_object_id", "long"), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), # ("lo_type", "int", "lo_type", "int")]) # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", # transformation_ctx="resolvechoice") # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") # # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "t_temp_right_learning_object_phonetic", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") # END Cong diem cac tu dung ################################################## # Tru diem cac tu sai: Xu lu tuong tu tu dung. 
# rule tru diem la -1 diem neu sai df_study_wrong = df_study.withColumn("wrong_word_list", f.split( df_study.wrong_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_wrong = df_study_wrong.withColumn("wrong", f.explode(df_study_wrong.wrong_word_list)) #convert to lowercase df_study_wrong = df_study_wrong.withColumn("wrong", f.lower(f.col("wrong"))) df_study_wrong = df_study_wrong.select('quiz', 'name', 'student_id', 'timestart', 'wrong') # print("COUNT 2222:", df_study_wrong.count()) # df_study_wrong.printSchema() # df_study_wrong.show() dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong, glueContext, "dyf_study_wrong") ## Learning Object dyf_knowledge_wrong = Join.apply(dyf_study_wrong, dyf_learning_object, 'wrong', 'learning_object_name') df_knowledge_wrong = dyf_knowledge_wrong.toDF() # df_knowledge_wrong = df_knowledge_wrong.withColumn("wrong_phonetic", # f.explode(df_knowledge_wrong.phone_tic_arr)) df_knowledge_wrong = df_knowledge_wrong.select('timestart', 'name', 'student_id', 'split_phonetic') df_knowledge_wrong = df_knowledge_wrong.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_wrong.split_phonetic)) # dyf_study_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_study_wrong") # dyf_phonemic_wrong = Join.apply(dyf_study_wrong, dyf_phonemic, 'split_phonetic', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # print("COUNT 4:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung # df_knowledge_wrong = dyf_phonemic_wrong.toDF() df_knowledge_wrong.cache() df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-1)) \ .withColumn("comprehension", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd')) # df_knowledge_wrong.printSchema() # df_knowledge_wrong.show() # # dyf_knowledge_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_knowledge_wrong") # # # chon cac truong va kieu du lieu day vao db # applymapping1 = ApplyMapping.apply(frame=dyf_knowledge_wrong, # mappings=[("timestart", "long", "timestart", "long"), # ("name", 'string', 'name', 'string'), # ("student_id", 'int', 'student_id', 'long'), # ("id", "int", "learning_object_id", 'long'), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long")]) # resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", # transformation_ctx="resolvechoice1") # dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1") # # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "t_temp_right_learning_object_phonetic", # "database": "dts_odin", # "postactions": 
""" call proc_knowledge_ngu_am_top_result_ai () """ # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") ### Luu bang mapping_lo_student_history df_knowledge_right = df_knowledge_right.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_plus"), f.sum('knowledge').alias("knowledge_plus"), f.sum('comprehension').alias("comprehension_plus"), f.sum('application').alias("application_plus"), f.sum('analysis').alias("analysis_plus"), f.sum('synthesis').alias("synthesis_plus"), f.sum('evaluation').alias("evaluation_plus")) df_knowledge_right = df_knowledge_right.where('student_id is not null') df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_minus"), f.sum('knowledge').alias("knowledge_minus"), f.sum('comprehension').alias("comprehension_minus"), f.sum('application').alias("application_minus"), f.sum('analysis').alias("analysis_minus"), f.sum('synthesis').alias("synthesis_minus"), f.sum('evaluation').alias("evaluation_minus")) \ .withColumnRenamed('student_id', 'student_id_wrong') \ .withColumnRenamed('date_id', 'date_id_wrong') \ .withColumnRenamed('learning_object_id', 'learning_object_id_wrong') df_knowledge_wrong = df_knowledge_wrong.where('student_id_wrong is not null') df_knowledge = df_knowledge_right.join(df_knowledge_wrong, ( df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong']) & ( df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong']) & ( df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']), 'outer') df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \ .withColumn("learning_object_id", check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \ .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \ .withColumn("source_system", f.lit('top_result_ai_phonetic')) \ .withColumn("lu_id", f.lit(0)) dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext, "df_knowledge") # dyf_knowledge.printSchema() dyf_knowledge.printSchema() dyf_knowledge.show() # dyf_knowledge = DynamicFrame.fromDF(dyf_knowledge, glueContext, "dyf_knowledge") # chon cac truong va kieu du lieu day vao db applymapping = ApplyMapping.apply(frame=dyf_knowledge, mappings=[("user_id", 'string', 'student_id', 'long'), ("learning_object_id", "string", "learning_object_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), ("knowledge_plus", "long", "knowledge_plus", "long"), ("comprehension_plus", "long", "comprehension_plus", "long"), ("application_plus", "long", "application_plus", "long"), ("analysis_plus", "long", "analysis_plus", "long"), ("synthesis_plus", "long", "synthesis_plus", "long"), ("evaluation_plus", "long", "evaluation_plus", "long"), ("knowledge_minus", "long", "knowledge_minus", "long"), ("comprehension_minus", "long", "comprehension_minus", "long"), ("application_minus", "long", "application_minus", "long"), ("analysis_minus", "long", "analysis_minus", "long"), ("synthesis_minus", "long", "synthesis_minus", "long"), ("evaluation_minus", "long", "evaluation_minus", "long"), ("count_plus", "long", 
"plus_number", "long"), ("count_minus", "long", "minus_number", "long"), # ("lo_type", "string", "lo_type", "long"), ("source_system", "string", "source_system", "string"), ("created_date_id", "string", "created_date_id", "long"), ("lu_id", "int", "lu_type", "long") # ("student_level", "string", "student_level", "string"), # ("advisor_id", "string", "advisor_id", "long"), # ("package_code", "string", "package_code", "string") ]) resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice") dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3", connection_options={ "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/", "partitionKeys": ["created_date_id", "source_system"]}, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink5") ### END Luu bang mapping_lo_student_history # END Tru diem cac tu sai # lay max _key tren datasource datasource = dyf_top_quiz_attempts.toDF() flag = datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de flag moi vao s3 df.write.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai", mode="overwrite") # xoa cache df_study.unpersist() df_knowledge_right.unpersist() # df_knowledge_right.unpersist() except Exception as e: print("###################### Exception ##########################") print(e)
distinct = lambda c: F.countDistinct(c) count = F.count sum = F.sum sum_pos = lambda c: F.sum(F.when(c>0, c)) sum_neg = lambda c: F.sum(F.when(c<0, c)) min = F.min max = F.max avg = F.avg stddev = F.stddev skewness = F.skewness kurtosis = F.kurtosis digits_only = lambda c: F.sum((F.length(F.translate(c, '0123456789', ''))<F.length(c)).cast('int')) spaces_only = lambda c: F.sum(((F.length(F.translate(c, ' \t', ''))==0) & (F.length(c)>0)).cast('int')) all = { 'type': typeof(), 'integer': integer, 'boolean': boolean, 'top3': topn(), 'percentiles': percentiles(), 'null': null, 'zero': zero, 'empty': empty, 'pos': pos, 'neg':neg, 'distinct': distinct, 'sum':sum,
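# --- Hedged usage sketch for the column profilers above ---
# The lambdas above map a column to an aggregate expression, e.g. digits_only counts rows whose
# value contains at least one digit, and spaces_only counts non-empty values made up only of
# spaces/tabs. A minimal usage example, with an assumed DataFrame and column name "code":
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("profiler_sketch").getOrCreate()
df = spark.createDataFrame([("a1",), ("   ",), ("abc",)], ["code"])

df.agg(
    F.count(F.col("code")).alias("count"),
    # same expression as digits_only above: dropping digits shortens the string
    F.sum((F.length(F.translate(F.col("code"), "0123456789", "")) < F.length(F.col("code"))).cast("int")).alias("has_digits"),
    # same expression as spaces_only above: non-empty, but empty once spaces/tabs are removed
    F.sum(((F.length(F.translate(F.col("code"), " \t", "")) == 0) & (F.length(F.col("code")) > 0)).cast("int")).alias("spaces_only"),
).show()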
def main(argv): mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf( "SC_PHYS_PAGES") # e.g. 4015976448 mem_gib = int((mem_bytes / (1024.0**3)) * 0.9) tar_jar = os.path.join(find_runfiles(), "__main__/galvasr2/spark/tar_spark_datasource.jar") spark = (pyspark.sql.SparkSession.builder.master( f"local[{os.cpu_count() - 1}]").config( "spark.eventLog.enabled", "true").config("spark.eventLog.dir", "/spark-events").config( "spark.sql.execution.arrow.pyspark.enabled", "true").config( "spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true", ).config( "spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true", ).config("spark.driver.memory", f"{mem_gib}g").config( "spark.history.fs.logDirectory", "/spark-events").config( "spark.sql.execution.arrow.maxRecordsPerBatch", "1").config("spark.jars", tar_jar).config( "spark.local.dir", "/mnt/disks/spark-scratch/").getOrCreate()) spark.sparkContext.setLogLevel("INFO") # "ALL" for very verbose logging logging.getLogger("py4j").setLevel(logging.ERROR) catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue) _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue) licenseurl_df = licenseurl_df.select( [F.col("identifier"), F.col("text_document_id"), F.col("licenseurl")]) # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_" catalogue_df = catalogue_df.withColumn( "kaldi_normalized_uttid", F.concat_ws( "-", F.translate(catalogue_df.identifier, " /", "__"), F.translate(catalogue_df.audio_document_id, " /", "__"), ), ) # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv") if not FLAGS.work_dir.startswith("gs://"): os.makedirs(FLAGS.work_dir, exist_ok=True) wav_scp = os.path.join(FLAGS.work_dir, "wav.scp") ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir") if FLAGS.stage <= 0: catalogue_df = catalogue_df.cache() # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping) training_sample_rows = catalogue_df.collect() catalogue_df.unpersist() with TemporaryMountDirectory( mount_cmd=[ "gcsfuse", "--implicit-dirs", FLAGS.input_gcs_bucket.lstrip("gs://"), ], unmount_cmd=["fusermount", "-u"], ) as temp_dir_name: posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp) create_wav_scp(posix_wav_scp, training_sample_rows, FLAGS.input_dir, ctm_out_dir) # /development/lingvo-source/output_ctm_dir/ # nvprof --analysis-metrics -o decoder-analysis.nvprof \ # We want only the best path, so we set lattice-beam to 0.1 # --main-q-capacity=35000 \ # Can get 266x RTF with this configuration. Keep it? 
# bath size of 100 and num channels of 100 works just fine if FLAGS.stage <= 1: if not FLAGS.work_dir.startswith("gs://"): os.makedirs(ctm_out_dir, exist_ok=True) with TemporaryMountDirectory( mount_cmd=[ "gcsfuse", "--implicit-dirs", FLAGS.input_gcs_bucket.lstrip("gs://"), ], unmount_cmd=["fusermount", "-u"], ) as temp_dir_name: posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, ctm_out_dir) posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp) posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, FLAGS.work_dir) num_gpus = 4 posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir, num_gpus) executor = ThreadPoolExecutor(max_workers=num_gpus) def run_gpu(posix_wav_scp_shard, gpu_number): cmd = f"""\ /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \ --frame-subsampling-factor=3 \ --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \ --max-active=7000 \ --beam=15.0 \ --lattice-beam=0.1 \ --acoustic-scale=1.0 \ --cuda-decoder-copy-threads=2 \ --cuda-worker-threads={os.cpu_count() // num_gpus} \ --segmentation=true \ --cuda-use-tensor-cores=true \ --max-batch-size=150 \ --num-channels=250 \ --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \ --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \ /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \ /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \ scp,p:{posix_wav_scp_shard} \ {posix_ctm_out_dir} """ env = deepcopy(os.environ) env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}" subprocess.check_call(shlex.split(cmd), env=env) for i, shard in enumerate(posix_wav_scp_shards): executor.submit(run_gpu, shard, i) executor.shutdown(wait=True) alignments_dir = os.path.join(FLAGS.alignments_work_dir, "alignments_json_jul_28") if FLAGS.stage <= 2: # TODO: Add options to DSAlign here dsalign_args = dsalign_main.parse_args( ["--output-wer", "--output-cer"]) # , "--output-sws", "--output-levenshtein"]) alphabet_normalized_path = ( "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt") align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path, 15_000, 3_000) ctm_df = (spark.read.format("binaryFile").option( "pathGlobFilter", "*.ctm").load(ctm_out_dir)) ctm_df = ctm_df.withColumn( "kaldi_normalized_uttid", F.regexp_replace( F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""), ) ctm_df = ctm_df.withColumn("ctm_content", fix_text_udf(F.col("content"))).drop( "path", "length", "modificationTime", "content") ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid") downsampled_catalogue_df = ctm_df.drop("ctm_content") training_sample_rows = downsampled_catalogue_df.collect() transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path, training_sample_rows) transcripts_df = transcripts_df.withColumn( "transcript", normalize_english_text_udf(transcripts_df.transcript)) ctm_df = ctm_df.join(transcripts_df, ["identifier", "text_document_id"]) ctm_df = ctm_df.repartition(960) # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id), # F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id), # ctm_df.transcript, ctm_df.ctm_content)) alignments_df = ctm_df.withColumn( "alignments", align_udf( F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id), F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id), ctm_df.transcript, ctm_df.ctm_content, ), 
).drop("ctm_content") print("GALVEZ:schema") alignments_df.printSchema() sys.stdout.flush() alignments_df.write.mode("overwrite").format("json").save( alignments_dir) manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest") tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars") if FLAGS.stage <= 3: duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json" duplicates_df = spark.read.format("json").load(duplicate_data_path) alignments_df = spark.read.json(alignments_dir) alignments_df = alignments_df.join( duplicates_df, on=(alignments_df.identifier == duplicates_df.identifier) & (alignments_df.text_document_id == duplicates_df.text_document_id), how="anti", ) if FLAGS.license_filter == "": pass else: if FLAGS.license_filter == "Not CC-BY-SA": filtered_licenseurl_df = licenseurl_df.filter( ~is_cc_by_sa(F.col("licenseurl"))) elif FLAGS.license_filter == "CC-BY-SA": filtered_licenseurl_df = licenseurl_df.filter( is_cc_by_sa(F.col("licenseurl"))) else: raise Exception("Unknown license_filter provided.") filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl") alignments_df = alignments_df.join( filtered_licenseurl_df, on=(alignments_df.identifier == filtered_licenseurl_df.identifier) & (alignments_df.text_document_id == filtered_licenseurl_df.text_document_id), how="inner", ) alignments_df = alignments_df.drop( filtered_licenseurl_df.identifier).drop( filtered_licenseurl_df.text_document_id) # We would like the number of partitions to be some large multiple # of the number of executors. Not every audio file is the same # length, so this helps with load balancing. alignments_df = alignments_df.withColumn( "duration_ms", F.expr( "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)" ), ) alignments_df = alignments_df.withColumn( "alignments", F.arrays_zip( alignments_df.alignments.cer, alignments_df.alignments.end_ms, alignments_df.alignments.label, alignments_df.alignments.start_ms, alignments_df.alignments.wer, alignments_df.duration_ms, ).cast( T.ArrayType( T.StructType([ T.StructField("cer", T.FloatType()), T.StructField("end_ms", T.LongType()), T.StructField("label", T.StringType()), T.StructField("start_ms", T.LongType()), T.StructField("wer", T.FloatType()), T.StructField("duration_ms", T.LongType()), ]))), ) alignments_df = alignments_df.drop("duration_ms") alignments_df = alignments_df.withColumn( "alignments", F.filter( alignments_df.alignments, # Need to select this filter such that total number of # hours is 31,400 lambda alignment: (alignment.duration_ms < FLAGS.max_duration_ms) & (alignment.duration_ms >= FLAGS.min_duration_ms) & (alignment.cer < FLAGS.max_cer) & (alignment.cer >= FLAGS.min_cer), ), ) alignments_df = alignments_df.withColumn( "alignments", F.struct( alignments_df.alignments.cer, alignments_df.alignments.end_ms, alignments_df.alignments.label, alignments_df.alignments.start_ms, alignments_df.alignments.wer, alignments_df.alignments.duration_ms, ).cast( T.StructType([ T.StructField("cer", T.ArrayType(T.FloatType())), T.StructField("end_ms", T.ArrayType(T.LongType())), T.StructField("label", T.ArrayType(T.StringType())), T.StructField("start_ms", T.ArrayType(T.LongType())), T.StructField("wer", T.ArrayType(T.FloatType())), T.StructField("duration_ms", T.ArrayType(T.LongType())), ])), ) alignments_df = alignments_df.repartition(960) abc = alignments_df.select( F.sum( F.expr( "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)" )) / 1000.0 / 
60.0 / 60.0).collect() print("GALVEZ:total number of hours=", abc) sys.stdout.flush() alignments_df = alignments_df.select( alignments_df.identifier, alignments_df.audio_document_id, alignments_df.text_document_id, alignments_df.alignments, ) alignments_df = F.broadcast(alignments_df) audio_paths = F.concat( F.lit(FLAGS.input_gcs_path), F.lit("/"), F.col("identifier"), F.lit("/"), F.col("audio_document_id"), ) rows = alignments_df.select(audio_paths).collect() paths = [row[0] for row in rows] # [:1] # GALVEZ: WARNING test! # print(f"number of paths = {len(paths)}") audio_df = (spark.read.format("binaryFile").load(paths).drop( "modificationTime", "length")) alignments_audio_df = alignments_df.join(audio_df, audio_paths == audio_df.path) # from IPython import embed; embed() # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory # Remove "." because it has special meaning in webdataset format. # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient) name = F.concat(F.col("identifier"), F.lit("/"), F.col("audio_document_id")) # name = F.regexp_replace(name, r"/", "_SLASH_") name = F.regexp_replace(name, r"\.", "_DOT_") name = F.regexp_replace(name, r" ", "_SPACE_") # glob.glob("**/*.flac") # Sanity check: tar member names must stay within path-length limits name_rows = alignments_audio_df.select(name.alias("name")).collect() for row in name_rows: assert len(row.name) < 4096 for chunk in row.name.split("/"): assert len(chunk) < 256 # name = F.regexp_replace(F.concat(F.col("identifier"), # F.lit("-"), # F.col("audio_document_id")), # r"(\.|/)", # "_" # ) # The name of each thing in the tar file. May not exceed 100 characters in length # substr indexes from 1! # name = name.substr( # F.length(name) - F.least(F.length(name), F.lit(88)) + 1, # F.least(F.length(name), F.lit(88)) # ) alignments_audio_df = alignments_audio_df.withColumn( "aligned_chunks", create_audio_segments_udf( alignments_audio_df.content, F.lit("mp3"), name, alignments_audio_df.alignments.start_ms, alignments_audio_df.alignments.end_ms, F.lit("flac"), ), ) a = alignments_audio_df.select( F.explode( F.arrays_zip("aligned_chunks.audio_name", "aligned_chunks.audio"))).select( "col.0", "col.1") a.write.mode("overwrite").format("tar").save(tars_dir) output_df = alignments_audio_df.select( alignments_audio_df.identifier, alignments_audio_df.audio_document_id, alignments_audio_df.text_document_id, F.struct( alignments_audio_df.alignments.label.alias("label"), create_audio_segment_names_udf( # Is F.size right here? 
name, F.size(alignments_audio_df.alignments.start_ms), F.lit("flac"), ).alias("name"), alignments_audio_df.alignments.duration_ms.alias( "duration_ms"), ).alias("training_data"), ) output_df = output_df.coalesce(960) # coalesce(1) seems to make the create_audio_segments_udf function run serially output_df.write.mode("overwrite").json(manifest_dir) repartitioned_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tars") tmp_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tmp_dir") if FLAGS.stage <= 4: tars_df = spark.read.format("tar").load(tars_dir) # .limit(100) number_of_rows = tars_df.count() spark2 = spark.newSession() spark2.conf.set( "spark.sql.execution.rangeExchange.sampleSizePerPartition", number_of_rows) spark2.conf.set("spark.sql.files.minPartitionNum", FLAGS.number_of_shards) # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100) # print("GALVEZ:", tars_df.select(F.col("key")).collect()) # import sys; sys.exit() tars_df = spark2.read.format("tar").load(tars_dir) # .limit(100) tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards, F.col("key")) # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs. # # tars_df = tars_df.persist() tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir) tars_df = spark2.read.format("tar").load( tmp_tars_dir) # .repartitionByRange() # coalesce(1024) # counts_df = ( # tars_df.withColumn("partitionId", F.spark_partition_id()) # .groupBy("partitionId") # .count() # ) # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0] # # Consider doing this in java # def drop_final_rows(rows): # for _ in range(num_rows_to_keep): # yield next(rows) # for _ in rows: # pass # return # print("GALVEZ:before=", tars_df.rdd.getNumPartitions()) # # , preservesPartitioning=True # tars_df = spark2.createDataFrame( # tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema # ) # print("GALVEZ:after=", tars_df.rdd.getNumPartitions()) # import sys # sys.stdout.flush() # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode. 
# tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir) # manifest_df = spark2.read.json(manifest_dir) # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count() # print(f"GALVEZ:number_of_utterances={number_of_utterances}") # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard) nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo") nemo_single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo_single") if FLAGS.stage <= 5: json_df = spark.read.format("json").load(manifest_dir) nemo_df = json_df.select( F.explode( F.arrays_zip( F.col("training_data.name").alias("audio_filepath"), F.col("training_data.label").alias("text"), F.col("training_data.duration_ms").alias("duration_ms"), ))) nemo_df = nemo_df.select( F.col("col.name").alias("audio_filepath"), F.col("col.label").alias("text"), (F.col("col.duration_ms").cast(T.DoubleType()) / 1000.0).alias("duration"), F.lit(-1).alias("shard_id"), ) if False: tars_df = spark.read.format("tar").load(repartitioned_tars_dir) tars_df = tars_df.select(tars_df.key) nemo_df = F.broadcast(nemo_df) nemo_df = nemo_df.join( tars_df, F.col("audio_filepath") == F.col("key")).drop(F.col("key")) # TODO: Join against tar files that have been made to contain the # same number of files to filter out removed files nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir) nemo_single_df = spark.read.format("json").load(nemo_manifest_dir) nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save( nemo_single_manifest_dir) single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_single") single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single") # Create single tar file and single json file if FLAGS.stage <= 6: json_df = spark.read.format("json").load(manifest_dir) json_df.coalesce(1).write.format("json").mode("overwrite").save( single_manifest_dir) tars_df = spark.read.format("tar").load(tmp_tars_dir) tars_df.coalesce(1).write.format("tar").mode("overwrite").save( single_tar_dir)
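# --- Hedged sketch: the wav.scp key normalization used in the pipeline above ---
# Kaldi's wav.scp keys may not contain spaces, and the pipeline also maps "/" to "_" because
# each transcript is written to "{kaldi_normalized_uttid}.ctm". The snippet below isolates that
# F.translate-based normalization; the column names match the pipeline, but the sample data is
# invented for illustration.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("kaldi_key_sketch").getOrCreate()
df = spark.createDataFrame(
    [("some id/with spaces", "audio file.mp3")],
    ["identifier", "audio_document_id"])

df = df.withColumn(
    "kaldi_normalized_uttid",
    F.concat_ws(
        "-",
        F.translate(F.col("identifier"), " /", "__"),        # space -> _, / -> _
        F.translate(F.col("audio_document_id"), " /", "__"),
    ))
df.show(truncate=False)  # e.g. some_id_with_spaces-audio_file.mp3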
# Use the regexp_replace function to substitute color names in our Description column: from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( col("Description"), regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean")).show(2, False) # COMMAND ---------- # Another task might be to replace given characters with other characters. Spark provides the translate function to do this. from pyspark.sql.functions import translate df.select(col("Description"), translate(col("Description"), "LEET", "1337")).show(2) # COMMAND ---------- # Using regexp_extract we can pull the matching strings out of the column values. from pyspark.sql.functions import regexp_extract extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)" df.select( regexp_extract(col("Description"), extract_str, 1).alias("color_clean"), col("Description")).show(2, False) # COMMAND ---------- # To simply check for the existence of a string in a column value, use the instr function, as shown below.
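# Hedged example for the comment above: instr returns the 1-based position of the substring,
# so a position >= 1 means the string is present. It assumes the same df / Description column
# as the snippets above.
from pyspark.sql.functions import instr, col

contains_black = instr(col("Description"), "BLACK") >= 1
df.withColumn("hasBlack", contains_black) \
  .where("hasBlack") \
  .select("Description") \
  .show(3, False)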
rpad(lit("HELLO"), 10, " ").alias("rp")).show(2) # COMMAND ---------- from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) # COMMAND ---------- from pyspark.sql.functions import translate df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\ .show(2) # COMMAND ---------- from pyspark.sql.functions import regexp_extract extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)" df.select( regexp_extract(col("Description"), extract_str, 1).alias("color_clean"), col("Description")).show(2) # COMMAND ---------- from pyspark.sql.functions import instr
from pyspark.sql import SparkSession from pyspark.sql.functions import regexp_replace,col,translate,regexp_extract,instr spark = SparkSession.builder.appName("Pyspark example").getOrCreate() df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("C:/Users/Lenovo/Desktop/spark_data/retail_store.csv") #'regexp_replace' is used to replace the matched color names with NOCOLOR str1="BLACK|WHITE|RED|BLUE|GREEN" df.select(regexp_replace(col("Description"),str1,"NOCOLOR").alias("no_color_column"),col("Description")).show(5) #'translate' replaces individual characters with other characters (here A->1, B->2, C->3, D->4) df.select(translate(col("Description"),"ABCD","1234"),col("Description")).show(5) #'regexp_extract' is used to extract the matching substring df.select(regexp_extract(col("Description"),str1,0).alias("color"),col("Description")).show(5) #'instr' checks for the existence of a value containsRed = instr(col("Description"),"RED")>=1 containsWhite = instr(col("Description"),"WHITE")>=1 df.withColumn("hasColor",containsWhite | containsRed).where("hasColor").select("Description").show(5)
rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lpad"), rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2) ##정규 표현식 #description컬럼의 값을 COLOR 값으로 치환 from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) #주어진 문자를 다른 문자로 치환 from pyspark.sql.functions import translate df.select(translate(col("Description"), "WHI", "123")).show(2) #color name 추출 from pyspark.sql.functions import regexp_extract extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)" df.select( regexp_extract(col("Description"), extract_str, 1).alias("color_clean")).show(6) #data의 존재여부 확인 #instr from pyspark.sql.functions import instr containBlack = instr(col("Description"), "BLACK") >= 1 df.withColumn("HasSimpleColor",containBlack)\ .where("HasSimpleColor")\
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate the top 25 movie recommendations for each user
userRecs = model.recommendForAllUsers(25)
userRecs_1 = userRecs.select(userRecs['userId'],
                             userRecs['recommendations.movieId'].cast("string").alias('movies'))
# Split the stringified recommendation array into 25 movieid columns
split_movies = F.split(userRecs_1.movies, ',')
userRecs_2 = userRecs_1.select(
    userRecs_1['userId'],
    *[split_movies.getItem(i).alias('movieid{}'.format(i + 1)) for i in range(25)])
# Strip the array brackets left over from the string cast
userRecs_2 = userRecs_2.withColumn('movieid1', F.translate('movieid1', '[', ''))
userRecs_2 = userRecs_2.withColumn('movieid25', F.translate('movieid25', ']', ''))
#userRecs_2.printSchema()
#userRecs_2.show()

# Import dataframe into MySQL
#userRecs_2.write.format('jdbc').options(url='jdbc:mysql://us-cdbr-iron-east-05.cleardb.net/heroku_54c3b520208a1ef?useServerPrepStmts=false&rewriteBatchedStatements=true', driver='com.mysql.jdbc.Driver',dbtable='collab_reco',user='******',password='******').mode('append').save()
spark.stop()
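# An alternative sketch (not the approach used above): index the recommendations array
# directly instead of casting it to a string, splitting, and stripping brackets with
# translate. Assumes the userRecs DataFrame and the functions alias F from the snippet
# above; the movieid1..movieid25 column names are the same illustrative names.
userRecs_direct = userRecs.select(
    userRecs['userId'],
    *[F.col('recommendations').getItem(i).getField('movieId').alias('movieid{}'.format(i + 1))
      for i in range(25)])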
def __init__(self, spark):
    self.spark = spark
    df_raw = spark.read.format("csv").option("delimiter", ",") \
        .option("quote", "\"").option("escape", "\"") \
        .option("header", "true").option("inferSchema", "true") \
        .load("datasetfinaltotal.csv")
    df_raw1 = df_raw.dropna(how='any')
    df_raw1.show()

    # Disease-suggestion model: map symptoms (trieuchung) extracted from the
    # visit reason (lydo) to diagnoses (chandoan)
    df_raw2 = df_raw1.select('lydo', 'chandoan', translate(col('lydo'), ".;", ",,").alias('trieuchung'))
    df_raw2 = df_raw2.select("trieuchung", "chandoan").distinct()
    df_raw2 = df_raw2.withColumn('trieuchung', explode(split('trieuchung', ',')))
    df_raw3 = df_raw2.select('trieuchung', 'chandoan').distinct()
    df_raw3 = df_raw3.withColumn('trieuchung', trim(col('trieuchung')))
    df_raw3 = df_raw3.select("trieuchung", "chandoan").distinct()
    df_raw3 = df_raw3.filter(col('trieuchung') != "")
    df_raw3 = df_raw3.filter(length(regexp_replace("trieuchung", " ", " ")) > 2)
    df_raw3.show()
    pddataframe = df_raw3.toPandas()
    dfpd = pd.crosstab(pddataframe['trieuchung'], pddataframe['chandoan'])
    flattened = pd.DataFrame(dfpd.to_records())
    flattend1 = flattened.melt(id_vars=["trieuchung"], var_name="chandoan", value_name="rating")
    df_final = spark.createDataFrame(flattend1)
    self.df_final = df_final
    userIndexer = StringIndexer(inputCol='trieuchung', outputCol='trieuchungIndex').fit(df_final)
    itemIndexer = StringIndexer(inputCol='chandoan', outputCol='chandoanIndex').fit(df_final)
    pipeline = Pipeline(stages=[userIndexer, itemIndexer])
    df_testfinal = pipeline.fit(df_final).transform(df_final)
    df_testfinal.show()
    self.df_testfinal = df_testfinal
    train, test = df_testfinal.randomSplit([0.8, 0.2])
    self.train = train
    self.test = test
    self.__trainmodelgoiybenh()
    userRecs = self.model.recommendForAllUsers(10)
    flatUserRecs = userRecs.withColumn("trieuchungandrating", explode(userRecs.recommendations)).select('trieuchungIndex', 'trieuchungandrating.*')
    userIndexer = StringIndexer(inputCol='trieuchung', outputCol='trieuchungIndex').fit(self.df_final)
    itemIndexer = StringIndexer(inputCol='chandoan', outputCol='chandoanIndex').fit(self.df_final)
    itemConverter = IndexToString(inputCol='chandoanIndex', outputCol='chandoan', labels=itemIndexer.labels)
    userConverter = IndexToString(inputCol='trieuchungIndex', outputCol='trieuchung', labels=userIndexer.labels)
    convertedUserRec = Pipeline(stages=[userConverter, itemConverter]).fit(self.df_testfinal).transform(flatUserRecs)
    self.convertedUserRec = convertedUserRec

    # Drug-suggestion model: map diagnoses (chandoan) to drug names (tenhh)
    df_goiythuoc = df_raw1.select('chandoan', 'tenhh').distinct()
    df_goiythuoc.show()
    pddataframegoiythuoc = df_goiythuoc.toPandas()
    dfpdgoiythuoc = pd.crosstab(pddataframegoiythuoc['chandoan'], pddataframegoiythuoc['tenhh'])
    flattenedgoiythuoc = pd.DataFrame(dfpdgoiythuoc.to_records())
    flattendgoiythuoc1 = flattenedgoiythuoc.melt(id_vars=["chandoan"], var_name="tenhh", value_name="rating")
    df_finalgoiythuoc = spark.createDataFrame(flattendgoiythuoc1)
    userIndexergoiythuoc = StringIndexer(inputCol='chandoan', outputCol='chandoanIndex').fit(df_finalgoiythuoc)
    itemIndexergoiythuoc = StringIndexer(inputCol='tenhh', outputCol='tenhhIndex').fit(df_finalgoiythuoc)
    pipeline = Pipeline(stages=[userIndexergoiythuoc, itemIndexergoiythuoc])
    df_testfinalgoiythuoc = pipeline.fit(df_finalgoiythuoc).transform(df_finalgoiythuoc)
    traingoiythuoc, testgoiythuoc = df_testfinalgoiythuoc.randomSplit([0.8, 0.2])
    self.traingoiythuoc = traingoiythuoc
    self.testgoiythuoc = testgoiythuoc
    self.__trainmodelgoiythuoc()
    userRecsgoiythuoc = self.modelgoiythuoc.recommendForAllUsers(20)
    flatUserRecsgoiythuoc = \
        userRecsgoiythuoc.withColumn("chuandoanandrating", explode(userRecsgoiythuoc.recommendations)).select('chandoanIndex', 'chuandoanandrating.*')
    userConvertergoiythuoc = IndexToString(inputCol='chandoanIndex', outputCol='chandoan', labels=userIndexergoiythuoc.labels)
    itemConvertergoiythuoc = IndexToString(inputCol='tenhhIndex', outputCol='tenhh', labels=itemIndexergoiythuoc.labels)
    convertedUserRecgoiythuoc = Pipeline(stages=[userConvertergoiythuoc, itemConvertergoiythuoc]).fit(df_testfinalgoiythuoc).transform(flatUserRecsgoiythuoc)
    self.convertedUserRecgoiythuoc = convertedUserRecgoiythuoc
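# A hedged alternative sketch for building the (trieuchung, chandoan, rating) table without
# the pandas crosstab/melt round-trip, reusing df_raw3 from __init__ above. Because df_raw3
# already holds distinct pairs, the nonzero entries of the crosstab are simply rating = 1.
# Note this drops the explicit zero rows that pd.crosstab + melt would produce, so it is
# not a drop-in replacement if those zeros matter to the ALS model.
from pyspark.sql.functions import lit

df_ratings = df_raw3.select('trieuchung', 'chandoan').withColumn('rating', lit(1))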
def text_clustering(dataFrame, k_value, w2v=False, w2v_value=None, seed=2137, normalize=True, plot=True):
    """
    args:
    -dataFrame: Spark DataFrame with a "text" column
    -k_value: number of clusters in the k-means algorithm
    -w2v: if True, Word2Vec is used and w2v_value must be specified; otherwise tf-idf is used
    -w2v_value: size of the vectors returned by Word2Vec
    -seed: random seed
    -normalize: whether normalization should be performed after Word2Vec
    -plot: if True, clusters are visualized with the use of PCA
    """
    # Data preprocessing
    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    dataFrame = tokenizer.transform(dataFrame)
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    dataFrame = remover.transform(dataFrame)

    if w2v and w2v_value is None:
        raise ValueError('You have to give the w2v_value parameter')

    if not w2v:
        # tf-idf
        hashingTF = HashingTF(inputCol="words_raw", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(dataFrame)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        memes_df = idfModel.transform(featurizedData)
    else:
        # word2vec; the "features" column is produced by the scaler below,
        # so normalize should stay True when w2v=True
        word2Vec = Word2Vec(vectorSize=w2v_value, seed=seed, inputCol="words", outputCol="features_unnormalized")
        model_w2v = word2Vec.fit(dataFrame)
        memes_df = model_w2v.transform(dataFrame)
        model_w2v.write().overwrite().save("hdfs:///models/model_w2v")

    # normalization only applies to the Word2Vec branch; the tf-idf branch
    # already writes its output to "features"
    if w2v and normalize:
        scaler = StandardScaler(inputCol="features_unnormalized", outputCol="features", withStd=True, withMean=True)
        scalerModel = scaler.fit(memes_df)
        memes_df = scalerModel.transform(memes_df)

    # kmeans
    kmeans = KMeans(k=k_value, seed=seed)
    model_kmeans = kmeans.fit(memes_df)
    memes_df = model_kmeans.transform(memes_df)
    model_kmeans.write().overwrite().save("hdfs:///models/model_kmeans")

    # clustering evaluation
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(memes_df)
    centers = model_kmeans.clusterCenters()

    if plot:
        import matplotlib.pyplot as plt  # imported here because the virtual environment may have problems with a module-level import

        # pca
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        model_pca = pca.fit(memes_df)
        memes_df = model_pca.transform(memes_df)
        #memes_df.show()
        centers_pca = [None] * len(centers)
        for i in range(len(centers)):
            centers_pca[i] = np.multiply(model_pca.pc.toArray().T, centers[i]).sum(axis=1)
        centers_pca = np.array(centers_pca)

        # plot section: pull the first two PCA coordinates out of the stringified vector
        split_col = functions.split(memes_df["pcaFeatures"].cast(StringType()), ',')
        memes_df = memes_df.withColumn(
            'x', translate(split_col.getItem(0), "[", "").cast(DoubleType()))
        memes_df = memes_df.withColumn(
            'y', translate(split_col.getItem(1), "]", "").cast(DoubleType()))
        #memes_df.show(truncate = False)
        df = memes_df.toPandas()
        groups = df.groupby('prediction')
        fig, ax = plt.subplots()
        ax.margins(0.05)
        for name, group in groups:
            ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, label=name)
            ax.text(centers_pca[name, 0], centers_pca[name, 1], s=name, fontsize=10)
        ax.legend()
        ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format(
            k_value, w2v_value, silhouette))
        plt.show()
        print("PCA, explained variance= {0}".format(
            model_pca.explainedVariance))

    return memes_df
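# A minimal usage sketch for text_clustering, assuming a running SparkSession named spark
# and an input DataFrame with a "text" column as the function expects. The sentences are
# invented; note the function also writes models to hdfs:/// paths, so this assumes an
# HDFS-backed cluster is available.
docs = spark.createDataFrame(
    [("cats are great pets",), ("dogs are loyal pets",), ("spark makes big data easy",)],
    ["text"])
clustered = text_clustering(docs, k_value=2, w2v=True, w2v_value=10, plot=False)
clustered.select("text", "prediction").show(truncate=False)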