Example #1
def compile_translate(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    from_str = op.from_str.op().value
    to_str = op.to_str.op().value
    return F.translate(src_column, from_str, to_str)
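For reference, the expression this backend emits is a plain call to pyspark.sql.functions.translate; a minimal standalone sketch of that call (the DataFrame and the character sets below are made up):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a-b_c",)], ["s"])
# each character of "-_" is mapped to the character at the same position in "  "
df.select(F.translate(F.col("s"), "-_", "  ").alias("s")).show()  # -> "a b c"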
Example #2
from collections import deque

import pyspark.sql.functions as f


def write(tsdf, spark, tabName, optimizationCols=None):
  """
  param: tsdf: input TSDF object to write
  param: tabName Delta output table name
  param: optimizationCols list of columns to optimize on (time)
  """
  # hilbert curves more evenly distribute performance for querying multiple columns for Delta tables
  spark.conf.set("spark.databricks.io.skipping.mdc.curve", "hilbert")

  df = tsdf.df
  ts_col = tsdf.ts_col
  partitionCols = tsdf.partitionCols
  if optimizationCols:
     optimizationCols = optimizationCols + ['event_time']
  else:
     optimizationCols = ['event_time']

  import os
  useDeltaOpt = (os.getenv('DATABRICKS_RUNTIME_VERSION') is not None)
  
  view_df = df.withColumn("event_dt", f.to_date(f.col(ts_col))) \
      .withColumn("event_time", f.translate(f.split(f.col(ts_col).cast("string"), ' ')[1], ':', '').cast("double"))
  view_cols = deque(view_df.columns)
  view_cols.rotate(1)
  view_df = view_df.select(*list(view_cols))

  view_df.write.mode("overwrite").partitionBy("event_dt").format('delta').saveAsTable(tabName)

  if useDeltaOpt:
      try:
         spark.sql("optimize {} zorder by {}".format(tabName, "(" + ",".join(partitionCols + optimizationCols) + ")"))
      except Exception as e: 
         print("Delta optimizations attempted, but was not successful.\nError: {}".format(e))
  else:
      print("Delta optimizations attempted on a non-Databricks platform. Switch to use Databricks Runtime to get optimization advantages.")
Example #3
def remove_illegal_chars(self, dataframe: DataFrame, source_column: str,
                         target_column: str):
    # translate() maps characters one-for-one; characters in self.chars beyond
    # the length of self.replacament are removed from the value
    df2 = dataframe.select(
        col('id'),
        translate(col(source_column), ''.join(self.chars),
                  self.replacament).alias(target_column))
    return df2.select('id', target_column)
Example #4
import pyspark.sql.functions as F


def undersores_to_spaces(col):
    """
    Replace underscores with spaces

    :param col: Union[str, Column] A column or a name of a column
    """
    return F.translate(col, "_", " ")
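A minimal usage sketch (assuming an active SparkSession named spark; the sample values are made up):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("new_york",), ("san_francisco",)], ["city"])
# every "_" is replaced by a space
df.select(undersores_to_spaces("city").alias("city")).show(truncate=False)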
Example #5
from pyspark.sql.functions import arrays_zip, col, explode, regexp_replace, translate


def read_and_clean_json(spark, input_json_path):
    '''
    Reads a json file with products data
    Explodes nested columns
    Selects the main columns into a dataframe
    Replaces wrong values of the '&' (ampersand) character
    Removes the '$' character so the price column can be cast to double
    Replaces wrong null values with None
    Casts the price column to double
    '''
    df = spark.read.json(input_json_path)
    df = df.withColumn("tmp", arrays_zip("category", "description", "image")) \
        .withColumn("tmp", explode("tmp")) \
        .select("asin", col("tmp.category"), col("tmp.description"), col("tmp.image"), "title", "brand", "main_cat", "price") \
        .withColumn('brand', translate('brand', '&amp;', '&')) \
        .withColumn('category', translate('category', '&amp;', '&')) \
        .withColumn('main_cat', translate('main_cat', '&amp;', '&')) \
        .withColumn('price', regexp_replace('price', r'\$', '')) \
        .replace(['null', '', 'None'], None) \
        .withColumn('price', col('price').cast("double"))
    return df
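A hedged usage sketch (assuming an active SparkSession named spark; the input path is hypothetical and the file must contain the columns referenced above):

products_df = read_and_clean_json(spark, "data/products.json")
products_df.select("asin", "brand", "price").show(5, truncate=False)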
Example #6
File: io.py  Project: ofunkey/tempo
def write(tsdf, spark, tabName, optimizationCols=None):
    """
  param: tsdf: input TSDF object to write
  param: tabName Delta output table name
  param: optimizationCols list of columns to optimize on (time)
  """
    # hilbert curves more evenly distribute performance for querying multiple columns for Delta tables
    spark.conf.set("spark.databricks.io.skipping.mdc.curve", "hilbert")

    df = tsdf.df
    ts_col = tsdf.ts_col
    partitionCols = tsdf.partitionCols
    if optimizationCols:
        optimizationCols = optimizationCols + ['event_time']
    else:
        optimizationCols = ['event_time']

    useDeltaOpt = True
    try:
        dbutils.fs.ls("/")
    except:
        print('Running in local mode')
        useDeltaOpt = False
        pass

    view_df = df.withColumn("event_dt", f.to_date(f.col(ts_col))) \
        .withColumn("event_time", f.translate(f.split(f.col(ts_col).cast("string"), ' ')[1], ':', '').cast("double"))
    view_df.write.mode("overwrite").partitionBy("event_dt").format(
        'delta').saveAsTable(tabName)

    if useDeltaOpt:
        try:
            spark.sql("optimize {} zorder by {}".format(
                tabName,
                "(" + ",".join(partitionCols + optimizationCols) + ")"))
        except Exception as e:
            print(
                "Delta optimizations attempted, but were not successful.\nError: {}".format(e)
            )
Example #7
    def clean(self):
        catalog_df = self.spark.read.csv(self.source_path,
                                         inferSchema=True,
                                         header=True,
                                         mode="DROPMALFORMED")
        non_duplicated_content = catalog_df.dropDuplicates(
            ['title', 'director']).orderBy(fc.desc('title'))
        df_netflix_catalog = non_duplicated_content.dropna(
            'any', subset=['title', 'director']).orderBy(fc.asc('title'))
        df_netflix_catalog = df_netflix_catalog.withColumn(
            'title', fc.translate('title', '"', ''))

        df_netflix_catalog = df_netflix_catalog.withColumn(
            'show_id',
            col('show_id').cast(tp.LongType()))
        df_netflix_catalog = df_netflix_catalog.withColumn(
            'release_year',
            col('release_year').cast(tp.IntegerType()))
        df_netflix_catalog = df_netflix_catalog.withColumn(
            'date_added', fc.to_date('date_added', 'MMMMM dd, yyyy'))
        df_netflix_catalog.write.partitionBy(['title', 'director']).bucketBy(
            2, "release_year").parquet(self.destination_path, mode='overwrite')
        print("Clean Catalog Executed")
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ETL TBHV
    ## Phonetic
    dyf_phonemic = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge", table_name="phonemic")
    dyf_phonemic = dyf_phonemic.select_fields(['id', 'phonemic'])
    df1 = dyf_phonemic.toDF()
    df1.cache()
    df1 = df1.select('phonemic')
    # myArr = np.array(df1.select('phonemic').collect())
    arrPhonetic = [row.phonemic for row in df1.collect()]

    # print('ARR:', arrPhonetic)
    # print('ARR1 :', (u'i:' in arrPhonetic))

    # Custom function
    def doSplitWord(word):
        rs = []
        if word is not None:
            i = 0
            size = len(word)
            while i < size:
                s = word[i:i + 2]
                i += 2
                if s in arrPhonetic:
                    rs.append(s)
                if s not in arrPhonetic:
                    i -= 2
                    s = word[i:i + 1]
                    i += 1
                    if s in arrPhonetic:
                        rs.append(s)

        return rs

    # print('test:', doSplitWord('abcacd'))
    splitWord = udf(lambda x: doSplitWord(x))

    knowledge = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'],
                 ['P02', 'Basic'], ['P03', 'sbasic'], ['P03', 'basic'],
                 ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None],
                 ['L02', None], ['L03', None], ['L04', None], ['L05', None],
                 [None, 'DICTATION'], [None, 'LISTENING']]
    comprehension = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'],
                     ['P02', 'basic'], ['P03', None], ['P03', 'basic'],
                     ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None],
                     ['L02', None], ['L03', None], ['L04', None],
                     ['L05', None], [None, 'DICTATION'], [None, 'LISTENING']]
    application = [['L04', None], ['L04', None], ['L05', None],
                   [None, 'LISTENING']]
    analysis = []
    synthesis = []
    evaluation = []

    state_gradedright = 'gradedright'

    def doAddScore(name, parName, state, type):

        arr = []
        score = 0
        if type == 'knowledge':
            arr = knowledge
        if type == 'comprehension':
            arr = comprehension
        if type == 'application':
            arr = application
        if type == 'analysis':
            arr = analysis
        if type == 'synthesis':
            arr = synthesis
        if type == 'evaluation':
            arr = evaluation

        if state == state_gradedright:
            score = 2
        if state != state_gradedright:
            score = -1

        for x in arr:
            if x[0] is None and x[1] == parName:
                return score
            if x[0] == name and x[1] is None:
                return score
            if x[0] == name and x[1] is not None and x[1].lower(
            ) in parName.lower():
                return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    # print('CHECK:', checkContains('ABCD EFHFF'))

    # the string of special characters to be replaced
    special_str = '["].'

    ######### top_question_attempt_steps
    dyf_top_question_attempt_steps = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_question_attempt_steps_092019")
    dyf_top_question_attempt_steps = dyf_top_question_attempt_steps.select_fields(
        ['_key', 'id', 'questionattemptid', 'state',
         'userid']).rename_field('id', 'steps_id')

    try:
        # # read the flag checkpoint from s3
        # df_flag = spark.read.parquet("s3://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts")
        # start_read = df_flag.collect()[0]['flag']
        # print('read from index: ', start_read)
        start_read = 22000000
        end_read = 24000000
        # compare the datasource _key with the flag, take only records with key > flag
        dyf_top_question_attempt_steps = Filter.apply(
            frame=dyf_top_question_attempt_steps,
            f=lambda x: x['_key'] >= start_read and x['_key'] < end_read)
    except:
        print('read flag file error ')
    df_temp = dyf_top_question_attempt_steps.toDF()
    df_temp.cache()
    print('COUNT df_temp:', df_temp.count())
    dyf_top_question_attempt_steps = DynamicFrame.fromDF(
        df_temp, glueContext, "dyf_right")
    # print('number of dyf_top_question_attempt_steps: ', dyf_top_question_attempt_steps.count())
    if dyf_top_question_attempt_steps.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(['id',
                                                   'student_id']).rename_field(
                                                       'id', 'top_user_id')

        ########## top_question_attempts
        dyf_top_question_attempts = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question_attempts_092019")
        dyf_top_question_attempts = dyf_top_question_attempts.select_fields(
            ['id', 'rightanswer', 'questionid', 'timemodified'])
        # dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 'cast:long')])
        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name', 'category']).rename_field('id', 'quest_id')
        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_question_categories
        dyf_top_question_categories = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question_categories")
        dyf_top_question_categories = dyf_top_question_categories.select_fields(
            ['id', 'name', 'parent']).rename_field('id', 'quest_cat_id')

        ######### dyf_top_question_categories_parent
        dyf_top_question_categories_parent = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question_categories")
        dyf_top_question_categories_parent = dyf_top_question_categories_parent.select_fields(
            ['id',
             'name']).rename_field('id',
                                   'par_id').rename_field('name', 'par_name')

        # print("COUNT dyf_top_question_attempts:", dyf_top_question_attempts.count())
        # print("COUNT dyf_top_question:", dyf_top_question.count())
        # print("COUNT dyf_top_question_attempt_steps:", dyf_top_question_attempt_steps.count())
        # print("COUNT dyf_top_question_categories:", dyf_top_question_categories.count())
        # dyf_top_question_attempt_steps = Filter.apply(frame=dyf_top_question_attempt_steps, f=lambda x: x["steps_id"])

        # JOIN and FILTER the tables according to the conditions
        dyf_join01 = Join.apply(dyf_top_question_attempt_steps,
                                dyf_top_question_attempts, 'questionattemptid',
                                'id')
        # print("COUNT 1:", dyf_join01.count())
        # dyf_join01.printSchema()
        dyf_join02 = Join.apply(dyf_join01, dyf_top_question, 'questionid',
                                'quest_id')
        # print("COUNT 2:", dyf_join02.count())
        # dyf_join02.printSchema()
        dyf_join03 = Join.apply(dyf_join02, dyf_top_question_categories,
                                'category', 'quest_cat_id')
        dyf_join03 = Join.apply(dyf_join03, dyf_top_question_categories_parent,
                                'parent', 'par_id')

        # print("COUNT dyf_join03 1:", dyf_join03.count())
        # print("COUNT dyf_top_user:"******"COUNT dyf_join03 2:", dyf_join03.count())
        # dyf_join03.printSchema()

        dyf_join03 = dyf_join03.select_fields([
            'student_id', 'rightanswer', 'timemodified', 'state', 'name',
            'parent', 'par_name'
        ])
        arrName = [
            'V01', 'V02', 'V03', 'V04', 'V05', 'G01', 'G02', 'G03', 'G04',
            'G05', 'P01', 'P02', 'P03', 'P04', 'P05'
        ]
        arrParName = ['CONVERSATIONAL_EXPRESSION', 'VOCABULARY', 'READING']
        dyf_join03 = Filter.apply(
            frame=dyf_join03,
            f=lambda x: x["name"] in arrName or x["par_name"] in arrParName)
        # dyf_join03.printSchema()
        # dyf_join03.show()
        # dyf_right = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] == state_gradedright)
        # dyf_wrong = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] != state_gradedright)

        # dyf_join02.show()
        df_right = dyf_join03.toDF()
        # df_right.cache()
        if (df_right.count() > 0):
            try:

                # print("COUNT 1:", df_right.count())
                # Strip the special characters [ ] ",

                # Split the sentence into an array of words:
                # house, her => [house, her]
                df_right = df_right.withColumn(
                    "right_str",
                    f.translate(df_right.rightanswer, special_str, ''))
                df_right = df_right.withColumn(
                    "right_arr", f.split(df_right.right_str, ' '))
                # Split the array column => multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_right = df_right.withColumn("right",
                                               f.explode(df_right.right_arr))

                # print("COUNT 2:", df_right.count())
                df_right.printSchema()
                dyf_right = DynamicFrame.fromDF(df_right, glueContext,
                                                "dyf_right")
                ## Learning Object
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge",
                    table_name="nvn_knowledge_learning_object")
                dyf_learning_object = dyf_learning_object.select_fields([
                    'learning_object_id', 'learning_object_name', 'phone_tic'
                ])
                df_learning_object = dyf_learning_object.toDF()
                # replace the special characters
                df_learning_object = df_learning_object.withColumn(
                    "phone_tic_new",
                    f.translate(df_learning_object.phone_tic, '\',', ''))

                df_learning_object = df_learning_object.withColumn(
                    "phone_tic_tmp",
                    splitWord(df_learning_object.phone_tic_new))
                df_learning_object = df_learning_object.withColumn(
                    "phone_tic_tmp_01",
                    f.translate(df_learning_object.phone_tic_tmp, '[]', ''))
                df_learning_object = df_learning_object.withColumn(
                    "phone_tic_arr",
                    f.split(df_learning_object.phone_tic_tmp_01, ','))
                df_learning_object = df_learning_object.select(
                    'learning_object_id', 'learning_object_name',
                    'phone_tic_arr')
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')
                dyf_knowledge_right = dyf_knowledge_right.select_fields([
                    'student_id', 'learning_object_id', 'name', 'parent',
                    'timemodified', 'par_name', 'state', 'phone_tic_arr'
                ])

                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()
                # dyf_knowledge_right.show()
                # # print("COUNT 4:", dyf_knowledge_wrong.count())
                # # dyf_knowledge_wrong.printSchema()
                # Add points for the correct words
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()
                df_knowledge_right = df_knowledge_right.withColumn(
                    "right_phonetic",
                    f.explode(df_knowledge_right.phone_tic_arr))
                df_knowledge_right = df_knowledge_right.select(
                    'student_id', 'name', 'timemodified', 'par_name', 'state',
                    'right_phonetic')
                dyf_study_right = DynamicFrame.fromDF(df_knowledge_right,
                                                      glueContext,
                                                      "dyf_study_right")

                dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic,
                                                'right_phonetic', 'phonemic')

                df_knowledge_right = dyf_phonemic_right.toDF()
                df_knowledge_right = df_knowledge_right.withColumn("knowledge", addScore(df_knowledge_right['name'],
                                                                                         df_knowledge_right['par_name'],
                                                                                         df_knowledge_right['state'],
                                                                                         f.lit("knowledge"))) \
                    .withColumn("comprehension", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                          df_knowledge_right['state'], f.lit('comprehension'))) \
                    .withColumn("application", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                        df_knowledge_right['state'], f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                     df_knowledge_right['state'], f.lit('analysis'))) \
                    .withColumn("synthesis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                      df_knowledge_right['state'], f.lit('synthesis'))) \
                    .withColumn("evaluation", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                       df_knowledge_right['state'], f.lit('evaluation'))) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timemodified'], 'yyyyMMdd')) \
                    .withColumn("lo_type", f.lit(2))

                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                dyf_knowledge_right = DynamicFrame.fromDF(
                    df_knowledge_right, glueContext, "dyf_knowledge_right")
                dyf_knowledge_right = dyf_knowledge_right.resolveChoice(
                    specs=[('lo_type', 'cast:byte')])
                # df_knowledge_right = dyf_knowledge_right.toDF()
                # select the fields and data types to load into the db
                applymapping = ApplyMapping.apply(
                    frame=dyf_knowledge_right,
                    mappings=[("timemodified", "long", "timestart", "long"),
                              ("name", "string", "name", "string"),
                              ("par_name", "string", "par_name", "string"),
                              ("student_id", 'int', 'student_id', 'long'),
                              ("id", "int", "learning_object_id", "int"),
                              ("date_id", "string", "date_id", "long"),
                              ("knowledge", "int", "knowledge", "long"),
                              ("comprehension", "int", "comprehension",
                               "long"),
                              ("application", "int", "application", "long"),
                              ("analysis", "int", "analysis", "long"),
                              ("synthesis", "int", "synthesis", "long"),
                              ("evaluation", "int", "evaluation", "long"),
                              ("phone_tic", "string", "phone_tic", "long"),
                              ("lo_type", "byte", "lo_type", "int")])
                resolvechoice = ResolveChoice.apply(
                    frame=applymapping,
                    choice="make_cols",
                    transformation_ctx="resolvechoice2")
                dropnullfields = DropNullFields.apply(
                    frame=resolvechoice, transformation_ctx="dropnullfields")

                datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable":
                        "t_temp_right_learning_object_02",
                        "database":
                        "dts_odin",
                        "postactions":
                        """ call proc_knowledge_ngu_am_top_question_attempts () """
                    },
                    redshift_tmp_dir="s3n://dts-odin/temp1/",
                    transformation_ctx="datasink5")

                # clear the cache
                # df_right.unpersist()
                # df_knowledge_right.unpersist()
                # df_knowledge_right.unpersist()

                # get the max _key from the datasource
                df_temp = dyf_top_question_attempt_steps.toDF()
                flag = df_temp.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')

                # overwrite the new flag to s3
                df.write.parquet(
                    "s3a://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts",
                    mode="overwrite")

            except Exception as e:
                print(
                    "###################### Exception ##########################"
                )
                print(e)
Example #9
 def tr(self, m, r):
     f = lambda c: F.translate(c, m, r)
     return self.apply(f)
Example #10
    def process_biomarkers(
        self,
        biomarkers_df: DataFrame,
        source_df: DataFrame,
        disease_df: DataFrame,
        drugs_df: DataFrame
    ) -> DataFrame:
        """The diverse steps to prepare and enrich the input table"""

        biomarkers_enriched = (
            biomarkers_df
            .select(
                'Biomarker', 'IndividualMutation',
                array_distinct(split(col('Alteration'), ';')).alias('alterations'),
                array_distinct(split(col('Gene'), ';')).alias('gene'),
                split(col('AlterationType'), ';').alias('alteration_types'),
                array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
                array_distinct(split(col('Drug'), ';|,')).alias('drug'),
                'DrugFullName', 'Association', 'gDNA',
                array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
                array_distinct(split(col('Source'), ';')).alias('source')
            )
            .withColumn('confidence', explode(col('confidence')))
            .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
            .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
            .withColumn('drug', explode(col('drug')))
            .withColumn('drug', translate(col('drug'), '[]', ''))
            .withColumn('gene', explode(col('gene')))
            .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
            .withColumn('gene', upper(col('gene')))
            # At this stage alterations and alteration_types are both arrays
            # Disambiguation when the biomarker consists of multiple alterations is needed
            # This is solved by:
            # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
            # 2. tmp is exploded - tmp consists of the alteration/type tuple
            # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
            .withColumn(
                'tmp',
                self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
            .withColumn('tmp', explode(col('tmp')))
            .withColumn('alteration_type', element_at(col('tmp'), 2))
            .withColumn(
                'alteration',
                when(
                    ~col('IndividualMutation').isNull(),
                    col('IndividualMutation')
                )
                .otherwise(element_at(col('tmp'), 1))
            )
            .drop('tmp')
            # Clean special cases on the alteration string
            .withColumn(
                'alteration',
                when(
                    col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                    col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
                )
                .when(
                    # Cleans strings like 'ARAF:.'
                    col('alteration').contains(':.'),
                    translate(col('alteration'), ':.', '')
                )
                .when(
                    # Fusion genes are described with '__'
                    # biomarker is a cleaner representation when there's one alteration
                    (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                    col('Biomarker')
                )
                .otherwise(col('alteration'))
            )
            # Split source into literature and urls
            # literature contains PMIDs
            # urls are enriched from the source table if not a CT
            .withColumn('source', explode(col('source')))
            .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
            .join(source_df, on='source', how='left')
            .withColumn(
                'literature',
                when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
            )
            .withColumn(
                'urls',
                when(
                    col('source').startswith('NCT'),
                    struct(
                        lit('Clinical Trials').alias('niceName'),
                        concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                    )
                )
                .when(
                    (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                    struct(col('niceName'), col('url'))
                )
            )
            # The previous conditional clause creates a struct regardless of
            # whether any condition is met. The empty struct is replaced with null
            .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
            # Enrich data
            .withColumn('functionalConsequenceId', col('alteration_type'))
            .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
            .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
            .join(disease_df, on='tumor_type', how='left')
            .withColumn('drug', upper(col('drug')))
            .withColumn(
                # drug class is coalesced when the precise name of the medicine is not provided
                'drug',
                when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
            .join(drugs_df, on='drug', how='left')
            .withColumn('drug', initcap(col('drug')))
            # Translate variantId
            .withColumn(
                'variantId',
                when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
            )
            # Assign a GO ID when a gene expression data is reported
            .withColumn(
                'geneExpressionId',
                when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                    'GO_0010628'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                    'GO_0010629'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                    'GO_0010467'
                )
            )
            # Create variant struct
            .withColumn(
                'variant',
                when(
                    col('alteration_type') != 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('variantId').alias('id'),
                        col('functionalConsequenceId')
                    )
                )
            )
            # Create geneExpression struct
            .withColumn(
                'geneExpression',
                when(
                    col('alteration_type') == 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('geneExpressionId').alias('id'))
                )
            )
        )

        pre_evidence = (
            biomarkers_enriched
            .withColumn('datasourceId', lit('cancer_biomarkers'))
            .withColumn('datatypeId', lit('affected_pathway'))
            .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
            .withColumnRenamed('drug', 'drugFromSource')
            # diseaseFromSourceMappedId, drugId populated above
            .withColumnRenamed('Association', 'drugResponse')
            # confidence, literature and urls populated above
            .withColumnRenamed('gene', 'targetFromSourceId')
            .withColumnRenamed('Biomarker', 'biomarkerName')
            # variant, geneExpression populated above
            .drop(
                'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation', 'geneExpressionId',
                'gDNA', 'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url')
        )

        # Group evidence
        self.evidence = (
            pre_evidence
            .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                     'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                     'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
            .agg(
                collect_set('literature').alias('literature'),
                collect_set('urls').alias('urls'),
                collect_set('variant').alias('variant'),
                collect_set('geneExpression').alias('geneExpression'),
            )
            # Replace empty lists with null values
            .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
            .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
            .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
            .withColumn(
                'geneExpression',
                when(size(col('geneExpression')) == 0, lit(None))
                .otherwise(col('geneExpression')))
            # Collect variant info into biomarkers struct
            .withColumn(
                'biomarkers',
                struct(
                    'variant',
                    'geneExpression'
                ))
            .drop('variant', 'geneExpression')
            .distinct()
        )

        return self.evidence
Example #11
# MAGIC
# MAGIC Spark performs predictive analytics using machine learning algorithms.
# MAGIC
# MAGIC The example below trains a linear regression model using past flight data to predict delays based on the hour of the day.

# COMMAND ----------

from pyspark.sql.functions import col, floor, translate, round
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.regression import LinearRegression

inputDF = (spark.read.table("AirlineFlight").withColumn(
    "HourOfDay", floor(col("CRSDepTime") / 100)).withColumn(
        "DepDelay",
        translate(col("DepDelay"), "NA", "0").cast("integer")))

(trainingDF, testDF) = inputDF.randomSplit([0.80, 0.20], seed=999)

pipeline = Pipeline(stages=[
    OneHotEncoder(inputCol="HourOfDay", outputCol="HourVector"),
    VectorAssembler(inputCols=["HourVector"], outputCol="Features"),
    LinearRegression(featuresCol="Features",
                     labelCol="DepDelay",
                     predictionCol="DepDelayPredicted",
                     regParam=0.0)
])

model = pipeline.fit(trainingDF)
resultDF = model.transform(testDF)
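To inspect the predictions, a minimal sketch (DepDelayPredicted is the prediction column configured in the pipeline above):

resultDF.select("HourOfDay", "DepDelay", "DepDelayPredicted").show(5)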
Example #12
df = df.withColumn('publish_time_2',regexp_replace(df.publish_time_2, 'Z', ''))
df = df.withColumn("publish_time_3", to_timestamp(df.publish_time_2, 'yyyy-MM-dd HH:mm:ss.SSS'))
df.printSchema()
df.select("publish_time", "publish_time_2","publish_time_3").show(5,False)
# Notice the .000 on the end of publish_time_new as opposed to publish_time_new_t


# **Translate Function**
# 
# You could also use the Translate function here to do this, where the first set of values is what you are looking for and the second set is what you want to replace those values with respectively. 

# In[40]:


import pyspark.sql.functions as f
df.select("publish_time",f.translate(f.col("publish_time"), "TZ", " ").alias("translate_func")).show(5,False)


# **Trim**
# 
# One common function you've probably seen in almost any data processing tool including excel is the "trim" function which removes leading and trailing white space from a cell in various ways. Let's go ahead and do that with the title field.

# In[41]:


# Trim
# pyspark.sql.functions.trim(col) - Trim the spaces from both ends for the specified string column.
from pyspark.sql.functions import *

df = df.withColumn('title',trim(df.title)) # or rtrim/ltrim
df.select("title").show(5,False)
Example #13
######### load vertices
verticesText0 = spark.read.csv(File_verticesTextRDD,
                               header='false',
                               inferSchema='false',
                               sep='\t')

verticesText1 = verticesText0.select("_c1","_c0","_c2","_c3","_c4")\
.withColumnRenamed("_c0", "nodeType").withColumnRenamed("_c1", "id")\
.withColumnRenamed("_c2", "attr1").withColumnRenamed("_c3", "attr2")\
.withColumnRenamed("_c4", "attr3")

######### load edges
edgesText0 = spark.read.csv(File_edgesTextRDD)
edgesText0 = edgesText0.select(
    f.translate(f.col("_c0"), "Edge(", "").alias("src"), "_c1",
    f.translate(f.col("_c2"), ")", "").alias("label"))
edgesText1 = edgesText0.select("*").withColumnRenamed("_c1", "dst")
verticesText1J = verticesText1

edgesText2 = edgesText1.join(
    verticesText1.select("id", "nodeType"),
    edgesText1.src == verticesText1.select("id", "nodeType").id, "inner")
edgesText2 = edgesText2.withColumnRenamed("id", "src_id").withColumnRenamed(
    "nodeType", "src_nodeType")
edgesText3 = edgesText2.join(
    verticesText1.select("id", "nodeType"),
    edgesText2.dst == verticesText1.select("id", "nodeType").id, "inner")
edgesText3 = edgesText3.withColumnRenamed("id", "dst_id").withColumnRenamed(
    "nodeType", "dst_nodeType")
Example #14
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, translate

spark = SparkSession \
    .builder \
    .getOrCreate()

# create a dataframe out of it by using the first row as field names and trying to infer a schema based on contents
df = spark.read.option("header", "true").option(
    "inferSchema", "true").csv('noaa-weather-data-jfk-airport/jfk_weather.csv')

# register a corresponding query table. we do this to save the data in memory and run our operations on it.
df.createOrReplaceTempView('df')

# cleaning the data as it contains trailing characters. Double is a data type like float
# columns with no trailing characters were straight converted to double type, the rest were first cleaned
df_cleaned = df \
    .withColumn("HOURLYWindSpeed", df.HOURLYWindSpeed.cast('double')) \
    .withColumn("HOURLYWindDirection", df.HOURLYWindDirection.cast('double')) \
    .withColumn("HOURLYStationPressure", translate(col("HOURLYStationPressure"), "s,", "")) \
    .withColumn("HOURLYPrecip", translate(col("HOURLYPrecip"), "s,", "")) \
    .withColumn("HOURLYRelativeHumidity", translate(col("HOURLYRelativeHumidity"), "*", "")) \
    .withColumn("HOURLYDRYBULBTEMPC", translate(col("HOURLYDRYBULBTEMPC"), "*", "")) \

# the cleaned columsn were now chanegd to double types
df_cleaned =   df_cleaned \
                    .withColumn("HOURLYStationPressure", df_cleaned.HOURLYStationPressure.cast('double')) \
                    .withColumn("HOURLYPrecip", df_cleaned.HOURLYPrecip.cast('double')) \
                    .withColumn("HOURLYRelativeHumidity", df_cleaned.HOURLYRelativeHumidity.cast('double')) \
                    .withColumn("HOURLYDRYBULBTEMPC", df_cleaned.HOURLYDRYBULBTEMPC.cast('double')) \

# Filtering for clean data set with no nulls and wind speed not 0
df_filtered = df_cleaned.filter("""
    HOURLYWindSpeed <> 0
    and HOURLYWindSpeed IS NOT NULL
Example #15
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # time starting from 01/10/2019
    timestamp = 1569888000

    # ETL TBHV
    # Custom function
    def doSplitWord(word):
        size = len(word)
        rs = [word[i:i + 2] for i in range(0, size, 1)]
        rs1 = [word[i:i + 1] for i in range(0, size, 1)]
        rs.extend(rs1)
        return rs

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # knowledge points are added by default
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    knowledge = ''
    # add comprehension points:
    # List of names that earn comprehension points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = [
        'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1',
        'P4_D2'
    ]
    # add application points:
    # List of names that earn application points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = [
        'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'
    ]
    # add analysis points:
    # List of names that earn analysis points
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # add synthesis points:
    # List of names that earn synthesis points
    # P4_D1; P4_D2
    synthesis = ['P4_D1', 'P4_D2']
    # add evaluation points:
    # List of names that earn evaluation points
    evaluation = ''

    def doAddScore(name, state, type):
        arr = ['']
        score = 0
        if type == 'comprehension':
            arr = comprehension

        if type == 'application':
            arr = application

        if type == 'analysis':
            arr = analysis

        if type == 'synthesis':
            arr = synthesis

        name = name.lower()
        if state == state_right:
            score = 10
        if state == state_wrong:
            score = -5

        if name is not None:
            for x in arr:
                if x.lower() in name:
                    return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    # the string of special characters to be replaced
    special_str = '["] ;'

    splitWord = udf(lambda x: doSplitWord(x))

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
        ['_key', 'id', 'timestart', 'quiz'])

    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(
        specs=[('_key', 'cast:long')])

    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show(2)

    # try:
    #     # # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key with the flag, take only records with key > flag
    #     dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    dyf_top_quiz_attempts = Filter.apply(
        frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp)

    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show()

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(['id',
                                                   'student_id']).rename_field(
                                                       'id', 'top_user_id')
        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name']).rename_field('id', 'quest_id')
        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_result_ai
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_result_ai")
        dyf_top_result_ai = dyf_top_result_ai.select_fields([
            'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word',
            'wrong_word'
        ])

        # JOIN and FILTER the tables according to the conditions
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question,
                                'question_id', 'quest_id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts,
                                'attempt_id', 'id')

        dyf_join02 = Filter.apply(frame=dyf_join02,
                                  f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id',
                                'top_user_id')

        # dyf_join02.show()
        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:
                # print("COUNT 1:", df_study.count())
                # Strip the special characters [ ] ",
                # The data currently looks like: ["house","her","to","how","get","long"] or "environmental", ...
                # df_study = df_study.select(
                #     'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word,
                #                                                                                     special_str, ''), f.translate(df_study.wrong_word,
                #                                        special_str, ''))
                df_study = df_study.select('quiz', 'name', 'student_id',
                                           'timestart', 'right_word',
                                           'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                                   .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the sentence into an array of words:
                # house, her => [house, her]
                # Analyze the correct words
                df_study_right = df_study.withColumn(
                    "right_word_list", f.split(df_study.right_word_new, ','))

                # Split the array column => multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_right = df_study_right.withColumn(
                    "right", f.explode(df_study_right.right_word_list))
                df_study_right = df_study_right.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'right')
                df_study_right = df_study_right.withColumn(
                    "right", f.lower(f.col("right")))
                # print("COUNT 2:", df_study_right.count())
                # df_study_right.printSchema()
                # df_study_right.show()
                dyf_study_right = DynamicFrame.fromDF(df_study_right,
                                                      glueContext,
                                                      "dyf_study_right")
                ## Learning Object
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge", table_name="learning_object")
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name'])

                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn(
                    "learning_object_name",
                    f.lower(f.col("learning_object_name")))
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # Add points for the correct words
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()

                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \
                        .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                        .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                        .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                        .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                        .withColumn("evaluation", f.lit(0)) \
                        .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd'))

                df_knowledge_right = df_knowledge_right.groupby(
                    'student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_plus"),
                        f.sum('knowledge').alias("knowledge_plus"),
                        f.sum('comprehension').alias("comprehension_plus"),
                        f.sum('application').alias("application_plus"),
                        f.sum('analysis').alias("analysis_plus"),
                        f.sum('synthesis').alias("synthesis_plus"),
                        f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where(
                    'student_id is not null')
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                #
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timestart", "long", "timestart", "long"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("learning_object_id", "int", "learning_object_id", "int"),
                #                                             ("date_id", "string", "date_id", "int"),
                #                                             ("knowledge", "int", "knowledge", "int"),
                #                                             ("comprehension", "int", "comprehension", "int"),
                #                                             ("application", "int", "application", "int"),
                #                                             ("analysis", "int", "analysis", "int"),
                #                                             ("synthesis", "int", "synthesis", "int"),
                #                                             ("evaluation", "int", "evaluation", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice2")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
                #
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "temp_right_wrong_learning_object",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")

                # END adding points for the correct words

                #################################################
                # Subtract points for the wrong words: handled the same way as the correct words.
                # the rule is -5 points for each wrong word

                df_study_wrong = df_study.withColumn(
                    "wrong_word_list", f.split(df_study.wrong_word_new, ','))

                # Split the array column => multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.explode(df_study_wrong.wrong_word_list))
                #convert to lowercase
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.lower(f.col("wrong")))

                df_study_wrong = df_study_wrong.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'wrong')
                # print("COUNT 2:", df_study_wrong.count())
                # df_study_wrong.printSchema()
                # df_study_wrong.show()

                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong,
                                                      glueContext,
                                                      "dyf_study_wrong")
                ## Learning Object
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong,
                                                 dyf_learning_object, 'wrong',
                                                 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # Subtract points for the wrong words
                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                df_knowledge_wrong.cache()

                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))

                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id',
                                                                'learning_object_id').agg(
                    f.count('knowledge').alias("count_minus"),
                    f.sum('knowledge').alias("knowledge_minus"),
                    f.sum('comprehension').alias("comprehension_minus"),
                    f.sum('application').alias("application_minus"),
                    f.sum('analysis').alias("analysis_minus"),
                    f.sum('synthesis').alias("synthesis_minus"),
                    f.sum('evaluation').alias("evaluation_minus"))\
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')

                df_knowledge_wrong = df_knowledge_wrong.where(
                    'student_id_wrong is not null')
                # df_study_all = df_study.select('student_id').withColumnRenamed('student_id', 'student_id_all')

                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()
                df_knowledge = df_knowledge_right.join(
                    df_knowledge_wrong,
                    (df_knowledge_right['student_id']
                     == df_knowledge_wrong['student_id_wrong']) &
                    (df_knowledge_right['date_id']
                     == df_knowledge_wrong['date_id_wrong']) &
                    (df_knowledge_right['learning_object_id']
                     == df_knowledge_wrong['learning_object_id_wrong']),
                    'outer')

                df_knowledge = df_knowledge.withColumn("user_id",
                                check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id",
                                check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext,
                                                    "df_knowledge")

                applymapping2 = ApplyMapping.apply(
                    frame=dyf_knowledge,
                    mappings=[
                        ("user_id", 'string', 'student_id', 'long'),
                        ("learning_object_id", "string", "learning_object_id",
                         "long"),
                        # ("knowledge", "int", "knowledge", "long"),
                        # ("comprehension", "int", "comprehension", "long"),
                        # ("application", "int", "application", "long"),
                        # ("analysis", "int", "analysis", "long"),
                        # ("synthesis", "int", "synthesis", "long"),
                        # ("evaluation", "int", "evaluation", "long"),
                        ("knowledge_plus", "long", "knowledge_plus", "long"),
                        ("comprehension_plus", "long", "comprehension_plus",
                         "long"),
                        ("application_plus", "long", "application_plus",
                         "long"),
                        ("analysis_plus", "long", "analysis_plus", "long"),
                        ("synthesis_plus", "long", "synthesis_plus", "long"),
                        ("evaluation_plus", "long", "evaluation_plus", "long"),
                        ("knowledge_minus", "long", "knowledge_minus", "long"),
                        ("comprehension_minus", "long", "comprehension_minus",
                         "long"),
                        ("application_minus", "long", "application_minus",
                         "long"),
                        ("analysis_minus", "long", "analysis_minus", "long"),
                        ("synthesis_minus", "long", "synthesis_minus", "long"),
                        ("evaluation_minus", "long", "evaluation_minus",
                         "long"),
                        ("count_plus", "long", "plus_number", "long"),
                        ("count_minus", "long", "minus_number", "long"),
                        # ("lo_type", "string", "lo_type", "long"),
                        ("source_system", "string", "source_system", "string"),
                        ("created_date_id", "string", "created_date_id",
                         "long"),
                        ("lu_id", "int", "lu_type", "long")
                        # ("student_level", "string", "student_level", "string"),
                        # ("advisor_id", "string", "advisor_id", "long"),
                        # ("package_code", "string", "package_code", "string")
                    ])

                applymapping2.printSchema()
                applymapping2.show(20)

                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice3")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('COUNT df_knowledge: ', dropnullfields2.count())
                dropnullfields2.printSchema()
                dropnullfields2.show(2)

                print('START WRITE TO S3-------------------------')

                datasink6 = glueContext.write_dynamic_frame.from_options(
                    frame=dropnullfields2,
                    connection_type="s3",
                    connection_options={
                        "path":
                        "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                        "partitionKeys": ["created_date_id", "source_system"]
                    },
                    format="parquet",
                    transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_result_ai/",
                #                                                            transformation_ctx="datasink5")

                # END subtract points for the wrong words

                # clear the cache
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
                # df_knowledge_right.unpersist()
            except Exception as e:
                print(
                    "###################### Exception ##########################"
                )
                print(e)

            # write the flag
            # get the max key from the data source
            mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF()
            flag = mdl_dyf_top_quiz_attempts.agg({
                "_key": "max"
            }).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # overwrite the _key flag in S3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet",
                mode="overwrite")
    .alias("color_cleaned"),
    col("Description"))\
.show(2)

""" = 
SELECT 
    regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as color_cleaned,
    Description
FROM
    dfTable
"""

#2 replace characters with different characters
print("2")
df.select(
    translate(col("Description"), "LEET", "1327"),
    col("Description")
)\
.show(2)

"""
SELECT
    translate(Description, 'LEET', '1327'),
    Description
FROM
    dfTable
"""

#3 pulling out the first mentioned color
print("3")
示例#17
0
        "rowTag", "inproceedings").option('charset',
        "UTF-8").schema(schema).load(unescaped_src.name)
    articles_df = spark.read.format('com.databricks.spark.xml').option("rowTag",
        "article").option('charset', "UTF-8").schema(schema).load(
        unescaped_src.name)
        
    # Store the JSON files
    incollections_df.write.option("charset", "UTF-8").json('./json/incollections')
    inproceedings_df.write.option("charset", "UTF-8").json('./json/inproceedings')
    articles_df.write.option("charset", "UTF-8").json('./json/articles')

    # Group the data and store the CSV files
    publications_df =   incollections_df.withColumn('LABEL',lit('Incollection')).union(
                        inproceedings_df.withColumn('LABEL',lit('Inproceeding'))).union(
                        articles_df.withColumn('LABEL', lit('Article')))

    publications_df = publications_df.filter(publications_df._key.isNotNull())

    publications_df.withColumn('id', translate('_key', '/', '_')).select('id',
        'title', 'year', 'LABEL').write.option('escape', '"').csv(
        './csv/publications')

    publications_df.withColumn('_author', explode('author._VALUE')).select(
        '_author').write.option('escape', '"').csv('./csv/authors')

    publications_df.withColumn('start', explode('author._VALUE')).withColumn(
        'end', translate('_key', '/', '_')).select('start', 'end').write.option(
            'escape', '"').csv('./csv/rels')

sc.stop()
# COMMAND ----------

# MAGIC %md
# MAGIC ## Fixing Data Types
# MAGIC
# MAGIC Take a look at the schema above. You'll notice that the `price` field got picked up as string. For our task, we need it to be a numeric (double type) field.
# MAGIC
# MAGIC Let's fix that.

# COMMAND ----------

from pyspark.sql.functions import col, translate

fixedPriceDF = baseDF.withColumn(
    "price",
    translate(col("price"), "$,", "").cast("double"))

display(fixedPriceDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary statistics
# MAGIC
# MAGIC Two options:
# MAGIC * describe
# MAGIC * summary (describe + IQR)

# COMMAND ----------

display(fixedPriceDF.describe())
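
# COMMAND ----------

# MAGIC %md
# MAGIC As a minimal sketch of the second option (assuming the same `fixedPriceDF` from above): `summary()` returns everything `describe()` does plus the 25%, 50%, and 75% quartiles.

# COMMAND ----------

display(fixedPriceDF.summary())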
示例#19
0
def main(sc):
    """
    Main processing function
    Read in data from PostgreSQL transaction table
    Perform reverse lookup for vin transactions and return input Bitcoin values and addresses
    Perform disjoint set (i.e., union find) algorithm using GraphFrames
    Write out address clustering results to PostgreSQL
    """

    # ---READ IN TRANSACTION DATA AND PERFORM REVERSE TX LOOKUP USING JOINS---

    # create initial SQL query
    # tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {} WHERE height <= 400000 LIMIT 5000000"\
    tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {}"\
        .format(config.SPARK_CONFIG['PG_TX_TABLE'])

    # read in data from PostgreSQL
    tx_df = spark.read \
        .format(config.SPARK_CONFIG['PG_FORMAT']) \
        .option("url", config.SPARK_CONFIG['PG_URL'] + config.SPARK_CONFIG['PG_PORT'] + "/" + config.SPARK_CONFIG['PG_DB']) \
        .option("user", config.SPARK_CONFIG['PG_USER']) \
        .option("password", config.SPARK_CONFIG['PG_PASSWORD'])\
        .option("query", tx_query) \
        .option("numPartitions", '10000') \
        .load()
    # display_df(tx_df)

    # select priority columns, convert array columns, and zip vin and vout fields
    clean_df = tx_df.withColumn("vin_txid_arr", split(col("vin_txid"), ",\s*")) \
        .withColumn("vin_vout_arr", split(col("vin_vout"), ",\s*")) \
        .withColumn("vin_txid_vout_zip", arrays_zip("vin_txid_arr", "vin_vout_arr")) \
        .withColumn("vout_value_arr", split(col("vout_value"), ",\s*")) \
        .withColumn("vout_n_arr", split(col("vout_n"), ",\s*")) \
        .withColumn("vout_addresses_arr", split(col("vout_addresses"), ",\s*")) \
        .withColumn("vout_value_n_addr_zip", arrays_zip("vout_value_arr", "vout_n_arr", "vout_addresses_arr"))
    # display_df(clean_df)

    # create left side DataFrame
    vin_cols = [
        'txid', 'height', 'time', 'ntx', 'vin_coinbase', 'vin_txid_vout_zip'
    ]
    vin_df = clean_df.select(vin_cols) \
        .withColumn("vin_txid_vout_tup", explode("vin_txid_vout_zip")) \
        .withColumn("vin_txid", col("vin_txid_vout_tup").vin_txid_arr) \
        .withColumn("vin_vout", col("vin_txid_vout_tup").vin_vout_arr) \
        .drop("vin_txid_vout_zip") \
        .drop("vin_txid_vout_tup") \
        .withColumn("left_key", concat(col("vin_txid"), lit("-"), col("vin_vout")))
    # display_df(vin_df)

    # create right side DataFrame
    vout_cols = ['txid', 'vout_value_n_addr_zip']
    vout_df = clean_df.select(vout_cols) \
        .withColumn("vout_value_n_addr_tup", explode("vout_value_n_addr_zip")) \
        .withColumn("vout_value", col("vout_value_n_addr_tup").vout_value_arr) \
        .withColumn("vout_n", col("vout_value_n_addr_tup").vout_n_arr) \
        .withColumn("vout_addr_pre", col("vout_value_n_addr_tup").vout_addresses_arr) \
        .withColumn("vout_addr", translate(col("vout_addr_pre"), '[]', '')) \
        .drop("vout_value_n_addr_zip") \
        .drop("vout_value_n_addr_tup") \
        .drop("vout_addr_pre") \
        .withColumnRenamed("txid", "txid2") \
        .withColumn("right_key", concat(col("txid2"), lit("-"), col("vout_n"))) \
        .drop("txid2")
    # display_df(vout_df)

    # join DataFrames
    join_df = vin_df.join(vout_df, vin_df.left_key == vout_df.right_key, 'left') \
        .drop("left_key") \
        .drop("right_key")
    # display_df(join_df)

    # create temporary table for GraphFrames
    join_df.registerTempTable("join_result")

    # ---CREATING GRAPHFRAME FOR CONNECTED COMPONENTS ALGORITHM---

    # create vertices DataFrame
    vertices = spark.sql(
        "SELECT DISTINCT(vout_addr) FROM join_result").withColumnRenamed(
            "vout_addr", "id")

    # generate DataFrame with single address connection for all addresses in a given txid group
    w = Window.partitionBy("txid").orderBy("vout_addr")
    first_by_txid_df = join_df.withColumn("rn", row_number().over(w)).where(col("rn") == 1) \
        .withColumnRenamed("txid", "txid2") \
        .withColumnRenamed("vout_addr", "vout_addr_first") \
        .drop("rn") \
        .drop("height")
    # first_by_txid_df.show(100)

    # join DataFrames
    interim_df = join_df.join(first_by_txid_df,
                              join_df.txid == first_by_txid_df.txid2, 'left')

    # create edges DataFrame
    edges = interim_df.select("vout_addr", "vout_addr_first") \
        .withColumnRenamed("vout_addr", "src") \
        .withColumnRenamed("vout_addr_first", "dst") \
        .na.drop()

    # create GraphFrame
    g = GraphFrame(vertices, edges)

    # set checkpoint directory in S3
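    # (GraphFrames' default connectedComponents implementation requires a
    # checkpoint directory to be configured, hence the call below)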
    sc.setCheckpointDir(config.SPARK_CONFIG['S3_CHECKPOINT'])

    # run connected components
    clst_result = g.connectedComponents()
    clst_result.show(100, truncate=False)

    # # ---FOR TESTING ONLY--- show result DataFrame for a specific block to verify correct results
    # clst_result.registerTempTable("clst_table")
    # view_df = spark.sql("SELECT * FROM clst_table ORDER BY clst_table.component")
    # view_df.show(1000, truncate=False)

    # write out to PostgreSQL
    write_clst_to_pg(clst_result)
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # start time: 2019-10-01 (Unix timestamp 1569888000)
    timestamp = 1569888000

    # ETL TBHV
    ## Phonetic
    dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="learning_object"
    )
    dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic')
    dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name'])
    # Extract the phonetic symbols
    df1 = dyf_phonemic.toDF()
    df1 = df1.select('learning_object_id', 'learning_object_name')
    # myArr = np.array(df1.select('phonemic').collect())
    arrPhonetic = [row.learning_object_name for row in df1.collect()]
    arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()]

    # print('ARR:', arrPhonetic)
    # print('ARR1 :', (u'i:' in arrPhonetic))

    lu_type = []

    # check value for lu_id: valid = 1, invalid = 0
    def doAddLuId(code):
        if code is None:
            return 0
        code = str(code)
        if code not in lu_type:
            return 0
        return 1

    add_lu_id = udf(doAddLuId, IntegerType())

    def doCheckLyType(plus, minus):
        if plus == 1:
            return plus
        if minus == 1:
            return minus
        return 0

    check_lu_type = udf(doCheckLyType, IntegerType())

    # Custom function
    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_get_phone_tic_id(phonetic):
        phonetic = phonetic.encode('utf-8', 'replace').strip()
        for x in arrPhoneticId:
            p = x[0].encode('utf-8', 'replace').strip()
            if p == phonetic:
                return x[1]

    get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())
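    # do_check_null behaves like SQL COALESCE(val1, val2), falling back to 0
    # when both inputs are null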

    def doSplitWord(word):
        rs = []
        if word is not None:
            i = 0
            size = len(word)
            while i < size:
                s = word[i:i + 2]
                i += 2
                if s in arrPhonetic:
                    rs.append(s)
                if s not in arrPhonetic:
                    i -= 2
                    s = word[i:i + 1]
                    i += 1
                    if s in arrPhonetic:
                        rs.append(s)

        return rs

    # print('test:', doSplitWord('abcacd'))
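    # doSplitWord greedily splits a transcription into known phonetic symbols,
    # trying 2-character symbols first and falling back to 1-character ones;
    # characters that match neither are dropped. Illustrative example (assuming
    # arrPhonetic contains 'b' and 'i:' but not 'bi'):
    #   doSplitWord('bi:') -> ['b', 'i:']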
    splitWord = udf(lambda x: doSplitWord(x))

    knowledge = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'], ['P02', 'Basic'], ['P03', 'sbasic'],
                 ['P03', 'basic'], ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None],
                 ['L02', None], ['L03', None], ['L04', None], ['L05', None]]
    comprehension = [['P01', 'sbasic'], ['P01', 'basic'], ['P02', 'sbasic'], ['P02', 'basic'], ['P03', None],
                     ['P03', 'basic'], ['P04', 'sbasic'], ['P04', 'basic'], ['L01', None],
                     ['L02', None], ['L03', None], ['L04', None], ['L05', None]]
    application = [['L04', None], ['L04', None], ['L05', None]]
    analysis = []
    synthesis = []
    evaluation = []

    state_gradedright = 'gradedright'

    def doAddScore(name, parName, state, type):

        arr = []
        score = 0
        if type == 'knowledge':
            arr = knowledge
        if type == 'comprehension':
            arr = comprehension
        if type == 'application':
            arr = application
        if type == 'analysis':
            arr = analysis
        if type == 'synthesis':
            arr = synthesis
        if type == 'evaluation':
            arr = evaluation

        if state is not None and state == state_gradedright:
            score = 2
        else:
            score = -1

        for x in arr:
            if x[0] is None and x[1] == parName:
                return score
            if x[0] == name and x[1] is None:
                return score
            if x[0] == name and x[1] is not None and x[1].lower() in parName.lower():
                return score
        return 0

    addScore = udf(doAddScore, IntegerType())
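    # Illustrative call (argument values are assumptions based on the lists above):
    #   doAddScore('P01', 'sbasic level', 'gradedright', 'knowledge') -> 2
    #   the same match with any other state returns -1; no match returns 0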

    # print('CHECK:', checkContains('ABCD EFHFF'))

    # special characters to be stripped
    special_str = '["].'

    ########## top_question_attempts
    dyf_top_question_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_question_attempts"
    )
    dyf_top_question_attempts = dyf_top_question_attempts.select_fields(
        ['_key', 'id', 'rightanswer', 'questionid', 'questionusageid', 'timemodified'])
    dyf_top_question_attempts = dyf_top_question_attempts.resolveChoice(specs=[('_key', 'cast:long')])

    # try:
    #     # # read the flag checkpoint from S3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key with the flag, keep only records with key > flag
    #     dyf_top_question_attempts = Filter.apply(frame=dyf_top_question_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    print('number of dyf_top_question_attempts: ', dyf_top_question_attempts.count())
    if dyf_top_question_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="do_top_user"
        )

        dyf_top_user = dyf_top_user.select_fields(
            ['id', 'student_id']).rename_field('id', 'top_user_id')

        ######### top_quiz_attempts
        dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_quiz_attempts"
        )
        dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
            ['userid', 'uniqueid'])

        ######### top_question_attempt_steps
        dyf_top_question_attempt_steps = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_question_attempt_steps"
        )
        dyf_top_question_attempt_steps = dyf_top_question_attempt_steps.select_fields(
            ['id', 'questionattemptid', 'state']).rename_field('id', 'steps_id')

        print(dyf_top_question_attempts.count())
        dyf_top_question_attempts.show(2)

        dyf_top_question_attempts = Filter.apply(frame=dyf_top_question_attempts,
                                                 f=lambda x: x["timemodified"] >= timestamp)

        print(dyf_top_question_attempts.count())
        dyf_top_question_attempts.show()

        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_question"
        )
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name', 'category']).rename_field('id', 'quest_id')
        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_question_categories
        dyf_top_question_categories = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_question_categories"
        )
        dyf_top_question_categories = dyf_top_question_categories.select_fields(
            ['id', 'name', 'parent']).rename_field('id', 'quest_cat_id')

        ######### dyf_top_question_categories_parent
        dyf_top_question_categories_parent = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_question_categories"
        )
        dyf_top_question_categories_parent = dyf_top_question_categories_parent.select_fields(
            ['id', 'name']).rename_field('id', 'par_id').rename_field('name', 'par_name')

        # print("COUNT dyf_top_question_attempts:", dyf_top_question_attempts.count())
        # print("COUNT dyf_top_question:", dyf_top_question.count())
        # print("COUNT dyf_top_question_attempt_steps:", dyf_top_question_attempt_steps.count())
        # print("COUNT dyf_top_question_categories:", dyf_top_question_categories.count())
        # dyf_top_question_attempt_steps = Filter.apply(frame=dyf_top_question_attempt_steps, f=lambda x: x["steps_id"])

        # JOIN and FILTER the tables according to the conditions
        dyf_join = Join.apply(dyf_top_question_attempts, dyf_top_quiz_attempts, 'questionusageid', 'uniqueid')

        dyf_top_question_attempt_steps = Filter.apply(frame=dyf_top_question_attempt_steps,
                                                      f=lambda x: x["state"] == state_gradedright)
        df_top_question_attempt_steps = dyf_top_question_attempt_steps.toDF()
        df_join = dyf_join.toDF()
        df_join01 = df_join.join(df_top_question_attempt_steps,
                                 (df_join['id'] == df_top_question_attempt_steps['questionattemptid']), 'left')

        dyf_join01 = DynamicFrame.fromDF(df_join01, glueContext, "dyf_join01")
        # dyf_join01 = Join.apply(dyf_top_question_attempt_steps, dyf_top_question_attempts, 'questionattemptid', 'id')
        # print("COUNT 1:", dyf_join01.count())
        # dyf_join01.printSchema()
        dyf_join02 = Join.apply(dyf_join01, dyf_top_question, 'questionid', 'quest_id')
        # print("COUNT 2:", dyf_join02.count())
        # dyf_join02.printSchema()
        dyf_join03 = Join.apply(dyf_join02, dyf_top_question_categories, 'category', 'quest_cat_id')
        dyf_join03 = Join.apply(dyf_join03, dyf_top_question_categories_parent, 'parent', 'par_id')
        dyf_join03 = Join.apply(dyf_join03, dyf_top_user, 'userid', 'top_user_id')
        # print("COUNT 3:", dyf_join03.count())

        dyf_join03.printSchema()

        dyf_join03 = dyf_join03.select_fields(
            ['student_id', 'rightanswer', 'timemodified', 'state', 'name', 'parent', 'par_name'])
        # dyf_join03.printSchema()
        # dyf_join03.show()
        # dyf_right = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] == state_gradedright)
        # dyf_wrong = Filter.apply(frame=dyf_join03, f=lambda x: x["state"] != state_gradedright)

        # dyf_join02.show()
        df_right = dyf_join03.toDF()
        df_right.cache()
        if (df_right.count() > 0):
            try:

                # print("COUNT 1:", df_right.count())
                # Strip the special characters [ ] ",

                # Split the answer string into an array of words:
                # house, her => [house, her]
                df_right = df_right.withColumn("right_str", f.translate(df_right.rightanswer, special_str, ''))
                df_right = df_right.withColumn("right_arr", f.split(df_right.right_str, ' '))
                # Split the array column into multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_right = df_right.withColumn("right",
                                               f.explode(df_right.right_arr))

                # print("COUNT 2:", df_right.count())
                # df_right.printSchema()
                dyf_right = DynamicFrame.fromDF(df_right, glueContext, "dyf_right")
                ## Learning Object: filter for vocabulary entries to read their phonetics
                dyf_learning_object = Filter.apply(frame=dyf_learning_object,
                                                   f=lambda x: x["learning_object_type"] == 'vocabulary')
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name', 'transcription'])
                df_learning_object = dyf_learning_object.toDF()
                # strip unwanted characters from the transcription
                df_learning_object = df_learning_object.withColumn("phone_tic_new",
                                                                   f.translate(df_learning_object.transcription, '\',', ''))

                df_learning_object = df_learning_object.withColumn("phone_tic_tmp",
                                                                   splitWord(df_learning_object.phone_tic_new))
                df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01",
                                                                   f.translate(df_learning_object.phone_tic_tmp, '[]', ''))
                df_learning_object = df_learning_object.withColumn("phone_tic_arr",
                                                                   f.split(df_learning_object.phone_tic_tmp_01, ','))
                df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name',
                                                               'phone_tic_arr')
                dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_right, dyf_learning_object, 'right', 'learning_object_name')
                dyf_knowledge_right = dyf_knowledge_right.select_fields(
                    ['student_id', 'learning_object_id', 'name', 'parent', 'timemodified', 'par_name', 'state',
                     'phone_tic_arr'])

                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()
                # dyf_knowledge_right.show()
                # # print("COUNT 4:", dyf_knowledge_wrong.count())
                # # dyf_knowledge_wrong.printSchema()
                # Add points for the correctly answered words
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right = df_knowledge_right.withColumn("right_phonetic",
                                                                   f.explode(df_knowledge_right.phone_tic_arr))
                df_knowledge_right = df_knowledge_right.select('student_id', 'name', 'timemodified', 'par_name', 'state',
                                                               'right_phonetic')
                df_knowledge_right = df_knowledge_right.withColumn("learning_object_id",
                                                                   get_phone_tic_id(df_knowledge_right.right_phonetic))

                # dyf_study_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_study_right")

                # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'right_phonetic', 'learning_object_name')

                # df_knowledge_right = dyf_phonemic_right.toDF()
                df_knowledge_right = df_knowledge_right.withColumn("knowledge", addScore(df_knowledge_right['name'],
                                                                                         df_knowledge_right['par_name'],
                                                                                         df_knowledge_right['state'],
                                                                                         f.lit("knowledge"))) \
                    .withColumn("comprehension", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                          df_knowledge_right['state'], f.lit('comprehension'))) \
                    .withColumn("application", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                        df_knowledge_right['state'], f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                     df_knowledge_right['state'], f.lit('analysis'))) \
                    .withColumn("synthesis", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                      df_knowledge_right['state'], f.lit('synthesis'))) \
                    .withColumn("evaluation", addScore(df_knowledge_right['name'], df_knowledge_right['par_name'],
                                                       df_knowledge_right['state'], f.lit('evaluation'))) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timemodified'], 'yyyyMMdd')) \
                    .withColumn("lo_type", f.lit(2))

                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()
                df_knowledge_right.cache()
                # History
                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                # dyf_knowledge_right = dyf_knowledge_right.resolveChoice(specs=[('lo_type', 'cast:int')])
                # df_knowledge_right = dyf_knowledge_right.toDF()
                # select the fields and data types to load into the DB

                # dyf_ai_history_plus = Filter.apply(frame=dyf_knowledge_right, f=lambda x: x['knowledge'] > 0)
                #
                # dyf_ai_history_minus = Filter.apply(frame=dyf_knowledge_right, f=lambda x: x['knowledge'] < 0)

                df_ai_history_plus = df_knowledge_right.where('knowledge > 0')

                df_ai_history_plus = df_ai_history_plus.groupby('student_id', 'learning_object_id', 'date_id').agg(
                    f.count("student_id").alias("count_plus"), f.sum("knowledge").alias("knowledge_plus"),
                    f.sum("comprehension").alias("comprehension_plus"), f.sum("application").alias("application_plus"),
                    f.sum("analysis").alias("analysis_plus"), f.sum("synthesis").alias("synthesis_plus"),
                    f.sum("evaluation").alias("evaluation_plus"))
                df_ai_history_plus = df_ai_history_plus.withColumnRenamed('student_id', 'student_id_plus') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_plus') \
                    .withColumnRenamed('date_id', 'date_id_plus')
                    # .withColumnRenamed('lu_type', 'lu_type_plus')

                df_ai_history_plus = df_ai_history_plus.where('student_id_plus is not null')

                # dyf_ai_history_plus = DynamicFrame.fromDF(df_ai_history_plus, glueContext, "dyf_ai_history_plus")
                #
                # dyf_ai_history_plus = dyf_ai_history_plus.select_fields(
                #     ['date_id', 'student_id', 'learning_object_id', 'lo_type', 'knowledge_plus', 'comprehension_plus',
                #      'application_plus', 'analysis_plus',
                #      'synthesis_plus', 'evaluation_plus', 'count_plus']).rename_field('student_id',
                #                                                                       'student_id_plus').rename_field(
                #     'date_id', 'date_id_plus').rename_field('lo_type', 'lo_type_plus').rename_field('id',
                #                                                                                     'learning_object_id_plus')

                df_ai_history_minus = df_knowledge_right.where('knowledge < 0')
                df_ai_history_minus = df_ai_history_minus.groupby('student_id', 'learning_object_id', 'date_id').agg(
                    f.count("student_id").alias("count_minus"),
                    f.sum("knowledge").alias("knowledge_minus"),
                    f.sum("comprehension").alias("comprehension_minus"),
                    f.sum("application").alias("application_minus"),
                    f.sum("analysis").alias("analysis_minus"),
                    f.sum("synthesis").alias("synthesis_minus"),
                    f.sum("evaluation").alias("evaluation_minus"))
                df_ai_history_minus = df_ai_history_minus.withColumnRenamed('student_id', 'student_id_minus') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_minus') \
                    .withColumnRenamed('date_id', 'date_id_minus')
                    # .withColumnRenamed('lu_type', 'lu_type_minus')

                df_ai_history_minus = df_ai_history_minus.where('student_id_minus is not null')
                print("AAAAAAAAAAAAAAAAAAAAAAAA")

                # dyf_ai_history_minus = DynamicFrame.fromDF(df_ai_history_minus, glueContext, "dyf_ai_history_plus")
                # dyf_ai_history_minus = dyf_ai_history_minus.select_fields(
                #     ['date_id', 'student_id', 'id', 'lo_type', 'knowledge_minus', 'comprehension_minus',
                #      'application_minus', 'analysis_minus', 'synthesis_minus', 'evaluation_minus',
                #      'count_minus']).rename_field('student_id', 'user_id_minus').rename_field(
                #     'date_id', 'date_id_minus').rename_field('lo_type', 'lo_type_minus').rename_field('id',
                #                                                                                       'learning_object_id_minus')

                # dyf_ai_history_minus.printSchema()
                # dyf_ai_history_minus.show(2)
                # dyf_ai_history_plus.printSchema()
                # dyf_ai_history_plus.show(2)

                print ("###########################################")
                # df_ai_history_minus = dyf_ai_history_minus.toDF()
                # df_ai_history_plus = dyf_ai_history_plus.toDF()
                df_join_history = df_ai_history_plus.join(df_ai_history_minus, (
                            df_ai_history_plus['student_id_plus'] == df_ai_history_minus['student_id_minus']) &
                                                          (df_ai_history_plus['date_id_plus'] == df_ai_history_minus[
                                                              'date_id_minus']) &
                                                          (df_ai_history_plus['learning_object_id_plus'] ==
                                                           df_ai_history_minus['learning_object_id_minus']), 'outer')

                df_join_history = df_join_history.withColumn("created_date_id", check_data_null(df_join_history.date_id_plus, df_join_history.date_id_minus)) \
                    .withColumn("user_id",
                                check_data_null(df_join_history.student_id_plus, df_join_history.student_id_minus)) \
                    .withColumn("source_system", f.lit("top_question_attempt_phonetic")) \
                    .withColumn("learning_object_id", check_data_null(df_join_history.learning_object_id_plus,
                                                                      df_join_history.learning_object_id_minus)) \
                    .withColumn("lu_id", f.lit(0))
                    # .withColumn("lu_id", check_lu_type(df_join_history.lu_type_plus, df_join_history.lu_type_minus))
                join_history = DynamicFrame.fromDF(df_join_history, glueContext, 'join_history')
                # join_history.printSchema()
                # join_history.printSchema()
                ################
                applymapping1 = ApplyMapping.apply(frame=join_history,
                                                   mappings=[("user_id", 'string', 'student_id', 'long'),
                                                             ("learning_object_id", "string", "learning_object_id", "long"),
                                                             # ("knowledge", "int", "knowledge", "long"),
                                                             # ("comprehension", "int", "comprehension", "long"),
                                                             # ("application", "int", "application", "long"),
                                                             # ("analysis", "int", "analysis", "long"),
                                                             # ("synthesis", "int", "synthesis", "long"),
                                                             # ("evaluation", "int", "evaluation", "long"),
                                                             ("knowledge_plus", "long", "knowledge_plus", "long"),
                                                             ("comprehension_plus", "long", "comprehension_plus", "long"),
                                                             ("application_plus", "long", "application_plus", "long"),
                                                             ("analysis_plus", "long", "analysis_plus", "long"),
                                                             ("synthesis_plus", "long", "synthesis_plus", "long"),
                                                             ("evaluation_plus", "long", "evaluation_plus", "long"),
                                                             ("knowledge_minus", "long", "knowledge_minus", "long"),
                                                             ("comprehension_minus", "long", "comprehension_minus", "long"),
                                                             ("application_minus", "long", "application_minus", "long"),
                                                             ("analysis_minus", "long", "analysis_minus", "long"),
                                                             ("synthesis_minus", "long", "synthesis_minus", "long"),
                                                             ("evaluation_minus", "long", "evaluation_minus", "long"),
                                                             ("count_plus", "long", "plus_number", "long"),
                                                             ("count_minus", "long", "minus_number", "long"),
                                                             # ("lo_type", "string", "lo_type", "long"),
                                                             ("source_system", "string", "source_system", "string"),
                                                             ("created_date_id", "string", "created_date_id", "long"),
                                                             ("lu_id", "int", "lu_type", "long")
                                                             # ("student_level", "string", "student_level", "string"),
                                                             # ("advisor_id", "string", "advisor_id", "long"),
                                                             # ("package_code", "string", "package_code", "string")
                                                             ])
                resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                                     transformation_ctx="resolvechoice1")
                dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields")

                print(dropnullfields1.count())
                dropnullfields1.show(5)

                print('START WRITE TO S3-------------------------')
                datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
                                                                         connection_options={
                                                                             "path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/"},
                                                                         format="parquet",
                                                                         transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')

                datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
                                                                           catalog_connection="glue_redshift",
                                                                           connection_options={
                                                                               "dbtable": "mapping_lo_student_history",
                                                                               "database": "dts_odin"
                                                                           },
                                                                           redshift_tmp_dir="s3n://dts-odin/temp1/mapping_lo_student_history/",
                                                                           transformation_ctx="datasink5")


                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                # dyf_knowledge_right = dyf_knowledge_right.resolveChoice(specs=[('lo_type', 'cast:byte')])
                # # df_knowledge_right = dyf_knowledge_right.toDF()
                # # chon cac truong va kieu du lieu day vao db
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timemodified", "long", "timestart", "long"),
                #                                             ("name", "string", "name", "string"),
                #                                             ("par_name", "string", "par_name", "string"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("learning_object_id", "long", "learning_object_id", "int"),
                #                                             ("date_id", "string", "date_id", "long"),
                #                                             ("knowledge", "int", "knowledge", "long"),
                #                                             ("comprehension", "int", "comprehension", "long"),
                #                                             ("application", "int", "application", "long"),
                #                                             ("analysis", "int", "analysis", "long"),
                #                                             ("synthesis", "int", "synthesis", "long"),
                #                                             ("evaluation", "int", "evaluation", "long"),
                #                                             ("phone_tic", "string", "phone_tic", "long"),
                #                                             ("lo_type", "byte", "lo_type", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice2")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")



                # clear the cache
                df_right.unpersist()
                df_knowledge_right.unpersist()
                # df_knowledge_right.unpersist()

                # get the max _key from the datasource
                df_temp = dyf_top_question_attempts.toDF()
                flag = df_temp.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')

                # overwrite the new flag in S3
                df.write.parquet("s3a://dts-odin/flag/flag_knowledge_ngu_am_top_quest_attempts", mode="overwrite")

            except Exception as e:
                print("###################### Exception ##########################")
                print(e)
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # start time: 2019-10-01 (Unix timestamp 1569888000)
    timestamp = 1569888000

    ## Phonetic
    dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="learning_object"
    )
    dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic')
    dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name'])
    # df_phonemic = dyf_phonemic.toDF()
    # df_phonemic = df_phonemic.withColumn('lo_name', convertedudf(df_phonemic.learning_object_name))
    # df_phonemic.show()
    # Extract the phonetic symbols
    df1 = dyf_phonemic.toDF()
    df1 = df1.select('learning_object_id', 'learning_object_name')
    # myArr = np.array(df1.select('phonemic').collect())
    arrPhonetic = [row.learning_object_name for row in df1.collect()]
    arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()]
    # print(unicode(arrPhonetic[2]))
    # print('ARR:', arrPhonetic)
    # print('ARR:', arrPhonetic[2].encode('utf-8', 'replace'))
    # print('ARR1 :', (u'i:' in arrPhonetic))

    # ETL TBHV
    # Custom function


    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_get_phone_tic_id(phonetic):
        phonetic = phonetic.encode('utf-8', 'replace').strip()
        for x in arrPhoneticId:
            p = x[0].encode('utf-8', 'replace').strip()
            if p == phonetic:
                return x[1]

    get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    def doSplitWord(word):
        rs = []
        if word is not None:
            i = 0
            size = len(word)
            while i < size:
                s = word[i:i + 2]
                i += 2
                if s in arrPhonetic:
                    rs.append(s)
                if s not in arrPhonetic:
                    i -= 2
                    s = word[i:i + 1]
                    i += 1
                    if s in arrPhonetic:
                        rs.append(s)

        return rs

    splitWord = udf(lambda x: doSplitWord(x))

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # knowledge points are added by default
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    # knowledge = []
    # comprehension points:
    # List of names that earn comprehension points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2']
    # application points:
    # List of names that earn application points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2']
    # analysis points:
    # List of names that earn analysis points:
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # synthesis points:
    # List of names that earn synthesis points:
    # P4_D1; P4_D2
    synthesis = []
    # evaluation points:
    # List of names that earn evaluation points:
    evaluation = []

    def doAddScore(name, state, type):
        arr = ['']
        score = 0
        if type == 'comprehension':
            arr = comprehension

        if type == 'application':
            arr = application

        if type == 'analysis':
            arr = analysis

        if type == 'synthesis':
            arr = synthesis

        if name is None:
            return 0
        name = name.lower()

        if state == state_right:
            score = 2
        if state == state_wrong:
            score = -1

        for x in arr:
            if x.lower() in name:
                return score
        return 0

    addScore = udf(doAddScore, IntegerType())
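    # Illustrative calls (the name value is an assumption based on the lists above):
    #   doAddScore('P2_D3 listening', state_right, 'analysis') -> 2
    #   doAddScore('P2_D3 listening', state_wrong, 'analysis') -> -1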

    # special characters to be stripped
    special_str = '["] ;'

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_quiz_attempts"
    )
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(['_key', 'id', 'timestart', 'quiz'])

    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 'cast:long')])

    # print dyf_top_quiz_attempts.count()
    # dyf_top_quiz_attempts.show(2)

    dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts,
                                         f=lambda x: x["timestart"] >= timestamp)

    # print dyf_top_quiz_attempts.count()
    # dyf_top_quiz_attempts.show()

    # handle the case where start_read is null
    # try:
    #     # # read the flag checkpoint from S3
    #     df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key with the flag, keep only records with key > flag
    #     dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    # print('the number of new contacts: ', dyf_top_quiz_attempts.count())

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="do_top_user"
        )
        dyf_top_user = dyf_top_user.select_fields(
            ['id', 'student_id']).rename_field('id', 'top_user_id')
        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_question"
        )
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name'])
        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_result_ai
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle",
            table_name="top_result_ai"
        )
        dyf_top_result_ai = dyf_top_result_ai.select_fields(
            ['question_id', 'attempt_id', 'user_id', 'ratio', 'right_word', 'wrong_word'])

        # JOIN and FILTER the tables according to the conditions
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question, 'question_id', 'id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts, 'attempt_id', 'id')
        dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id', 'top_user_id')

        # dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["student_id"] == 259442)

        # dyf_join02.show()
        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:

                # print("COUNT 1:", df_study.count())
                # Strip the special characters [ ] "
                # The raw data looks like: ["house","her","to","how","get","long"] or "environmental", ...
                # df_study = df_study.select(
                #     'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word,
                #                                                                                     special_str, ''), f.translate(df_study.wrong_word,
                #                                        special_str, ''))
                df_study = df_study.select(
                    'quiz', 'name', 'student_id', 'timestart', 'right_word', 'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                    .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the sentence into an array of words:
                # house, her => [house, her]
                # Analyze the correctly answered words
                df_study_right = df_study.withColumn("right_word_list", f.split(
                    df_study.right_word_new, ','))

                # Split the array column into multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_right = df_study_right.withColumn("right", f.explode(df_study_right.right_word_list))
                # convert to lowercase
                df_study_right = df_study_right.withColumn("right", f.lower(f.col("right")))
                df_study_right = df_study_right.select('quiz', 'name', 'student_id', 'timestart', 'right')
                # print("COUNT 2:", df_study_right.count())
                # df_study_right.printSchema()
                # df_study_right.show()
                dyf_study_right = DynamicFrame.fromDF(df_study_right, glueContext, "dyf_study_right")
                ## Learning Object
                # dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                #     database="nvn_knowledge",
                #     table_name="nvn_knowledge_learning_object"
                # )
                dyf_learning_object = Filter.apply(frame=dyf_learning_object,
                                            f=lambda x: x["learning_object_type"] == 'vocabulary')
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name', 'transcription'])
                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn("learning_object_name", f.lower(f.col("learning_object_name")))
                # strip the quote and comma characters from the transcription
                df_learning_object = df_learning_object.withColumn("phone_tic_new",
                                                                   f.translate(df_learning_object.transcription, '\',', ''))

                df_learning_object = df_learning_object.withColumn("phone_tic_tmp",
                                                                   splitWord(df_learning_object.phone_tic_new))
                df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01",
                                                                   f.translate(df_learning_object.phone_tic_tmp, '[]',
                                                                               ''))
                df_learning_object = df_learning_object.withColumn("phone_tic_arr",
                                                                   f.split(df_learning_object.phone_tic_tmp_01, ','))

                df_learning_object = df_learning_object.withColumn("split_phonetic",
                                                                   f.explode(df_learning_object.phone_tic_arr))

                df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name',
                                                               'split_phonetic')

                dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right, dyf_learning_object, 'right', 'learning_object_name')


                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()
                # 1
                df_knowledge_right = dyf_knowledge_right.toDF()
                # df_knowledge_right = df_knowledge_right.withColumn("right_phonetic",
                #                                                    f.explode(df_knowledge_right.phone_tic_arr))
                df_knowledge_right = df_knowledge_right.select('timestart', 'name', 'student_id', 'split_phonetic')
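                # Map each exploded phonetic symbol to its learning_object_id via the
                # get_phone_tic_id UDF (assumed to be defined earlier in this job).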
                df_knowledge_right = df_knowledge_right.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_right.split_phonetic))
                # dyf_phonemic_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_phonemic_right")



                # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'split_phonetic', 'learning_object_name')
                #
                # dropnullfields = DropNullFields.apply(frame=dyf_phonemic_right, transformation_ctx="dropnullfields")
                # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history_v06",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/",
                #                                                            transformation_ctx="datasink6")

                # dyf_knowledge_wrong.printSchema()
                # Add points for the correct words
                # df_knowledge_right = dyf_phonemic_right.toDF()
                # print("COUNT 4:")
                # df_knowledge_right.printSchema()
                df_knowledge_right.cache()

                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(2)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd')) \
                    .withColumn("lo_type", f.lit(2))

                dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                # dropnullfields = DropNullFields.apply(frame=dyf_knowledge_right, transformation_ctx="dropnullfields")
                # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history_v02",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/",
                #                                                            transformation_ctx="datasink6")

                # print("COUNT 444444444444444:", df_knowledge_right.count())
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()
                #
                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                # # select the fields and data types to push into the db
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timestart", "long", "timestart", "long"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("name", 'string', 'name', 'string'),
                #                                             ("learning_object_id", "long", "learning_object_id", "long"),
                #                                             ("date_id", "string", "date_id", "long"),
                #                                             ("knowledge", "int", "knowledge", "long"),
                #                                             ("comprehension", "int", "comprehension", "long"),
                #                                             ("application", "int", "application", "long"),
                #                                             ("analysis", "int", "analysis", "long"),
                #                                             ("synthesis", "int", "synthesis", "long"),
                #                                             ("evaluation", "int", "evaluation", "long"),
                #                                             ("lo_type", "int", "lo_type", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
                #
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "t_temp_right_learning_object_phonetic",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")
                # END Cong diem cac tu dung

                ##################################################
                # Subtract points for the wrong words: processed the same way as the correct words.
                # Scoring rule: -1 point for each wrong word
                df_study_wrong = df_study.withColumn("wrong_word_list", f.split(
                    df_study.wrong_word_new, ','))

                # Explode the array column => multiple rows
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_wrong = df_study_wrong.withColumn("wrong", f.explode(df_study_wrong.wrong_word_list))
                #convert to lowercase
                df_study_wrong = df_study_wrong.withColumn("wrong",  f.lower(f.col("wrong")))
                df_study_wrong = df_study_wrong.select('quiz', 'name', 'student_id', 'timestart', 'wrong')
                # print("COUNT 2222:", df_study_wrong.count())
                # df_study_wrong.printSchema()
                # df_study_wrong.show()
                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong, glueContext, "dyf_study_wrong")
                ## Learning Object
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong, dyf_learning_object, 'wrong', 'learning_object_name')

                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                # df_knowledge_wrong = df_knowledge_wrong.withColumn("wrong_phonetic",
                #                                                    f.explode(df_knowledge_wrong.phone_tic_arr))
                df_knowledge_wrong = df_knowledge_wrong.select('timestart', 'name', 'student_id', 'split_phonetic')

                df_knowledge_wrong = df_knowledge_wrong.withColumn("learning_object_id",
                                                                   get_phone_tic_id(df_knowledge_wrong.split_phonetic))

                # dyf_study_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_study_wrong")

                # dyf_phonemic_wrong = Join.apply(dyf_study_wrong, dyf_phonemic, 'split_phonetic', 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()
                # Subtract points for the wrong words
                # df_knowledge_wrong = dyf_phonemic_wrong.toDF()
                df_knowledge_wrong.cache()

                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-1)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))

                # df_knowledge_wrong.printSchema()
                # df_knowledge_wrong.show()
                #
                # dyf_knowledge_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_knowledge_wrong")
                #
                # # select the fields and data types to push into the db
                # applymapping1 = ApplyMapping.apply(frame=dyf_knowledge_wrong,
                #                                    mappings=[("timestart", "long", "timestart", "long"),
                #                                              ("name", 'string', 'name', 'string'),
                #                                              ("student_id", 'int', 'student_id', 'long'),
                #                                              ("id", "int", "learning_object_id", 'long'),
                #                                              ("date_id", "string", "date_id", "long"),
                #                                              ("knowledge", "int", "knowledge", "long"),
                #                                              ("comprehension", "int", "comprehension", "long"),
                #                                              ("application", "int", "application", "long"),
                #                                              ("analysis", "int", "analysis", "long"),
                #                                              ("synthesis", "int", "synthesis", "long"),
                #                                              ("evaluation", "int", "evaluation", "long")])
                # resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                #                                      transformation_ctx="resolvechoice1")
                # dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1")
                #
                # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "t_temp_right_learning_object_phonetic",
                #                                                                "database": "dts_odin",
                #                                                                "postactions": """ call proc_knowledge_ngu_am_top_result_ai () """
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")



                ### Save the mapping_lo_student_history table
                df_knowledge_right = df_knowledge_right.groupby('student_id', 'date_id',
                                                                'learning_object_id').agg(
                    f.count('knowledge').alias("count_plus"),
                    f.sum('knowledge').alias("knowledge_plus"),
                    f.sum('comprehension').alias("comprehension_plus"),
                    f.sum('application').alias("application_plus"),
                    f.sum('analysis').alias("analysis_plus"),
                    f.sum('synthesis').alias("synthesis_plus"),
                    f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where('student_id is not null')

                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id',
                                                                'learning_object_id').agg(
                    f.count('knowledge').alias("count_minus"),
                    f.sum('knowledge').alias("knowledge_minus"),
                    f.sum('comprehension').alias("comprehension_minus"),
                    f.sum('application').alias("application_minus"),
                    f.sum('analysis').alias("analysis_minus"),
                    f.sum('synthesis').alias("synthesis_minus"),
                    f.sum('evaluation').alias("evaluation_minus")) \
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')
                df_knowledge_wrong = df_knowledge_wrong.where('student_id_wrong is not null')
                df_knowledge = df_knowledge_right.join(df_knowledge_wrong, (
                        df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong']) & (
                                                               df_knowledge_right['date_id'] ==
                                                               df_knowledge_wrong['date_id_wrong']) & (
                                                               df_knowledge_right['learning_object_id'] ==
                                                               df_knowledge_wrong['learning_object_id_wrong']), 'outer')
                df_knowledge = df_knowledge.withColumn("user_id",
                                check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id",
                                check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai_phonetic')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext, "df_knowledge")

                # dyf_knowledge.printSchema()
                dyf_knowledge.printSchema()
                dyf_knowledge.show()

                # dyf_knowledge = DynamicFrame.fromDF(dyf_knowledge, glueContext, "dyf_knowledge")
                # select the fields and data types to push into the db
                applymapping = ApplyMapping.apply(frame=dyf_knowledge,
                                                  mappings=[("user_id", 'string', 'student_id', 'long'),
                                                             ("learning_object_id", "string", "learning_object_id", "long"),
                                                             # ("knowledge", "int", "knowledge", "long"),
                                                             # ("comprehension", "int", "comprehension", "long"),
                                                             # ("application", "int", "application", "long"),
                                                             # ("analysis", "int", "analysis", "long"),
                                                             # ("synthesis", "int", "synthesis", "long"),
                                                             # ("evaluation", "int", "evaluation", "long"),
                                                             ("knowledge_plus", "long", "knowledge_plus", "long"),
                                                             ("comprehension_plus", "long", "comprehension_plus", "long"),
                                                             ("application_plus", "long", "application_plus", "long"),
                                                             ("analysis_plus", "long", "analysis_plus", "long"),
                                                             ("synthesis_plus", "long", "synthesis_plus", "long"),
                                                             ("evaluation_plus", "long", "evaluation_plus", "long"),
                                                             ("knowledge_minus", "long", "knowledge_minus", "long"),
                                                             ("comprehension_minus", "long", "comprehension_minus", "long"),
                                                             ("application_minus", "long", "application_minus", "long"),
                                                             ("analysis_minus", "long", "analysis_minus", "long"),
                                                             ("synthesis_minus", "long", "synthesis_minus", "long"),
                                                             ("evaluation_minus", "long", "evaluation_minus", "long"),
                                                             ("count_plus", "long", "plus_number", "long"),
                                                             ("count_minus", "long", "minus_number", "long"),
                                                             # ("lo_type", "string", "lo_type", "long"),
                                                             ("source_system", "string", "source_system", "string"),
                                                             ("created_date_id", "string", "created_date_id", "long"),
                                                             ("lu_id", "int", "lu_type", "long")
                                                             # ("student_level", "string", "student_level", "string"),
                                                             # ("advisor_id", "string", "advisor_id", "long"),
                                                             # ("package_code", "string", "package_code", "string")
                                                             ])
                resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                    transformation_ctx="resolvechoice")
                dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")

                print('START WRITE TO S3-------------------------')

                datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3",
                                                                         connection_options={
                                                                             "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                                                                             "partitionKeys": ["created_date_id", "source_system"]},
                                                                         format="parquet",
                                                                         transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')

                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/",
                #                                                            transformation_ctx="datasink5")


                ### END Save the mapping_lo_student_history table
                # END Subtract points for the wrong words
                # get the max _key from the datasource
                datasource = dyf_top_quiz_attempts.toDF()
                flag = datasource.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')

                # overwrite the new flag to S3
                df.write.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai", mode="overwrite")
                # clear the cache
                df_study.unpersist()
                df_knowledge_right.unpersist()
                # df_knowledge_right.unpersist()
            except Exception as e:
                print("###################### Exception ##########################")
                print(e)
示例#22
0
distinct = lambda c: F.countDistinct(c)

count = F.count

sum = F.sum
sum_pos = lambda c: F.sum(F.when(c>0, c))
sum_neg = lambda c: F.sum(F.when(c<0, c))

min = F.min
max = F.max
avg = F.avg
stddev = F.stddev
skewness = F.skewness
kurtosis = F.kurtosis

digits_only = lambda c: F.sum((F.length(F.translate(c, '0123456789', ''))<F.length(c)).cast('int'))
spaces_only = lambda c: F.sum(((F.length(F.translate(c, ' \t', ''))==0) & (F.length(c)>0)).cast('int'))
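# Usage sketch (assumes `import pyspark.sql.functions as F` and a DataFrame `df`
# with a string column "code"; the column name is illustrative only):
# df.agg(
#     distinct(F.col("code")).alias("distinct_codes"),
#     digits_only(F.col("code")).alias("rows_containing_a_digit"),
#     spaces_only(F.col("code")).alias("rows_whitespace_only"),
# ).show()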

all = {
    'type': typeof(),
    'integer': integer,
    'boolean': boolean,
    'top3': topn(),
    'percentiles': percentiles(),
    'null': null,
    'zero': zero,
    'empty': empty,
    'pos': pos,
    'neg':neg,
    'distinct': distinct,
    'sum':sum,
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(),
                           "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (pyspark.sql.SparkSession.builder.master(
        f"local[{os.cpu_count() - 1}]").config(
            "spark.eventLog.enabled",
            "true").config("spark.eventLog.dir", "/spark-events").config(
                "spark.sql.execution.arrow.pyspark.enabled", "true").config(
                    "spark.driver.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config(
                    "spark.executor.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config("spark.driver.memory", f"{mem_gib}g").config(
                    "spark.history.fs.logDirectory", "/spark-events").config(
                        "spark.sql.execution.arrow.maxRecordsPerBatch",
                        "1").config("spark.jars", tar_jar).config(
                            "spark.local.dir",
                            "/mnt/disks/spark-scratch/").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)

    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"),
         F.col("text_document_id"),
         F.col("licenseurl")])

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_"
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            create_wav_scp(posix_wav_scp, training_sample_rows,
                           FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o  decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 works just fine

    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:

            posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                       temp_dir_name, ctm_out_dir)
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                    temp_dir_name, FLAGS.work_dir)
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir,
                                                 num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
  /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
  --frame-subsampling-factor=3 \
  --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
  --max-active=7000 \
  --beam=15.0 \
  --lattice-beam=0.1 \
  --acoustic-scale=1.0 \
  --cuda-decoder-copy-threads=2 \
  --cuda-worker-threads={os.cpu_count() // num_gpus} \
  --segmentation=true \
  --cuda-use-tensor-cores=true \
  --max-batch-size=150 \
  --num-channels=250 \
  --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
  --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
  /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
  /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
  scp,p:{posix_wav_scp_shard} \
  {posix_ctm_out_dir}
  """
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir,
                                  "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer",
             "--output-cer"])  # , "--output-sws", "--output-levenshtein"])

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt")
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path,
                                      15_000, 3_000)

        ctm_df = (spark.read.format("binaryFile").option(
            "pathGlobFilter", "*.ctm").load(ctm_out_dir))
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(
                F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content",
                                   fix_text_udf(F.col("content"))).drop(
                                       "path", "length", "modificationTime",
                                       "content")

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path,
                                          training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript",
            normalize_english_text_udf(transcripts_df.transcript))
        ctm_df = ctm_df.join(transcripts_df,
                             ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()

        sys.stdout.flush()

        alignments_df.write.mode("overwrite").format("json").save(
            alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)

        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            &
            (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )

        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl")))
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl")))
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier
                    == filtered_licenseurl_df.identifier)
                & (alignments_df.text_document_id
                   == filtered_licenseurl_df.text_document_id),
                how="inner",
            )
            alignments_df = alignments_df.drop(
                filtered_licenseurl_df.identifier).drop(
                    filtered_licenseurl_df.text_document_id)

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
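        # Per-segment duration in milliseconds: end_ms - start_ms for each aligned chunk.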
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType([
                        T.StructField("cer", T.FloatType()),
                        T.StructField("end_ms", T.LongType()),
                        T.StructField("label", T.StringType()),
                        T.StructField("start_ms", T.LongType()),
                        T.StructField("wer", T.FloatType()),
                        T.StructField("duration_ms", T.LongType()),
                    ]))),
        )

        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment:
                (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType([
                    T.StructField("cer", T.ArrayType(T.FloatType())),
                    T.StructField("end_ms", T.ArrayType(T.LongType())),
                    T.StructField("label", T.ArrayType(T.StringType())),
                    T.StructField("start_ms", T.ArrayType(T.LongType())),
                    T.StructField("wer", T.ArrayType(T.FloatType())),
                    T.StructField("duration_ms", T.ArrayType(T.LongType())),
                ])),
        )

        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(
                F.expr(
                    "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"
                )) / 1000.0 / 60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1] # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df,
                                                 audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory
        # Remove "." because it has special meaning in webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient)
        name = F.concat(F.col("identifier"), F.lit("/"),
                        F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # Sanity-check the generated names against path-length limits
        # (assumes the intended source frame is alignments_audio_df).
        name_rows = alignments_audio_df.select(name.alias("name")).collect()
        for row in name_rows:
            assert len(row.name) < 4096
            for chunk in row.name.split("/"):
                assert len(chunk) < 256
        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        # )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(
                F.arrays_zip("aligned_chunks.audio_name",
                             "aligned_chunks.audio"))).select(
                                 "col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias(
                    "duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir,
                                          "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir,
                                "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition",
            number_of_rows)
        spark2.conf.set("spark.sql.files.minPartitionNum",
                        FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100)

        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards,
                                             F.col("key"))
        # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(
            tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)
        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys

        # sys.stdout.flush()
        # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir,
                                            "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )))
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) /
             1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df,
                F.col("audio_filepath") == F.col("key")).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir)

    single_manifest_dir = os.path.join(FLAGS.work_dir,
                                       "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")
    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(
            single_manifest_dir)

        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(
            single_tar_dir)
示例#24
0
# Use the regexp_replace function to substitute color names in our Description column:
from pyspark.sql.functions import regexp_replace

regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    col("Description"),
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean")).show(2, False)

# COMMAND ----------

# Another task might be to replace given characters with other characters. Spark provides the translate function to replace these values.
from pyspark.sql.functions import translate

df.select(col("Description"), translate(col("Description"), "LEET",
                                        "1337")).show(2)

# COMMAND ----------

# Using regexp_extract we can pull the matching strings from the column values.
from pyspark.sql.functions import regexp_extract

extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
    col("Description")).show(2, False)

# COMMAND ----------

# A contains-style check simply tests for the existence of a string in a column value; in Python this is done with instr.
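# A hedged sketch of the instr-based check described above (it mirrors the instr
# examples that appear later in this document; the "Description" column name is
# assumed from the surrounding snippets):
from pyspark.sql.functions import instr

containsBlack = instr(col("Description"), "BLACK") >= 1
df.withColumn("hasSimpleColor", containsBlack)\
  .where("hasSimpleColor")\
  .select("Description").show(2, False)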
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
  .show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
     regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
     col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import instr
示例#26
0
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace,col,translate,regexp_extract,instr
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

df= spark.read.format("csv").option("header","true").option("inferSchema","true").load("C:/Users/Lenovo/Desktop/spark_data/retail_store.csv")
# 'regexp_replace' is used to replace the listed color names with NOCOLOR
str1="BLACK|WHITE|RED|BLUE|GREEN"
df.select(regexp_replace(col("Description"),str1,"NOCOLOR").alias("no_color_column"),col("Description")).show(5)

#'translate' function is to replace given characters with other characters
df.select(translate(col("Description"),"ABCD","1234"),col("Description")).show(5)

#'regexp_extract' is used to extract values
df.select(regexp_extract(col("Description"),str1,0).alias("color"),col("Description")).show(5)

# 'instr' function checks for the existence of a value
containsRed= instr(col("Description"),"RED")>=1
containsWhite= instr(col("Description"),"WHITE")>=1
df.withColumn("hasColor",containsWhite| containsRed).where("hasColor").select("Description").show(5)
    rtrim(lit("   HELLO   ")).alias("rtrim"),
    trim(lit("   HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLO"), 10, " ").alias("rpad")).show(2)

## Regular expressions
# Replace matched values in the Description column with COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)

# Replace the given characters with other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)

# Extract the color name
from pyspark.sql.functions import regexp_extract

extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str,
                   1).alias("color_clean")).show(6)

# Check whether the value exists in the data
# instr
from pyspark.sql.functions import instr
containBlack = instr(col("Description"), "BLACK") >= 1
df.withColumn("HasSimpleColor",containBlack)\
  .where("HasSimpleColor")\
示例#28
0
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(25)
	
    userRecs_1 = userRecs.select(userRecs['userId'],
                                 userRecs['recommendations.movieId'].cast("string").alias('movies'))

    # Split the stringified recommendation list into 25 columns, movieid1 .. movieid25
    movie_cols = [F.split(userRecs_1.movies, ',').getItem(i).alias('movieid{}'.format(i + 1))
                  for i in range(25)]
    userRecs_2 = userRecs_1.select(userRecs_1['userId'], *movie_cols)

    # Strip the leading '[' and trailing ']' left over from casting the array to a string
    userRecs_2 = userRecs_2.withColumn('movieid1', F.translate('movieid1', '[', ''))
    userRecs_2 = userRecs_2.withColumn('movieid25', F.translate('movieid25', ']', ''))
	
    #userRecs_2.printSchema()
    #userRecs_2.show()

    #Import dataframe into MySQL
    
    #userRecs_2.write.format('jdbc').options(url='jdbc:mysql://us-cdbr-iron-east-05.cleardb.net/heroku_54c3b520208a1ef?useServerPrepStmts=false&rewriteBatchedStatements=true', driver='com.mysql.jdbc.Driver',dbtable='collab_reco',user='******',password='******').mode('append').save()

    spark.stop()
示例#29
0
    def __init__(self, spark):
        self.spark = spark
        df_raw = spark.read.format("csv").option("delimiter", ",") \
        .option("quote", "\"").option("escape", "\"") \
        .option("header", "true").option("inferSchema", "true") \
        .load("datasetfinaltotal.csv")
        df_raw1 = df_raw.dropna(how='any')
        df_raw1.show()
        # disease-suggestion model
        df_raw2 = df_raw1.select('lydo','chandoan', translate(col('lydo'),".;",",,").alias('trieuchung'))
        df_raw2 = df_raw2.select("trieuchung","chandoan").distinct()
        df_raw2 = df_raw2.withColumn('trieuchung', explode(split('trieuchung',',')))
        df_raw3 = df_raw2.select('trieuchung','chandoan').distinct()
        df_raw3 = df_raw3.withColumn('trieuchung', trim(col('trieuchung')))
        df_raw3 = df_raw3.select("trieuchung","chandoan").distinct()
        df_raw3 = df_raw3.filter(col('trieuchung')!="")
        df_raw3 = df_raw3.filter(length(regexp_replace("trieuchung", " ", " "))>2)
        df_raw3.show()
        
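        # Build an implicit (trieuchung x chandoan) rating matrix: pd.crosstab counts
        # symptom/diagnosis co-occurrences, and melt flattens it back into
        # (trieuchung, chandoan, rating) rows that ALS can consume.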
        pddataframe = df_raw3.toPandas()
        dfpd = pd.crosstab(pddataframe['trieuchung'], pddataframe['chandoan'])
        flattened = pd.DataFrame(dfpd.to_records())
        
        
        flattend1 = flattened.melt(id_vars=["trieuchung"], var_name="chandoan", value_name="rating")
        df_final = spark.createDataFrame(flattend1)
        self.df_final = df_final
        userIndexer = StringIndexer(inputCol='trieuchung',outputCol='trieuchungIndex').fit(df_final)
        itemIndexer = StringIndexer(inputCol='chandoan',outputCol='chandoanIndex').fit(df_final)
        pipeline = Pipeline(stages=[userIndexer, itemIndexer])
        df_testfinal = pipeline.fit(df_final).transform(df_final)
        df_testfinal.show()
        self.df_testfinal = df_testfinal
        train, test = df_testfinal.randomSplit([0.8,0.2])
        self.train = train
        self.test = test
        self.__trainmodelgoiybenh()
        userRecs = self.model.recommendForAllUsers(10)
        flatUserRecs = userRecs.withColumn("trieuchungandrating",explode(userRecs.recommendations)).select('trieuchungIndex','trieuchungandrating.*')
        userIndexer = StringIndexer(inputCol='trieuchung',outputCol='trieuchungIndex').fit(self.df_final)
        itemIndexer = StringIndexer(inputCol='chandoan',outputCol='chandoanIndex').fit(self.df_final)
        itemConverter = IndexToString(inputCol='chandoanIndex', outputCol='chandoan',labels=itemIndexer.labels)
        userConverter = IndexToString(inputCol='trieuchungIndex', outputCol='trieuchung', labels=userIndexer.labels)
        convertedUserRec = Pipeline(stages=[userConverter,itemConverter]).fit(self.df_testfinal).transform(flatUserRecs)
        self.convertedUserRec = convertedUserRec
        # drug suggestion model (diagnosis -> product name)
        df_goiythuoc = df_raw1.select('chandoan','tenhh').distinct()
        df_goiythuoc.show()
        
        pddataframegoiythuoc = df_goiythuoc.toPandas()
        dfpdgoiythuoc = pd.crosstab(pddataframegoiythuoc['chandoan'], pddataframegoiythuoc['tenhh'])
        flattenedgoiythuoc = pd.DataFrame(dfpdgoiythuoc.to_records())
        
        
        flattendgoiythuoc1 = flattenedgoiythuoc.melt(id_vars=["chandoan"], var_name="tenhh", value_name="rating")
        df_finalgoiythuoc = spark.createDataFrame(flattendgoiythuoc1)
        userIndexergoiythuoc = StringIndexer(inputCol='chandoan',outputCol='chandoanIndex').fit(df_finalgoiythuoc)
        itemIndexergoiythuoc = StringIndexer(inputCol='tenhh',outputCol='tenhhIndex').fit(df_finalgoiythuoc)

        pipeline = Pipeline(stages=[userIndexergoiythuoc, itemIndexergoiythuoc])
        df_testfinalgoiythuoc=pipeline.fit(df_finalgoiythuoc).transform(df_finalgoiythuoc)
        traingoiythuoc, testgoiythuoc = df_testfinalgoiythuoc.randomSplit([0.8,0.2])
        self.traingoiythuoc=traingoiythuoc
        self.testgoiythuoc=testgoiythuoc
        self.__trainmodelgoiythuoc()
        userRecsgoiythuoc = self.modelgoiythuoc.recommendForAllUsers(20)
        flatUserRecsgoiythuoc = userRecsgoiythuoc.withColumn("chuandoanandrating",explode(userRecsgoiythuoc.recommendations)).select('chandoanIndex','chuandoanandrating.*')
        userConvertergoiythuoc = IndexToString(inputCol='chandoanIndex', outputCol='chandoan',labels=userIndexergoiythuoc.labels)
        itemConvertergoiythuoc = IndexToString(inputCol='tenhhIndex', outputCol='tenhh',labels=itemIndexergoiythuoc.labels)
        convertedUserRecgoiythuoc = Pipeline(stages=[userConvertergoiythuoc,itemConvertergoiythuoc]).fit(df_testfinalgoiythuoc).transform(flatUserRecsgoiythuoc)
        self.convertedUserRecgoiythuoc=convertedUserRecgoiythuoc
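The private trainers __trainmodelgoiybenh and __trainmodelgoiythuoc are not shown in this excerpt; the code above only relies on them setting self.model and self.modelgoiythuoc. A purely illustrative sketch of what such a trainer could look like with pyspark.ml's ALS, assuming the index and rating columns produced above (the hyperparameters are assumptions, not the author's values):

    # Hypothetical sketch of the elided trainer, not the original implementation.
    def __trainmodelgoiybenh(self):
        from pyspark.ml.recommendation import ALS
        als = ALS(userCol='trieuchungIndex',   # symptom index from the StringIndexer
                  itemCol='chandoanIndex',     # diagnosis index
                  ratingCol='rating',          # co-occurrence counts from the crosstab
                  coldStartStrategy='drop',    # illustrative choice
                  nonnegative=True)            # illustrative choice
        self.model = als.fit(self.train)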
示例#30
0
def text_clustering(dataFrame,
                    k_value,
                    w2v=False,
                    w2v_value=None,
                    seed=2137,
                    normalize=True,
                    plot=True):
    """
    args:
        -dataFrame: spark Data Frame
        -k_value: number of clusters in k-means algorithm
        -w2v: if True word2Vec is used and w2v_value must be specified, otherwise tf-idf is used
        -w2v_value: number of parameters to be returned with Word2Vec
        -seed: seed
        -normalize: should normalization after Word2Vec be performed?
        -plot: if True, clusters are visualized with the use of PCA
        
    """

    #Data preprocessing
    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    dataFrame = tokenizer.transform(dataFrame)
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    dataFrame = remover.transform(dataFrame)

    if w2v and w2v_value is None:
        raise ValueError('You have to provide the w2v_value parameter')

    if not w2v:  #tf-idf
        # use the stop-word-filtered tokens, consistent with the Word2Vec branch
        hashingTF = HashingTF(inputCol="words",
                              outputCol="rawFeatures",
                              numFeatures=20)
        featurizedData = hashingTF.transform(dataFrame)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        memes_df = idfModel.transform(featurizedData)

    else:  #word2vec
        word2Vec = Word2Vec(vectorSize=w2v_value,
                            seed=seed,
                            inputCol="words",
                            outputCol="features_unnormalized")
        model_w2v = word2Vec.fit(dataFrame)
        memes_df = model_w2v.transform(dataFrame)
        model_w2v.write().overwrite().save("hdfs:///models/model_w2v")

        if normalize:
            scaler = StandardScaler(inputCol="features_unnormalized",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=True)
            scalerModel = scaler.fit(memes_df)
            memes_df = scalerModel.transform(memes_df)

    #kmeans
    kmeans = KMeans(k=k_value, seed=seed)
    model_kmeans = kmeans.fit(memes_df)
    memes_df = model_kmeans.transform(memes_df)
    model_kmeans.write().overwrite().save("hdfs:///models/model_kmeans")

    #clustering evaluation
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(memes_df)

    centers = model_kmeans.clusterCenters()

    if plot:

        import matplotlib.pyplot as plt  # imported lazily: some virtual environments have trouble with a module-level import

        #pca
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        model_pca = pca.fit(memes_df)
        memes_df = model_pca.transform(memes_df)
        #memes_df.show()

        centers_pca = [None] * len(centers)
        for i in range(len(centers)):
            centers_pca[i] = np.multiply(model_pca.pc.toArray().T,
                                         centers[i]).sum(axis=1)
        centers_pca = np.array(centers_pca)

        #plot section: pcaFeatures is a vector column, so cast it to a string
        #and parse the two PCA coordinates back out as doubles
        split_col = functions.split(memes_df["pcaFeatures"].cast(StringType()),
                                    ',')
        memes_df = memes_df.withColumn(
            'x',
            translate(split_col.getItem(0), "[", "").cast(DoubleType()))
        memes_df = memes_df.withColumn(
            'y',
            translate(split_col.getItem(1), "]", "").cast(DoubleType()))
        #memes_df.show(truncate = False)

        df = memes_df.toPandas()
        groups = df.groupby('prediction')
        fig, ax = plt.subplots()
        ax.margins(0.05)
        for name, group in groups:
            ax.plot(group.x,
                    group.y,
                    marker='o',
                    linestyle='',
                    ms=5,
                    label=name)
            ax.text(centers_pca[name, 0],
                    centers_pca[name, 1],
                    s=name,
                    fontsize=10)
        ax.legend()
        ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format(
            k_value, w2v_value, silhouette))
        plt.show()
        print("PCA, explained variance= {0}".format(
            model_pca.explainedVariance))

    return memes_df
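A short usage sketch for text_clustering, assuming an active SparkSession, a DataFrame with a "text" column, and that the hdfs:///models/ paths used inside the function are writable; the sample rows below are made up for illustration.

# Hypothetical usage; the sample texts are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("text-clustering").getOrCreate()
df = spark.createDataFrame(
    [("funny cat picture",), ("cute dog photo",),
     ("stock market crash",), ("interest rates rise",)],
    ["text"],
)

# tf-idf features, 2 clusters, no plotting
clustered = text_clustering(df, k_value=2, plot=False)
clustered.select("text", "prediction").show()

# The Word2Vec path (w2v=True, w2v_value=...) needs a larger corpus,
# since Word2Vec ignores words that occur fewer than 5 times by default.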