def __calc_distribution(
        self,
        df: DataFrame
    ) -> dict:
        """Count the rows of ``df`` per ``predicted_score`` bucket.

        Buckets are 0.05 wide and half-open (``lower <= score < upper``)
        from 0.00 up to 0.60; the final bucket is closed on both ends and
        covers ``0.60 <= score <= 1.00``.

        Args:
            df: DataFrame exposing a ``predicted_score`` column.

        Returns:
            Dict mapping a label such as ``'0.05 - 0.10'`` to the result
            of ``df.where(<bucket condition>).count()``, in ascending
            bucket order.
        """
        bucket_starts = (
            0.00, 0.05, 0.10, 0.15, 0.20,
            0.25, 0.30, 0.35, 0.40, 0.45,
            0.50, 0.55, 0.60,
        )
        counts = {}
        score_col = df['predicted_score']

        for start in bucket_starts:
            # Format to two decimals so float noise (e.g. 0.1 + 0.05)
            # never leaks into the labels or the bounds.
            # NOTE(review): the bounds handed to the comparisons are these
            # formatted strings (only the final 1.00 is a float), so the
            # comparison relies on the DataFrame library coercing them --
            # confirm this is intended.
            low = '{:.2f}'.format(start)
            if low != '0.60':
                high = '{:.2f}'.format(start + 0.05)
                label = low + ' - ' + high
                in_bucket = (score_col >= low) & (score_col < high)
            else:
                # Last bucket absorbs everything up to and including 1.00.
                label = low + ' - 1.00'
                in_bucket = (score_col >= low) & (score_col <= 1.00)
            counts[label] = df.where(in_bucket).count()

        return counts
    def embed_vector_to_not_matched_words(self, df: DataFrame,
                                          df_vector_filler: DataFrame):
        """Look up vectors for the words in ``df`` that have none.

        Rows of ``df`` whose ``word_vector`` is null are reduced to the
        sentence-id column and ``word``, passed through
        ``assign_alternative_match_word_based_on_lavenshtein`` together
        with ``df_vector_filler``, and the result is left-joined back to
        ``df_vector_filler`` on its ``match`` column.

        Args:
            df: DataFrame with ``word``, ``word_vector`` and the column
                named by ``self.sentence_col_id``.
            df_vector_filler: DataFrame with ``word_vector`` and the column
                named by ``self.word_col_name``.

        Returns:
            DataFrame with the sentence-id column, ``word`` (from the
            unmatched rows) and ``word_vector`` (from the filler; left
            join, so it stays null when no filler row matches).
        """
        has_no_vector = col('word_vector').isNull()
        unmatched = df.where(has_no_vector).select(
            self.sentence_col_id, 'word')

        # Presumably adds a `match` column holding the closest filler word
        # (the join below reads `match` from the result) -- verify against
        # the helper.
        with_match = self.assign_alternative_match_word_based_on_lavenshtein(
            unmatched, df_vector_filler)

        base = with_match.alias('base')
        filler = df_vector_filler.alias('filler')
        join_on = with_match.match == col('filler.' + self.word_col_name)
        joined = base.join(filler, join_on, how='left')

        return joined.select(
            self.sentence_col_id,
            col('base.word').alias('word'),
            col('filler.word_vector').alias('word_vector'))
示例#3
0
def split(df: DataFrame, start: int or None, end: int or None) -> DataFrame:
    """Restrict ``df`` to rows whose ``day_id`` lies within ``[start, end]``.

    Both bounds are inclusive and either may be ``None`` to leave that
    side open. A bound of ``0`` is a real bound: only ``None`` means
    "no bound" (the checks are ``is None``, not truthiness).

    Args:
        df: DataFrame with a ``day_id`` column.
        start: Smallest ``day_id`` to keep, or ``None`` for no lower bound.
        end: Largest ``day_id`` to keep, or ``None`` for no upper bound.

    Returns:
        ``df`` itself when both bounds are ``None``; otherwise
        ``df.where(...)`` with the applicable bound(s).
    """
    # NOTE: the `int or None` annotations evaluate to plain `int` at
    # runtime; they are kept as-is so the signature is unchanged.
    day_id_col = 'day_id'

    if start is None and end is None:
        return df

    # Build only the comparisons that are needed, so a missing bound is
    # never compared against the column.
    day_id = f.col(day_id_col)
    if end is None:
        cond = day_id >= start
    elif start is None:
        cond = day_id <= end
    else:
        cond = (day_id >= start) & (day_id <= end)

    return df.where(cond)
示例#4
0
    def getOnlyForm1MktEqOpt(inputDataFrame: DataFrame) -> DataFrame:
        """Return the rows of ``inputDataFrame`` whose ``_c69`` is ``'FORM-1'``.

        Args:
            inputDataFrame: DataFrame with a ``_c69`` column.

        Returns:
            The result of filtering ``inputDataFrame`` on
            ``_c69 == 'FORM-1'``.
        """
        is_form1 = pf.col('_c69') == 'FORM-1'
        return inputDataFrame.where(is_form1)