Example No. 1
def toProcessTransData():
    df1=rawdata[0].withColumn('yymmref',F.col("year")*100+F.lit(7)) \
     .withColumn("ed_model_id",F.col("ed_model_id").cast(T.StringType())) \
     .withColumn("yymminitial",F.when(F.col("yymmref")<F.col("yymminitial"),F.col("yymmref")).otherwise(F.col("yymminitial"))) \
     .withColumn('iyear',F.col('yymminitial').substr(1,4).cast("integer")) \
     .withColumn('imonth',F.col('yymminitial').substr(5,6).cast("integer")) \
     .withColumn("idate",F.to_date(F.concat_ws("-","iyear","imonth",F.lit(1)))) \
     .withColumn("adate",F.to_date(F.concat_ws("-","saleyear","salemonth",F.lit(1)))) \
     .withColumn("fdate",F.lit(fdate)) \
     .withColumn("age",F.months_between(F.col("adate"),F.col('idate'))+1) \
     .withColumn("age",F.when(F.col('age')<1,1).otherwise(F.col('age')))\
     .withColumn("agef",F.months_between(F.col("fdate"),F.col('idate'))+1) \
     .withColumn("agef",F.when(F.col('agef')<1,1).otherwise(F.col('agef')))\
     .withColumn("pdate",F.expr("add_months(fdate,0)")) \
     .withColumn("tdate",F.expr("add_months(fdate,-3)")) \
     .withColumn("syrmmt", F.year('tdate')*100+F.month('tdate')) \
     .withColumn("psaleyear", F.year('pdate')) \
     .withColumn("psalemonth", F.month('pdate'))\
     .cache()
    extraagelist = df1.filter("age<0  and trans_count>=3").groupBy(
        'year', 'make', 'model').agg(F.min(F.col("syrmm")).alias('age1'))
    df1 = df1.filter("age>0")
    #extraagelist.write.format("delta").option("overwriteSchema", "true").mode("overwrite").saveAsTable("stats.nn_shortterm_extraagelist")
    print(extraagelist.count())
    return df1
Example No. 2
def getDataPlacePoor(path):
    path01 = 'hdfs://localhost:9000/csv/Join_Canton'
    data01 = spark.read.format('csv').option('header', 'true').load(path01) \
        .where('Level= 04').drop('UpperCode', 'AllName', 'DT') \
        .withColumn('CantonCode', F.split('CantonCode', '\d{4}$')) \
        .withColumn('CantonCode', F.concat_ws("", "CantonCode"))

    data01 = data01.withColumn("CantonName", F.when(data01.CantonName == '建档立卡人员', '固阳县建档立卡人员') \
                               .otherwise(data01.CantonName))

    data = spark.read.format('parquet').load(path) \
        .select('PersonalType', 'AllName', 'DT', 'HosRegisterCode', 'CantonCode') \
        .dropDuplicates(subset=['HosRegisterCode']) \
        .where('PersonalType = 17') \
        .withColumn('CantonCode', F.split('CantonCode', '\d{4}$')) \
        .withColumn('CantonCode', F.concat_ws("", "CantonCode"))

    data = data.join(data01, on='CantonCode', how='left_outer') \
        .dropDuplicates(subset=['HosRegisterCode']) \
        .drop('PersonalType', 'Level', 'CantonCode', 'AllName', 'ZoneCode', 'HosRegisterCode') \
        .withColumn('Times', F.lit(1)) \
        .groupby('CantonName') \
        .pivot('DT', ['2017', '2018', '2019']) \
        .agg(F.sum('Times')) \
        .fillna(0)
    data = data.orderBy(data['2019'].desc())
    data.show(50)
    data.groupby().sum().show()  # total is 21335
Example No. 3
def read_arxiv(spark, processed_path):
    """Creates a dataframe with the columns:
    `id`: global id
    `source`: arxiv
    `source_id`: arxiv id
    `type`: publication
    `title`
    `venue`: concatenation of subjects
    `abstract`
    `scientists`: authors
    `organizations`: null
    `date`: publication date
    `content`: concatenation of abstract, affiliation, author, and journal
    """
    arxiv_path = os.path.join(processed_path, 'arxiv.parquet')
    arxiv_df = spark.read.parquet(arxiv_path)
    return arxiv_df.select(
        fn.concat(fn.lit('arxiv_'), fn.col('id')).alias('id'),
        fn.lit('arxiv').alias('source'),
        fn.col('id').astype('string').alias('source_id'),
        fn.lit('publication').alias('type'), 'title',
        fn.concat_ws('; ', 'subjects').alias('venue'), 'abstract',
        fn.concat_ws(';', 'authors').alias('scientists'),
        fn.lit(None).astype('string').alias('organizations'),
        fn.col('datastamp').alias('date'),
        fn.concat_ws(' ', fn.col('abstract'), fn.col('title'),
                     fn.concat_ws(' ', 'authors'),
                     fn.concat_ws(' ', 'subjects')).alias('content'),
        fn.lit(None).astype('string').alias('end_date'),
        fn.lit(None).astype('string').alias('city'),
        fn.lit(None).astype('string').alias('country'),
        fn.lit(None).astype('string').alias('other_id'))
Example No. 4
    def extract_data(self):
        """Method to extract data from the csv file."""

        works_data = self.data_path + '*'

        works_data_df = self.spark.read.load(works_data,
                                             format="csv",
                                             header="true")
        unicode_conversion = udf(lambda value: unicodedata.normalize(
            'NFKD', value).encode('ascii', 'ignore').decode())

        works_data_df = works_data_df.withColumn(
            'converted_title', unicode_conversion(col('title')))

        works_data_df = works_data_df.withColumn(
            'converted_contributors', unicode_conversion(col('contributors')))

        reconciled_data = works_data_df \
            .groupBy('iswc') \
            .agg(concat_ws(', ', collect_set('converted_title')).alias('title'),
                 concat_ws('|', collect_set('converted_contributors')).alias('contributors'),
                 concat_ws(', ', collect_set('source')).alias('sources')) \
            .dropDuplicates() \
            .na.drop()

        return reconciled_data
Example No. 5
def create_values(cols):
    values = []
    for col in cols:
        if col.is_lookup == 1:
            key = col.demographic_key
            values.append(
                f.when(f.col(key).isNull(),
                       f.concat_ws('_', f.lit(key), f.lit('9999')))
                .when(f.trim(f.col(key)) == '',
                      f.concat_ws('_', f.lit(key), f.lit('9999')))
                .when(f.length(f.regexp_extract(f.col(key).astype('string'), r'(\d+)', 1)) > 0,
                      f.concat_ws('_', f.lit(key), f.col(key).astype('int').astype('string')))
                .otherwise(f.concat_ws('_', f.lit(key), f.col(key))))
        else:
            values.append(f.col(col.demographic_key))
    return values
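create_values assumes each entry in cols exposes a demographic_key name and an is_lookup flag; that descriptor type is not part of this snippet. A minimal, hypothetical stand-in for illustration (the field names are the only assumption):

from collections import namedtuple

# Hypothetical column descriptor exposing the attributes create_values reads.
DemographicCol = namedtuple('DemographicCol', ['demographic_key', 'is_lookup'])

# Illustrative call with placeholder column names.
values = create_values([DemographicCol('age_band', 1), DemographicCol('gender', 0)])
# df.select(*values) would then project the normalized demographic columns.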
Example No. 6
def createPipeline(readStream):
    split_col = split(readStream['value'], r" \[")
    message = split_col.getItem(1)
    systemDetails = split(split_col.getItem(0), ' ')

    currentYear = datetime.now().year
    month = systemDetails.getItem(1)
    date = systemDetails.getItem(2)
    time = systemDetails.getItem(3)
    source = systemDetails.getItem(4)

    fsm = split(split(systemDetails.getItem(5), '%').getItem(1), '-')
    facility = fsm.getItem(0)
    severity = fsm.getItem(1)
    mnemonic = fsm.getItem(2)

    udf = UserDefinedFunction(lambda x: MONTHS.get(x), StringType())

    return readStream.withColumn('timestamp', concat_ws(' ', concat_ws('-', lit(currentYear), udf(month), date), time).astype(TimestampType())) \
        .withColumn('source', source) \
        .withColumn('facility', facility) \
        .withColumn('severity', severity) \
        .withColumn('mnemonic', mnemonic) \
        .withColumn('message', concat(lit('['), message)) \
        .selectExpr("to_json(struct(timestamp, source, facility, severity, mnemonic, message)) AS value")
Example No. 7
def partition(spark, partition_config):
    unique_table = partition_config['unique_to']
    spark.sql("DROP TABLE IF EXISTS {}".format(unique_table))

    source = partition_config['source_tag']
    target = partition_config['target_tag']

    full = {}
    full[source] = spark.read.table(partition_config['full_source']) \
        .select('sub') \
        .dropDuplicates() \
        .withColumn('key', F.lit(''))
    full[target] = spark.read.table(partition_config['full_target']) \
        .select('sub') \
        .dropDuplicates() \
        .withColumn('key', F.lit(''))

    for p_conf in partition_config['partition_by']:
        no_attr = {}
        for src in [source, target]:
            p_by = spark.read.table(p_conf[src]) \
                .select('sub', F.lower(F.col('obj')).alias('new_key'))
            profile = full[src].join(p_by, 'sub', 'left')
            profile.cache()
            no_attr[src] = profile.filter(F.col('new_key').isNull()) \
                .select('sub', 'key')
            f = profile.filter(F.col('new_key').isNotNull()) \
                .select('sub', 'key', F.concat_ws('$', 'key', 'new_key').alias('new_key'))
            f.cache()
            full[src] = f

        unique = find_unique(full[source], full[target], 'sub', 'new_key', source, target) \
            .select('sub', 'db', F.col('new_key').alias('key'))
        unique.write.saveAsTable(unique_table, mode='append')

        # deal with ns
        t_k = full[target].select('key', 'new_key') \
            .dropDuplicates()
        n_s = no_attr[source].withColumn('new_key', F.lit('NULL')) \
            .select('sub', 'key', F.concat_ws('$', 'key', 'new_key').alias('new_key'))
        n_s_t = no_attr[source].join(t_k, 'key')
        # deal with nt
        s_k = full[source].select('key', 'new_key') \
            .dropDuplicates()
        n_t = no_attr[target].withColumn('new_key', F.lit('NULL')) \
            .select('sub', 'key', F.concat_ws('$', 'key', 'new_key').alias('new_key'))
        n_t_s = no_attr[target].join(s_k, 'key')

        full[source] = full[source].unionByName(n_s) \
            .unionByName(n_s_t) \
            .select('sub', F.col('new_key').alias('key'))
        full[target] = full[target].unionByName(n_t) \
            .unionByName(n_t_s) \
            .select('sub', F.col('new_key').alias('key'))

    full_s = full[source].withColumn('db', F.lit(source))
    full_t = full[target].withColumn('db', F.lit(target))
    result = full_s.unionByName(full_t)
    result.write.saveAsTable(partition_config['partition_to'], mode='overwrite')
    spark.catalog.clearCache()
Example No. 8
    def group_batched_logs(logs):
        # group logs from uckey + interval_time + keyword.
        # group 1: group by uckey + interval_starting_time + keyword
        df = logs.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
            first('keyword').alias('keyword'),
            fn.sum(col('is_click')).alias('kw_clicks_count'),
            fn.count(fn.when(col('is_click') == 0, 1).otherwise(
                0)).alias('kw_shows_count')
        )
        df = df.withColumn('kwi_clicks_count', concat_ws(
            ":", col('keyword_index'), col('kw_clicks_count')))
        df = df.withColumn('kwi_shows_count', concat_ws(
            ":", col('keyword_index'), col('kw_shows_count')))
        df = df.withColumn('kw_clicks_count', concat_ws(
            ":", col('keyword'), col('kw_clicks_count')))
        df = df.withColumn('kw_shows_count', concat_ws(
            ":", col('keyword'), col('kw_shows_count')))

        # group 2: group by uckey + interval_starting_time
        df = df.groupBy('uckey', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('kwi'),
            concat_ws(",", collect_list('kwi_clicks_count')
                      ).alias('kwi_click_counts'),
            concat_ws(",", collect_list('kwi_shows_count')
                      ).alias('kwi_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(",", collect_list('kw_clicks_count')
                      ).alias('kw_click_counts'),
            concat_ws(",", collect_list('kw_shows_count')
                      ).alias('kw_show_counts')
        )
        return df
Example No. 9
def shortest_path(v_from, v_to, df_name, output, max_path_length=10):

    schema = StructType(fields=[
        StructField("user_id", StringType()),
        StructField("follower_id", StringType())
    ])

    df = spark.read.schema(schema).format("csv").option("sep",
                                                        "\t").load(df_name)

    df_sel = df.where(df.follower_id == v_from)
    df_paths = df_sel.select(
        f.concat_ws(",", "follower_id", "user_id").alias("path"),
        df_sel.user_id.alias("next"))
    for i in range(max_path_length):
        if df_paths.where(df_paths.next == v_to).count() == 0:
            df_ext = df_paths.join(df.select(df.follower_id.alias("next"),
                                             df.user_id),
                                   on="next",
                                   how="inner")
            df_paths = df_ext.select(
                f.concat_ws(",", "path", "user_id").alias("path"),
                df_ext.user_id.alias("next"))
        else:
            df_paths.select("path").where(
                df_paths.next == v_to).write.mode("overwrite").text(output)
            break
    spark.stop()
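A hypothetical invocation, assuming a tab-separated follower edge list at the placeholder path shown; the output is a text file of comma-separated vertex paths from v_from to v_to:

# Paths and vertex ids below are placeholders for illustration only.
shortest_path('12', '34', '/data/twitter/followers.tsv', '/data/output/paths',
              max_path_length=10)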
Example No. 10
def _cross_features(need_cross_features: list, train_data: DataFrame,
                    test_data: DataFrame):
    cross_features = list()
    for item in need_cross_features:
        if not isinstance(item, dict) \
                or "feature_list" not in item.keys() \
                or "hash_bucket_size" not in item.keys():
            print("need_cross_features must be a dict "
                  "with key 'feature_list' and 'hash_bucket_size' !")
            continue

        # continuous features are crossed using their bucketized values
        concat_features, hash_num = item["feature_list"], item[
            "hash_bucket_size"]
        new_feature_name = config.HASH_FEATURE_PREFIX + "_".join(
            concat_features)

        train_data = \
            train_data.withColumn(
                new_feature_name,
                _hash_value_udf(hash_num)(F.concat_ws("_", *concat_features)).cast("int"))

        test_data = \
            test_data.withColumn(
                new_feature_name,
                _hash_value_udf(hash_num)(F.concat_ws("_", *concat_features)).cast("int"))

        cross_features.append(new_feature_name)
        print("generate new crossed features {0}. ".format(new_feature_name))
    return train_data, test_data, cross_features
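_hash_value_udf is not defined in this snippet; a minimal sketch of what it might look like, assuming the goal is simply to map the concatenated (already bucketized) feature string deterministically into [0, hash_bucket_size):

import hashlib

from pyspark.sql import functions as F
from pyspark.sql import types as T


def _hash_value_udf(hash_bucket_size: int):
    # Returns a UDF that hashes the crossed-feature string into a stable bucket id.
    def _hash(value):
        if value is None:
            return None
        digest = hashlib.md5(value.encode('utf-8')).hexdigest()
        return int(digest, 16) % hash_bucket_size
    return F.udf(_hash, T.IntegerType())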
Example No. 11
def test_concat_ws_nulls_arrays():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: binary_op_df(spark, gen).select(
                f.concat_ws("*", f.lit('z'), f.array(f.lit('2'), f.lit(None), f.lit('Z'))),
                f.concat_ws("*", f.array(f.lit(None), f.lit(None))),
                f.concat_ws("*", f.array(f.lit(None), f.lit(None)), f.col('b'), f.lit('a'))))
Example No. 12
def columnsMergeCore(df, requestDict):
    columnNames = requestDict['columnNames']
    # The default separator is ","; if requestStr specifies a connector, the user's choice wins
    try:
        splitSymbol = requestDict['connector']
    except KeyError:
        splitSymbol = ','
    # The default new column name is "合并结果(col1, col2, col3, ...)"; if the user specifies one, it wins
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "合并结果" + "(" + str(columnNames).strip("[]") + ")"

    # Merge (Spark DataFrame operations are clumsy; fall back to the brute-force approach for now >_< )
    if len(columnNames) == 2:
        df = df.withColumn(
            newColumnName,
            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]]))
    elif len(columnNames) == 3:
        df = df.withColumn(
            newColumnName,
            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]],
                      df[columnNames[2]]))
    elif len(columnNames) == 4:
        df = df.withColumn(
            newColumnName,
            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]],
                      df[columnNames[2]], df[columnNames[3]]))
    return df
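The if/elif ladder above only covers two to four columns; since concat_ws accepts any number of columns, the merge can be generalized by unpacking the list (a sketch, assuming every name in columnNames exists in df):

# Works for any number of columns, not just 2-4.
df = df.withColumn(newColumnName,
                   concat_ws(splitSymbol, *[df[c] for c in columnNames]))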
Example No. 13
def token_score(df, on, value):
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast('string'))
    # TODO: implement the pattern
    pattern = ','
    df = df.withColumn('tokens1', F.split(F.col(on), pattern))
    df = df.withColumn('tokens2', F.split(F.col('query'), pattern))
    # intersection = tokens1.intersection(tokens2)
    # diff1to2 = tokens1.difference(tokens2) = pure token 1
    # diff2to1 = tokens2.difference(tokens1) = pure token 2
    # array_intersect / array_except (Spark 2.4+) cover the intersect and diff steps
    df = df.withColumn('intersection', F.array_intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.array_except('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.array_except('tokens2', 'tokens1'))
    # sorted_sect = " ".join(sorted(intersection))
    # sorted_1to2 = " ".join(sorted(diff1to2))
    # sorted_2to1 = " ".join(sorted(diff2to1))
    # TODO: implement a concat for an array
    df = df.withColumn('sorted_sect',
                       F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2',
                       F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1',
                       F.concat_ws(' ', F.sort_array('diff2to1')))
    # combined_1to2 = sorted_sect + " " + sorted_1to2 = chain 1 that has been sorted
    # combined_2to1 = sorted_sect + " " + sorted_2to1 = chain 2 that has been sorted
    df = df.withColumn('combined_1to2',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_1to2'))
    df = df.withColumn('combined_2to1',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_2to1'))
    # strip
    # sorted_sect = sorted_sect.strip()
    # combined_1to2 = combined_1to2.strip()
    # combined_2to1 = combined_2to1.strip()
    for c in ['sorted_sect', 'combined_1to2', 'combined_2to1']:
        df = df.withColumn(c, F.trim(c))
    # TODO: create a function spark_ratio
    df = df.withColumn(
        'ratio1', spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')))
    df = df.withColumn(
        'ratio2', spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')))
    df = df.withColumn(
        'ratio3', spark_ratio(F.col('combined_2to1'), F.col('combined_1to2')))
    # pairwise = [
    #     ratio_func(sorted_sect, combined_1to2),
    #     ratio_func(sorted_sect, combined_2to1),
    #     ratio_func(combined_1to2, combined_2to1)
    # ]
    df = df.withColumn('max_ratio', F.greatest('ratio1', 'ratio2', 'ratio3'))
    df = df.withColumnRenamed('max_ratio', 'token_fuzzy')
    df = df.select(['token_fuzzy'])
    return df
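spark_ratio is only referenced via a TODO above; one way it could be written (a sketch, not the author's implementation) is a UDF around difflib.SequenceMatcher, which yields a similarity ratio in [0, 1] for two strings:

from difflib import SequenceMatcher

from pyspark.sql import functions as F
from pyspark.sql import types as T

# Hypothetical helper: treats missing values as empty strings.
spark_ratio = F.udf(
    lambda a, b: float(SequenceMatcher(None, a or '', b or '').ratio()),
    T.FloatType())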
Example No. 14
def pivot(df):
    """convert to wide format"""
    df = (
        df
        # deterministic ordering for questions
        .withColumn('order_by', F.concat_ws('_', 'page_idx', 'question_idx'))
        .withColumn(
            'order_by',
            F.when(
                F.col('family') != 'single_choice',
                F.concat_ws('_', 'order_by', F.coalesce('choice_id', 'row_id', 'other_id'))
            ).otherwise(F.col('order_by'))
        )
        # enumerator for questions with same column name
        .withColumn('rank', F.dense_rank().over(Window.partitionBy('column').orderBy('question_id')))
        # construct orderable column names
        .withColumn('column', F.concat_ws('_', F.lit('_'), 'order_by', 'column', 'rank'))
        .groupBy(RESPONSE_KEY)
        .pivot('column')
        .agg(F.first('value'))
    )

    # set column order
    question_cols = set(df.columns) - set(RESPONSE_KEY)
    columns = RESPONSE_KEY + sorted(question_cols)
    df = df.select(*columns)

    # find single_choice questions with "Other" option
    questions_w_other = []
    base = columns[0]
    for col in columns:
        b = re.sub(r'_\d+$', '', base)  # don't consider enumerator
        # if column looks like `this_is_the_base_other`
        if b in col and 'other' in col:
            questions_w_other.append((base, col))
        base = col

    # inject "Other" for single choice questions
    for base, other in questions_w_other:
        df = df.withColumn(
            base,
            F.when(
                F.col(other).isNotNull(),
                F.coalesce(F.col(base),
                           F.lit('Other (please specify)'))).otherwise(
                               F.col(base)))

    # drop __question_id prefixes and _1 suffixes
    names = df.columns
    names = map(lambda s: re.sub(r'^__[\d+_]+', '', s), names)
    names = map(lambda s: re.sub(r'_1$', '', s), names)
    df = df.toDF(*names)
    return df
Example No. 15
def getNewAge(data):
    # 150221 1940 0212472x
    # data = spark.read.format('parquet').load(path)

    data = data.withColumn("Born", F.split('CertificateCode', '\d{7}.$')) \
        .withColumn('Born', F.concat_ws("", "Born")) \
        .withColumn('Born', F.split('Born', '\d{6}')) \
        .withColumn('Born', F.concat_ws("", "Born"))

    data = data.withColumn("Age", F.when((data.DT - data.Born) != data.Age, data.DT - data.Born).otherwise(data.Age)) \
        .drop('Born')
    data.show(30)
    dealNull(data)
Example No. 16
def getNewAge(path, path1):
    # 150221 1940 0212472x
    data = spark.read.format('csv').option('header', 'true').load(path)

    data = data.withColumn("Born", F.split('CertificateCode', '\d{7}.$')) \
        .withColumn('Born', F.concat_ws("", "Born")) \
        .withColumn('Born', F.split('Born', '\d{6}')) \
        .withColumn('Born', F.concat_ws("", "Born"))

    data = data.withColumn("Age", F.when((data.DT - data.Born) != data.Age, data.DT - data.Born).otherwise(data.Age)) \
        .drop('Born')
    data.show()
    data.write.format('parquet').mode("overwrite").save(path1)
Example No. 17
def df_structurize(input_df, struct):
    #metaColumns = struct.fieldNames()

    # new dataframe of the regex columns
    regexDFColumns = [c for c in input_df.columns if c[0].isdigit()]
    regexDFColumns.append("revid")
    regexDFColumns.append("date_time")
    regexDFColumns.append("articleid")
    regexDFColumns.append("namespace")
    regexDFColumns.append("anon")
    regexDFColumns.append("deleted")
    regexDFColumns.append("revert")
    regexDFColumns.append("reverteds")
    regex_df = input_df.na.replace('None', None).select(*regexDFColumns)
    #regex_df.show(n=5, vertical=True)

    # combine the regex columns into one column, if not None/null
    # this has: revid, article_id, date/time, regexes, core_regexes, regex_bool, core_bool
    onlyRegexCols = [c for c in regex_df.columns if c[0].isdigit()]
    coreDFColumn = findCoreColumns(onlyRegexCols)

    replaced_df = multi_replace_wps(onlyRegexCols)(regex_df)

    #test_df.select(regex_df.revid, regex_df.date_time, f.year(regex_df.date_time).alias("year"), f.month(regex_df.date_time).alias('month'),f.concat_ws(', ',*onlyRegexCols).alias('regexes'), f.concat_ws(', ',*coreDFColumn).alias('core_regexes')).show(n=50, truncate=200)

    #print("If we didn't do the replace stuff:")
    #regex_df.select(regex_df.revid, regex_df.date_time, f.year(regex_df.date_time).alias("year"), f.month(regex_df.date_time).alias('month'),f.concat_ws(', ',*onlyRegexCols).alias('regexes'), f.concat_ws(', ',*coreDFColumn).alias('core_regexes')).show(n=50, truncate =200)

    regex_one_df = replaced_df.select(
        regex_df.articleid, regex_df.namespace, regex_df.anon,
        regex_df.deleted, regex_df.revert, regex_df.reverteds, regex_df.revid,
        regex_df.date_time,
        f.year(regex_df.date_time).alias("year"),
        f.month(regex_df.date_time).alias('month'),
        f.concat_ws(', ', *onlyRegexCols).alias('regexes'),
        f.concat_ws(', ', *coreDFColumn).alias('core_regexes'))

    # if you don't want to use the replaced version, use this:
    # regex_one_df = regex_df.select(regex_df.articleid, regex_df.namespace, regex_df.anon, regex_df.deleted, regex_df.revert, regex_df.reverteds, regex_df.revid, regex_df.date_time, f.year(regex_df.date_time).alias("year"), f.month(regex_df.date_time).alias('month'),f.concat_ws(', ',*onlyRegexCols).alias('regexes'), f.concat_ws(', ',*coreDFColumn).alias('core_regexes'))

    # make again sure the empty ones are None/null
    regex_one_df = regex_one_df.na.replace('', None)

    ## regex_bool and core_bool help us keep track of which revisions end in text that have PI
    # regex_one_df = regex_one_df.select(*regex_one_df, f.when(regex_one_df.regexes.isNotNull(),1).otherwise(0).alias('regex_bool'), f.when(regex_one_df.core_regexes.isNotNull(),1).otherwise(0).alias('core_bool'))

    #regex_one_df.show(n=5, vertical=True)

    return regex_one_df
Example No. 18
def _setup_dataframe(spark,
                     sqlContext,
                     dataset_multiplier_factor,
                     append_ids=True) -> pyspark.sql.DataFrame:
    """Setup a pyspark dataframe to run against.

    Fetches phrase pairs, creates a PySpark dataframe from them, and cross-joins it with a
    range table of length :dataset_multiplier_factor: to increase the volume of data for benchmarking.

    Returns:
        A Pyspark dataframe with random phrases for string distance testing.
    """
    df = _fetch_phrase_pairs()

    logger.info(f'{len(df):,} word pairs')

    pyspark_df = spark.createDataFrame(df, ['left', 'right'])

    pyspark_df = pyspark_df.repartition(10)
    pyspark_df.cache().count()

    logger.debug('Increasing data volume')

    range_df = sqlContext.range(dataset_multiplier_factor)

    if append_ids:

        range_df = range_df.withColumn('id_string',
                                       ps_funcs.lpad('id', 12, "0"))

        pyspark_df = range_df.crossJoin(pyspark_df).select(
            ps_funcs.concat_ws(' ', ps_funcs.col('left'),
                               ps_funcs.col('id_string')).alias('left'),
            ps_funcs.concat_ws(' ', ps_funcs.col('right'),
                               ps_funcs.col('id_string')).alias('right'))
    else:
        pyspark_df = range_df.crossJoin(pyspark_df).select(
            ps_funcs.col('left'), ps_funcs.col('right'))

    pyspark_df = pyspark_df.repartition(__DATASET_PARTITIONS)
    record_count = pyspark_df.cache().count()

    logger.info(f'Generated dataframe with {record_count:,} records')

    sample_data = pyspark_df.sample(withReplacement=False,
                                    fraction=0.01).limit(1).collect()
    logger.info(f'Sample of benchmarking data: {sample_data}')

    return pyspark_df
Example No. 19
def test_concat_ws_arrays():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    (s1, s2) = gen_scalars(gen, 2, force_no_nulls=True)
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: binary_op_df(spark, gen).select(
                f.concat_ws("*", f.array(f.lit('2'), f.lit(''), f.lit('3'), f.lit('Z'))),
                f.concat_ws("*", s1, s2),
                f.concat_ws("-", f.array()),
                f.concat_ws("-", f.array(), f.lit('u')),
                f.concat_ws(None, f.lit('z'), s1, f.lit('b'), s2, f.array()),
                f.concat_ws("+", f.lit('z'), s1, f.lit('b'), s2, f.array()),
                f.concat_ws("*", f.col('b'), f.lit('z')),
                f.concat_ws("*", f.lit('z'), s1, f.lit('b'), s2, f.array(), f.col('b')),
                f.concat_ws("-", f.array(f.lit(None))),
                f.concat_ws("-", f.array(f.lit('')))))
Example No. 20
    def transfLineitems(ds):
        invoiceDS = ds

        invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
            .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

        if invoiceDS.schema["TypeOfService"].dataType == ArrayType(
                StringType()):
            invoiceDS = invoiceDS.withColumn("TypeOfService", concat_ws(",", col("TypeOfService"))) \
                .withColumn("ServiceAmount", concat_ws(",", col("ServiceAmount")))

        invoiceDS = invoiceDS.withColumn(
            "TypeOfService", regexp_replace(col("TypeOfService"), "\n", " "))

        return invoiceDS
Example No. 21
def get_warning():
    try:
        print("Deleting expired warning data")
        levels = ["YJFL004", "YJFL012", "YJFL003", "YJFL001"]
        for level in levels:
            print(f"Warning level: {level}")
            delete_all(hbase["table"], row_prefix=level)

        print(f"{str(dt.now())} generating warnings")
        result = get_warning_result(white_list, city='岳阳市', com_id='011114306', day='20190601',
                                    cluster_dir=cluster_path + "/")

        # the values inside highprice_30days_order must be floats
        result["highprice_30days_order"] = result["highprice_30days_order"].apply(
            lambda x: json.dumps(x, ensure_ascii=False))


        df = spark.createDataFrame(result)\
                  .withColumn("classify_id",f.concat_ws("_",col("classify_level1_code"),col("cust_id")))

        cols=df.columns
        cols.remove("classify_id")
        df.foreachPartition(lambda x:write_hbase1(x,cols,hbase))

    except Exception:
        tb.print_exc()
Example No. 22
    def preprocessDF(self, df, cols):
        """
            Input: $df represents a DataFrame
                   $cols represents the list of columns (in $df) that will be concatenated and be tokenized

            Output: Return a new DataFrame that adds the "joinKey" column into the input $df

            Comments: The "joinKey" column is a list of tokens, which is generated as follows:
                     (1) concatenate the $cols in $df;
                     (2) apply the tokenizer to the concatenated string
            Here is how the tokenizer should work:
                     (1) Use "re.split(r'\W+', string)" to split a string into a set of tokens
                     (2) Convert each token to its lower-case
                     (3) Remove stop words
        """
        stop_words = self.stopWordsBC

        def tokenized_filterized_string(string):
            string = re.sub('\s+', ' ', string).strip().lower(
            )  # Remove extra whitespace and finally remove trailing spaces
            tokens = re.split(r'\W+', string)
            stop_words.add('')
            tokens = set(tokens) - stop_words
            return list(tokens)

        get_tokenized_string = functions.udf(
            tokenized_filterized_string, types.ArrayType(types.StringType()))
        concatanated_column = 'joinKey'
        df = df.withColumn(concatanated_column,
                           concat_ws(' ', df[cols[0]], df[cols[1]]))
        df = df.withColumn(concatanated_column,
                           get_tokenized_string(df[concatanated_column]))
        return df
Example No. 23
    def preprocessDF(self, df, cols):

        new_df=df.withColumn("joinkey",concat_ws('-', *cols))


        def transform(raw):
            words=[]
            s = re.split(r'\W+', raw)

            final_list=[]
            for i in s:
                fin_s = i.lower()
                if len(fin_s) > 0:
                    final_list.append(fin_s)


            for i in final_list:
                if i not in stop_word:
                    words.append(i)

            return words
        stop_word=self.stopWordsBC
        slen=udf(transform, ArrayType(StringType()))
        df1=new_df.withColumn("joinkey", slen(new_df.joinkey))

        return df1
Example No. 24
    def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
        """
        Function to load a supporting table to passengers from GCS and save in BigQuery.
        :param csv_filepath: str input filename
        :param uid_name: str name to give the UID column
        :param uid_col_list: list of str column names to combine into UID
        :param csv_bq: str output project.dataset.table where the data will be saved
        :param passenger_bq: str, optional. BigQuery table to load passengers from; if omitted, the already-loaded passengers_df is used
        """
        csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
        logger.info(f"Loading address info from {csv_path}")
        csv_df = self.sparkql.read.csv(csv_path, header=True)

        csv_df = csv_df.withColumn(uid_name,
                                   sha2(concat_ws("", *uid_col_list), 256))
        if passenger_bq:
            passengers_df = self.sparkql.read.format('bigquery') \
                                 .option('table', passenger_bq) \
                                 .load() \
                                 .withColumnRenamed('uid', 'passenger_uid')
        else:
            passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid')

        csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'),
                                 on='email',
                                 how='left')
        logger.info(f"writing card data to {csv_bq}")
        csv_df.write.format('bigquery') \
          .option('table', csv_bq) \
          .save()
Example No. 25
 def verification(self, candDF, threshold):
     jaccard_udf = functions.udf(lambda r: jaccard_similarity(r))
     jaccard_df = candDF.withColumn(
         "jaccard",
         jaccard_udf(
             functions.concat_ws(',', candDF.joinKey1, candDF.joinKey2)))
     return jaccard_df.where(jaccard_df.jaccard >= threshold)
Example No. 26
    def csv2data(self, donotvectorize):
        lists = self.list_fileNames()
        #print lists
        csvFiles = []
        s3 = boto3.client('s3')
        for l in lists:
            csvFiles.append(l.encode('utf-8').split('/')[1])

        for c in csvFiles:
            print c
            s3.download_file(INPUT_DATA_BUCKET, INPUT_DATA_FOLDER + c,
                             "knn" + INPUT_DATA_TYPE)
        #datas = CreateDF.sc.wholeTextFiles("/home/ab/pyspark/knn"+INPUT_DATA_TYPE)
        #sessions = datas
        #print sessions.first()
        df = self.ReturnDataframe("/home/ab/pyspark/knn" + INPUT_DATA_TYPE)
        columns = df.columns
        columnlength = len(columns)
        print columnlength
        self.vectorize = list(
            set(self.dataframe.columns) - set(donotvectorize))
        self.dataframe = self.dataframe.withColumn(
            "Features_Joined", concat_ws('_', *self.vectorize))
        self.vectors = [i + "_index" for i in self.vectorize]
        df = self.StringtoVector(self.vectorize)
        y = self.dataframe.select(['Features_Joined'
                                   ]).rdd.map(lambda x: x[0]).collect()
        self.TransformDataframe(self.vectors)
Example No. 27
def streaming_sent(dfX):
    # apply sentiment analysis to text stream
    df = pipeline.transform(dfX)

    # select sentiment column from pipeline output
    df = df.select('sentiment.result',"sentiment.metadata") \
        .withColumn('result',F.concat_ws(',','result')) \
        .withColumn("result", regexp_replace('result', "positive",'1')) \
        .withColumn("result", regexp_replace('result', "na",'0')) \
        .withColumn("result", regexp_replace('result', "negative",'-1')) \
        .select(F.split('result', ',').alias('sents'), 'metadata')

    # Convert datatypes
    mapper = F.udf(lambda x: [i['confidence'] for i in x],
                   T.ArrayType(T.StringType()))
    df = df.withColumn("metadata", mapper('metadata'))
    df = df.withColumn("metadata", df.metadata.cast("array<float>"))

    # Compute column product
    df_product = df.withColumn(
        "product",
        F.expr(
            "transform(arrays_zip(sents, metadata), x -> x.sents * x.metadata)"
        ))

    # Average array
    array_mean = F.udf(lambda x: float(np.mean(x)), T.FloatType())
    sent_df = df_product.select(array_mean("product").alias("value"))
    return sent_df
Example No. 28
def etl_us_cities_demographics(spark, input_dir, output_dir):
    """Clean the US cities demographics data"""
    # this data set is clean
    # load data
    data_input_full_file_path = f'{input_dir}/us-cities-demographics.csv'
    us_cities_demographics_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1", sep=';') \
        .load(data_input_full_file_path)

    us_cities_demographics_spark_df = us_cities_demographics_spark_df \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_population") \
        .withColumnRenamed("Female Population", "female_population") \
        .withColumnRenamed("Total Population", "total_population") \
        .withColumnRenamed("Number of Veterans", "num_of_veterans") \
        .withColumnRenamed("Foreign-born", "foreign_born") \
        .withColumnRenamed("Average Household Size", "avg_house_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "race") \
        .withColumnRenamed("Count", "count") \
        .withColumn('city_state_code', F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    data_output_full_file_path = f'{output_dir}/us-cities-demographics.parquet'
    us_cities_demographics_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
Example No. 29
def etl_airport_code(spark, input_dir, output_dir):
    """Clean the airport code data"""

    # load data
    airport_code_data_input_full_file_path = f'{input_dir}/airport-codes_csv.csv'
    airport_code_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(airport_code_data_input_full_file_path)

    airport_code_spark_df = airport_code_spark_df \
        .withColumnRenamed('name', 'airport_name') \
        .filter(F.col('iso_country') == 'US')

    # split iso_region column into Latitude and Longitude
    split_iso_region = F.split(airport_code_spark_df['iso_region'], '-')
    airport_code_spark_df = airport_code_spark_df \
        .withColumn('region', split_iso_region.getItem(1)) \
        .withColumn('municipality_region', F.concat_ws(', ', F.upper(F.col('municipality')), F.upper(F.col('region'))))

    new_airport_code_spark_df = airport_code_spark_df \
        .drop('iso_region') \
        .drop('coordinates')

    data_output_full_file_path = f'{output_dir}/airport-codes.parquet'
    new_airport_code_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
Example No. 30
    def nest(input_cols, output_col, shape="string", separator=""):
        """
        Concat multiple columns to one with the format specified
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: char to be used as separator at the concat time
        :param shape: final data type, 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """

        df = self

        if has_(input_cols, F.Column):
            # Transform non Column data to lit
            columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
        else:
            columns = parse_columns(self, input_cols)

        if shape == "vector":
            columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

            vector_assembler = VectorAssembler(
                inputCols=columns,
                outputCol=output_col)
            df = vector_assembler.transform(df)

        elif shape == "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape == "string":
            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
Example No. 31
def load_world_news(spark: SparkSession, input_path: str, bucket: str) -> None:
    """
    Loads the world news to S3. There are 25 news pieces per day, which are concatenated in a single row, using the
    asterisk (*) as a separator.

    :param spark: spark session - must be configured to use S3 (AWS credentials, file system)
    :param input_path: input path from where the files should be read.
    :param bucket: S3 bucket where to write data
    :return: None
    """
    df_world_news = spark.read.option("header",
                                      "true").option("delimiter",
                                                     ",").csv(input_path)

    df_world_news = df_world_news.groupBy('Date').agg(
        functions.concat_ws('*', functions.collect_list(
            functions.col('News'))).alias('news'))

    # Filter out badly formatted Date columns
    df_world_news = df_world_news \
        .filter(functions.to_date(functions.col('Date')).isNotNull()) \
        .orderBy('Date', ascending=False)

    df_world_news = df_world_news.withColumn('year', functions.year('Date'))

    df_world_news \
        .coalesce(1) \
        .write \
        .partitionBy('year') \
        .mode('overwrite') \
        .csv(f"s3a://{bucket}/data/news")
Example No. 32
 def preprocessDF(self, df, cols): 
     tokenize_udf = udf(lambda line: lineTokenizer(line),ArrayType(StringType(), False))
     df_joinkey = df.withColumn("joinKey", tokenize_udf(concat_ws(' ', cols[0], cols[1]).alias('joinKey'))).cache() #can we remove hardcoding of the cols!!!!!
     return df_joinkey
Example No. 33
def prepareDatasets(sc, spark):
    buisHeader = ['business_id', 'name', 'neighborhood', 'address', 'city', 'state', 'postal_code',
        'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories']
    buis = sc.textFile(datapath+'yelp_business.csv', use_unicode=False)
    buis = buis.filter(lambda row: not row.startswith('business_id,name'))\
        .map(lambda row: re.findall(r'(?:[^,"]|"(?:\\.|[^"])*")+', row.replace(',,', ', ,')))\
        .map(lambda row: map(lambda x: x.replace('"', ''), row))\
        .map(lambda row: dict(zip(buisHeader, row)))\
        .filter(lambda row: row['business_id'] and row['longitude'] and row['latitude'])\
        .filter(lambda row: row['business_id'].strip() and row['longitude'].strip() and row['latitude'].strip())\
        .toDF()
    buis = buis.select('business_id', 'name', 'city', 'state', 'postal_code', 'categories',
                        buis['latitude'].cast('float'), buis['longitude'].cast('float'),
                        buis['stars'].cast('float'), buis['review_count'].cast('int'),
                        buis['is_open'].cast('int'))\
        .dropna(how='any', subset=['business_id','longitude', 'latitude'])

    def reviews_mapper(index, lines):
        import csv
        reader = csv.reader(lines)
        if index==0: lines.next()
        for row in reader:
            if len(row) == 9 and len(row[1])==22:
                yield row
    reviewsHeader = ["review_id","user_id","business_id","stars","date","text","useful","funny","cool"]
    reviews = sc.textFile(datapath+'yelp_review.csv', use_unicode=False)\
        .mapPartitionsWithIndex(reviews_mapper)\
        .map(lambda x: dict(zip(reviewsHeader, x)))\
        .toDF()
    reviews = reviews.select(
        "review_id", "user_id", "business_id", "text",
        reviews["stars"].cast('float'), reviews["date"].cast('date'),
        reviews["useful"].cast('int'), reviews["funny"].cast('int'),
        reviews["cool"].cast('int'))\
        .filter(reviews.text.isNotNull())\
        .filter(reviews.business_id.isNotNull())
    reviews = reviews.alias('a').join(buis.alias('b'),
        sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('b.*','a.text') #,'a.user_id')
    reviews = reviews.where(
        'longitude > {:f} and longitude < {:f} and latitude > {:f} and latitude < {:f}'\
        .format(westAMER, eastAMER, southAMER, northAMER)
    ).cache()

    id_text = reviews.select('business_id', 'text')\
        .groupBy('business_id').agg(sf.concat_ws(' ', sf.collect_list("text")).alias('text_concat'))
    reviews = reviews.drop(reviews.text)\
        .select('business_id','categories','state', 'stars')\
        .alias('a').join(id_text.alias('b'),
        sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('a.*','b.text_concat')\
        .distinct()\
        .withColumnRenamed('text_concat', 'text')

    # some data cleansing:
    reviews = reviews.withColumn('text', sf.regexp_replace(reviews.text, '\\/', '/'))
    def cleanse(text):
        re_punc = re.compile('[' + re.escape(punctuation) + '0-9\\n\\t\\r]')
        re_spc = re.compile('[ ]+') # get rid of extra spaces
        return re_spc.sub(' ', re_punc.sub(" ", text))
    cleanser = sf.udf(lambda x: cleanse(x))
    reviews = reviews.withColumn('text', cleanser('text'))
    # tokenizing and removing stop words:
    import pyspark.ml.feature as sparkml
    from pyspark.ml import Pipeline
    tokenizer = sparkml.Tokenizer(inputCol="text", outputCol="words")
    swremover = sparkml.StopWordsRemover(inputCol='words', outputCol='words_clean')
    pipeline = Pipeline(stages=[tokenizer, swremover])
    reviews = pipeline.fit(reviews).transform(reviews)
    reviews = reviews.drop('text', 'words')
    return reviews.cache()
Example No. 34
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <page-out> <book-out>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Proteus Pages")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.load(sys.argv[1])
    cols = set(raw.columns)
    idcols = [col(x) for x in ['identifier', 'issue', 'book'] if x in cols]

    df = raw.withColumn('identifier', regexp_replace(coalesce(*idcols), '[^A-Za-z0-9]+', ''))

    counts = df.groupBy('identifier').count().select(col('identifier'), col('count').alias('imagecount'))

    appendID = udf(lambda book, text: '%s <archiveid tokenizetagcontent="false">%s</archiveid>' % (text, book))

    renamed = df.join(counts, 'identifier')\
                .drop('regions')\
                .withColumn('pageNumber', col('seq'))\
                .withColumn('name', concat_ws('_', col('identifier'), col('seq')))\
                .withColumn('text', regexp_replace(col('text'), '\\n', '<br>\\\n'))

    renamed.withColumn('text', appendID(col('identifier'), col('text')))\
           .write.format('json').save(sys.argv[2])

    renamed.rdd.groupBy(lambda r: r.identifier).map(pageCat).toDF()\
        .write.format('json').save(sys.argv[3])

    sc.stop()
Example No. 35
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, concat, concat_ws, regexp_replace

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: trove-load.py <input json> <output parquet>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Trove Load")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.json(sys.argv[1])
    df = raw.na.drop(subset=['id', 'fulltext']).dropDuplicates(['id'])
    df.select(concat(lit('trove/'), df.id).alias('id'),
              concat_ws('/', lit('trove'), df.titleId, df.date).alias('issue'),
              concat(lit('trove/'), df.titleId).alias('series'),
              df.date, df.firstPageId, df.firstPageSeq.cast('int').alias('seq'),
              df.heading.alias('title'), df.category,
              regexp_replace(regexp_replace(df.fulltext, '&', '&amp;'),
                             '<', '&lt;').alias('text'))\
      .write.save(sys.argv[2])

    sc.stop()
Example No. 36
    # word_tokenize uses PunktSentenceTokenizer first, then
    # treebank_word_tokenizer on those so can get nested
    # lists.
    #return nltk.tokenize.word_tokenize(s)

    # this is just the treebank tokenizer
    return [word for word in t.tokenize(s) if word not in stopwords_set]

udf_tokenize = sql.udf(tokenize, types.ArrayType(types.StringType()))


(idb_df
    .select(sql.concat_ws(" ", idb_df["data.dwc:occurrenceRemarks"],
                          idb_df["data.dwc:eventRemarks"],
                          idb_df["data.dwc:fieldNotes"]
                         )
                        .alias("note"),
                        idb_df["uuid"]
            )
    .where(sql.column("note") != "")
    .withColumn("tokens", udf_tokenize(sql.column("note")))
    .select(sql.column("uuid"),
            sql.explode(sql.column("tokens")).alias("token")
           )
    .groupBy(sql.column("uuid"), sql.column("token"))
    .count()
    .write
    .mode("overwrite")
    .parquet("/guoda/data/idigbio-{}-tf.parquet".format(idb_df_version))
)
Example No. 37
        print("Usage: pretty-cluster.py <metadata> <input> <output> [<query>]", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Prettyprint Clusters")
    sqlContext = SQLContext(sc)

    outpath = sys.argv[3]
    (outputFormat, outputOptions) = guessFormat(outpath, "json")

    ## Should do more field renaming in meta to avoid clashing with fields in raw.
    meta = sqlContext.read.json(sys.argv[1])\
           .dropDuplicates(['series'])
    
    constructURL = udf(lambda url, corpus, id, regions: formatURL(url, corpus, id, regions))

    df = sqlContext.read.load(sys.argv[2]) \
        .withColumnRenamed('title', 'doc_title')\
        .withColumnRenamed('lang', 'doc_lang')\
        .withColumn('url', constructURL(col('page_access'), col('corpus'), col('id'), col('regions')))\
        .drop('locs').drop('pages').drop('regions')\
        .join(meta, 'series', 'left_outer')

    filtered = df.join(df.filter(sys.argv[4]).select('cluster').distinct(), 'cluster') \
               if len(sys.argv) >= 5 else df

    filtered.withColumn('lang', concat_ws(',', col('lang'))) \
            .orderBy(desc('size'), 'cluster', 'date', 'id', 'begin')\
            .write.format(outputFormat).options(**outputOptions).save(outpath)

    sc.stop()
    
Example No. 38
	sqlContext = SQLContext(sc)

	# people DF
	RDD = sc.textFile(datapath+"people.csv",minPartitions = 6,use_unicode = False)
	header = RDD.first()
	RDD = RDD.filter(lambda x: x!=header)
	PeopleDF = RDD.map(lambda x: (x.split(",")[0],x,float(x.split(",")[-1]))).toDF(["PeopleID","PeopleFeats","numeric"]).cache()
	print "number of distinct people: "
	print PeopleDF.count()	
	# train DF
	RDD = sc.textFile(datapath+"act_train.csv",minPartitions = 6,use_unicode = False)
	header = RDD.first()
	RDD = RDD.filter(lambda x: x!=header)
	trainDF = RDD.map(lambda x: (x.split(",")[0],x)).toDF(["PeopleID","trainFeats"])
	trainDF = trainDF.join(PeopleDF,"PeopleID","left_outer")
	trainDF = trainDF.select("PeopleID",concat_ws(',',trainDF.PeopleFeats,trainDF.trainFeats).alias("text"))
	trainRDD = trainDF
	print "number of train samples: "
	print trainDF.count()	

	
	RDD = sc.textFile(datapath+"act_test.csv",minPartitions = 6,use_unicode = False)
	header = RDD.first()
	RDD = RDD.filter(lambda x: x!=header)
	testDF = RDD.map(lambda x: (x.split(",")[0],x)).toDF(["PeopleID","trainFeats"])
	testDF = testDF.join(PeopleDF,"PeopleID","left_outer")
	testDF = testDF.select("PeopleID",concat_ws(',',testDF.PeopleFeats,testDF.trainFeats).alias("text"))
	print "number of test samples: "
	print testDF.count()	

	"""