예제 #1
0
def load_sentence_data_frame(sc, dataPath):
    """Load a sentence CSV and attach a parsed dense-vector column.

    The CSV is read with a header row and schema inference. It must provide
    the columns ``id``, ``sentence`` and ``vector``; ``vector`` is expected to
    hold a bracketed, two-space-separated list of numbers as text
    (e.g. ``"[0.1  0.2  0.3]"``) -- NOTE(review): format inferred from the
    parsing steps below, confirm against the actual data files.

    Args:
        sc: Spark context used to build the ``SQLContext``.
        dataPath: Location of the CSV file, passed straight to ``load()``.

    Returns:
        A DataFrame with the columns ``id``, ``sentence``, ``vector`` (the
        original text) and ``_vector`` (the result of ``Vectors.dense`` on
        the parsed numbers).
    """
    # Build the SQL context once and reuse it for both reading and the
    # final DataFrame construction.
    sql_context = SQLContext(sc)

    df = sql_context.read.format('com.databricks.spark.csv') \
        .options(header='true', inferschema='true') \
        .load(dataPath)

    # Strip the "[" and "]" characters from the textual vector. A raw string
    # keeps the regex escapes intact without relying on Python passing
    # unknown escape sequences through (deprecated behaviour).
    without_brackets = regexp_replace(df['vector'], r"[\]\[]", "")

    # Split on the two-space separator and cast the pieces to doubles.
    parsed = split(without_brackets, "  ").cast("array<double>")

    df = df.select(
        df['id'], df['sentence'], df['vector'],
        parsed.alias("_vector"))

    # Turn each array of doubles into a dense vector. Exactly one output row
    # is produced per input row, so a plain map is sufficient.
    rows = df.rdd.map(lambda x: Row(
        x['id'], x['sentence'], x['vector'], Vectors.dense(x['_vector'])))

    # Positional Row fields come back as _1.._4; restore the real names.
    return sql_context.createDataFrame(rows) \
        .selectExpr("_1 as id",
                    "_2 as sentence",
                    "_3 as vector",
                    "_4 as _vector")