예제 #1
0
                      count_field="question_tags_count")

# Derive questioner_years_since_joining from the span between the questioner's
# account creation date and the question's creation date.
df = dc.set_years_between_dates(
    dataframe=df,
    start_date="questioner_account_creation_date",
    end_date="question_creation_date",
    years_between_field="questioner_years_since_joining",
)

# Substitute 0 wherever question_favorite_count has no value.
df = dc.fill_na(
    dataframe=df,
    field_name="question_favorite_count",
    fill_value=0,
)

# Create categorical feature question_view_quantile from question_view_count.
# NOTE(review): the exact meaning of levels/increment is defined inside
# DataCleaner.create_categorical_feature -- confirm there.
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="question_view_count",
    categorical_field="question_view_quantile",
    levels=10,
    increment=0,
)

# Add one binary column per (source column, flag column) pair, in this order.
for _base_field, _binary_field in (
    ("question_favorite_count", "question_favorited"),
    ("answer_count", "has_answer"),
):
    df = dc.create_binary_feature(
        dataframe=df,
        base_field=_base_field,
        binary_field=_binary_field,
    )

# Eyeball the new has_answer flag next to the column it was derived from.
_preview_columns = ("answer_count", "has_answer")
df.select(*_preview_columns).show(20)

# Add question_title_length computed from question_title.
df = dc.create_length_feature(
    dataframe=df,
    base_field="question_title",
    length_field="question_title_length",
)
# Remove records that lack any of the key question / questioner fields.
_required_fields = [
    "question_id",
    "questioner_id",
    "question_body_length",
    "questioner_reputation",
    "questioner_up_votes",
]
df = dc.drop_na_values(dataframe=df, field_names=_required_fields)

# Fix data types: counts, ids and scores are converted to int ...
_int_fields = [
    "question_body_length",
    "question_codeblock_count",
    "answer_count",
    "question_comment_count",
    "questioner_id",
    "questioner_up_votes",
    "questioner_down_votes",
    "accepted_answer_id",
    "questioner_reputation",
    "questioner_views",
    "max_answer_score",
]
df = dc.fix_data_type(
    dataframe=df,
    field_names=_int_fields,
    data_type="int",
)

# ... and the date columns are converted to timestamp.
_timestamp_fields = [
    "questioner_account_creation_date",
    "min_answer_creation_date",
]
df = dc.fix_data_type(
    dataframe=df,
    field_names=_timestamp_fields,
    data_type="timestamp",
)

# Add question_tags_count computed from question_tags.
df = dc.set_tag_count(
    dataframe=df,
    base_field="question_tags",
    count_field="question_tags_count",
)

# Derive questioner_years_since_joining from the span between the questioner's
# account creation date and the question's creation date.
df = dc.set_years_between_dates(
    dataframe=df,
    start_date="questioner_account_creation_date",
    end_date="question_creation_date",
    years_between_field="questioner_years_since_joining",
)

# Substitute 0 wherever question_favorite_count has no value.
df = dc.fill_na(
    dataframe=df,
    field_name="question_favorite_count",
    fill_value=0,
)

# Create categorical feature question_view_quantile from question_view_count.
# NOTE(review): the exact meaning of levels/increment is defined inside
# DataCleaner.create_categorical_feature -- confirm there.
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="question_view_count",
    categorical_field="question_view_quantile",
    levels=10,
    increment=0,
)

# Binary flags: question_favorited and has_answer.
df = dc.create_binary_feature(
    dataframe=df,
    base_field="question_favorite_count",
    binary_field="question_favorited",
)
df = dc.create_binary_feature(
    dataframe=df,
    base_field="answer_count",
    binary_field="has_answer",
)

# Eyeball the new has_answer flag next to the column it was derived from.
_answer_preview = ("answer_count", "has_answer")
df.select(*_answer_preview).show(20)

# Add question_title_length computed from question_title.
df = dc.create_length_feature(
    dataframe=df,
    base_field="question_title",
    length_field="question_title_length",
)

# Expand question_tags into per-tag columns (at most 5 tags).
df = dc.create_tag_columns(
    dataframe=df,
    base_field="question_tags",
    max_tag_count=5,
)

# Inspect the tag-related columns; tags_split and tag1..tag5 are presumably
# the columns create_tag_columns adds -- confirm against DataCleaner.
_tag_preview = (
    "question_tags",
    "question_tags_count",
    "tags_split",
    "tag1",
    "tag2",
    "tag3",
    "tag4",
    "tag5",
)
df.select(*_tag_preview).show(10)

# Add question_comment_level derived from question_comment_count.
df = dc.create_levels_column(
    dataframe=df,
    base_field="question_comment_count",
    levels_field="question_comment_level",
)

# Add question_score_level derived from question_score.
df = dc.create_valence_column(
    dataframe=df,
    base_field="question_score",
    valence_field="question_score_level",
)
# User-data cleaning pass (clean_user_data.py): load the users CSV, drop
# incomplete records, add a reputation quantile feature, report statistics
# and write the cleaned result back out.
dc = DataCleaner(sqlCtx)
df = dc.read_csv("/home/ubuntu/csv/so_bq_users.csv")

# Remove records that lack a user_id, user_display_name, user_reputation,
# questions_count, answers_count, or comments_count
df = dc.drop_na_values(
    dataframe=df,
    field_names=[
        "user_id",
        "user_display_name",
        "user_reputation",
        "questions_count",
        "answers_count",
        "comments_count",
    ],
)

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

# Create categorical feature user_reputation_quantile from user_reputation
# NOTE(review): the exact meaning of levels/increment is defined inside
# DataCleaner.create_categorical_feature -- confirm there.
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="user_reputation",
    categorical_field="user_reputation_quantile",
    levels=5,
    increment=1,
)

print("Number of records:", df.count())

# Show count, min, max, etc. for up to 4 columns at a time
dc.show_stats(dataframe=df, batch_size=4)

# Export output to file
# NOTE(review): an earlier revision passed a "file:///..." URI here; confirm
# that the bare path resolves to the intended (local) filesystem.
dc.write_output(
    dataframe=df,
    path="/home/ubuntu/csv/BigQueryUserOutputCleaner.csv",
)
print("Completed clean_user_data.py")

# Terminate with a success status. SystemExit is raised directly because the
# exit() helper is injected by the site module for interactive use and is not
# guaranteed to exist in every interpreter setup.
raise SystemExit(0)