count_field="question_tags_count") df = dc.set_years_between_dates( dataframe=df, start_date="questioner_account_creation_date", end_date="question_creation_date", years_between_field="questioner_years_since_joining") df = dc.fill_na(dataframe=df, field_name="question_favorite_count", fill_value=0) # Create categorical feature question_view_quantile from question_view_count df = dc.create_categorical_feature(dataframe=df, base_field="question_view_count", categorical_field="question_view_quantile", levels=10, increment=0) df = dc.create_binary_feature(dataframe=df, base_field="question_favorite_count", binary_field="question_favorited") df = dc.create_binary_feature(dataframe=df, base_field="answer_count", binary_field="has_answer") df.select("answer_count", "has_answer").show(20) df = dc.create_length_feature(dataframe=df, base_field="question_title", length_field="question_title_length")
# Question-level cleaning and feature derivation. `dc` (the cleaner helper)
# and `df` (the dataframe being transformed) must already be in scope.

# Rows with missing values in any of these fields are dropped.
required_question_fields = [
    "question_id",
    "questioner_id",
    "question_body_length",
    "questioner_reputation",
    "questioner_up_votes",
]
df = dc.drop_na_values(dataframe=df, field_names=required_question_fields)

# Fix data types
int_question_fields = [
    "question_body_length",
    "question_codeblock_count",
    "answer_count",
    "question_comment_count",
    "questioner_id",
    "questioner_up_votes",
    "questioner_down_votes",
    "accepted_answer_id",
    "questioner_reputation",
    "questioner_views",
    "max_answer_score",
]
timestamp_question_fields = [
    "questioner_account_creation_date",
    "min_answer_creation_date",
]
df = dc.fix_data_type(
    dataframe=df, field_names=int_question_fields, data_type='int'
)
df = dc.fix_data_type(
    dataframe=df, field_names=timestamp_question_fields, data_type="timestamp"
)

df = dc.set_tag_count(
    dataframe=df,
    base_field="question_tags",
    count_field="question_tags_count",
)

# NOTE(review): question_creation_date is used as end_date here but is not in
# the timestamp conversion list above -- confirm it is already a timestamp.
df = dc.set_years_between_dates(
    dataframe=df,
    start_date="questioner_account_creation_date",
    end_date="question_creation_date",
    years_between_field="questioner_years_since_joining",
)

df = dc.fill_na(
    dataframe=df, field_name="question_favorite_count", fill_value=0
)

# Create categorical feature question_view_quantile from question_view_count
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="question_view_count",
    categorical_field="question_view_quantile",
    levels=10,
    increment=0,
)

# Derive the binary flags in the same order as before:
# question_favorited first, then has_answer.
for source_field, flag_field in (
    ("question_favorite_count", "question_favorited"),
    ("answer_count", "has_answer"),
):
    df = dc.create_binary_feature(
        dataframe=df, base_field=source_field, binary_field=flag_field
    )

# Preview the new has_answer flag next to the count it was derived from.
answer_preview = df.select("answer_count", "has_answer")
answer_preview.show(20)

df = dc.create_length_feature(
    dataframe=df,
    base_field="question_title",
    length_field="question_title_length",
)

df = dc.create_tag_columns(
    dataframe=df, base_field="question_tags", max_tag_count=5
)

# Preview the tag-related columns after create_tag_columns has run.
tag_preview_columns = [
    "question_tags",
    "question_tags_count",
    "tags_split",
    "tag1",
    "tag2",
    "tag3",
    "tag4",
    "tag5",
]
df.select(*tag_preview_columns).show(10)

df = dc.create_levels_column(
    dataframe=df,
    base_field="question_comment_count",
    levels_field="question_comment_level",
)
df = dc.create_valence_column(
    dataframe=df,
    base_field="question_score",
    valence_field="question_score_level",
)
# Clean the users CSV export and write the cleaned result back out as CSV.
# `DataCleaner` and `sqlCtx` must already be in scope at this point.
dc = DataCleaner(sqlCtx)
df = dc.read_csv("/home/ubuntu/csv/so_bq_users.csv")

# Remove records that lack a user_id, user_display_name, user_reputation,
# questions_count, answers_count, or comments_count
df = dc.drop_na_values(
    dataframe=df,
    field_names=[
        "user_id",
        "user_display_name",
        "user_reputation",
        "questions_count",
        "answers_count",
        "comments_count",
    ],
)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
df.printSchema()

# Create categorical feature user_reputation_quantile from user_reputation
df = dc.create_categorical_feature(
    dataframe=df,
    base_field="user_reputation",
    categorical_field="user_reputation_quantile",
    levels=5,
    increment=1,
)

print("Number of records:", df.count())

# Show count, min, max, etc. for up to 4 columns at a time
dc.show_stats(dataframe=df, batch_size=4)

# Export output to file
dc.write_output(
    dataframe=df,
    path="/home/ubuntu/csv/BigQueryUserOutputCleaner.csv",
)

print("Completed clean_user_data.py")

# exit() is a helper installed by the site module for interactive sessions;
# raising SystemExit is the equivalent that works without it or any import.
raise SystemExit(0)