def handle(self, *args, **options):
    """Export the processed text of every comment into a one-column CSV.

    Walks all non-comment Facebook statuses and writes each comment's
    flattened text as a single 'content' column.

    Options:
        exclude_from_path: optional CSV path; rows' 'comment_id' values
            name comments to skip.
        from_db: read the pre-processed content stored on the comment
            instead of the raw content.
        translate: pass the text through Google translation first.
    """
    print('Start.')
    file_name = 'content_only_{}.txt'.format(
        timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
    field_names = ['content']
    processor = TextProcessor()
    # A set gives O(1) membership tests; the original list made the
    # per-comment exclusion check O(n).
    excluded_ids = set()
    if options['exclude_from_path']:
        # NOTE(review): binary mode for csv is Python-2 style; on Python 3
        # the csv module requires text mode — confirm target interpreter.
        with open(options['exclude_from_path'], 'rb') as exclude_file:
            excluded_ids = {
                row['comment_id'] for row in DictReader(exclude_file)}
    # 'with' guarantees the output file is closed even if processing raises;
    # the original leaked the handle on any exception before f.close().
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        i = 0
        for status in Facebook_Status.objects_no_filters.filter(
                is_comment=False):
            for comment in status.comments.all():
                if comment.comment_id in excluded_ids:
                    continue
                if options['from_db']:
                    processed_text = comment.processed_content
                else:
                    processed_text = comment.content
                processed_text = processor.replace_mk_names(
                    text=processed_text, context_status=comment.parent)
                if options['translate']:
                    processed_text = \
                        processor.request_translated_text_from_google(
                            text=processed_text)
                processed_text = processor.replace_emojis_to_named_text(
                    text=processed_text)
                print('writing comment {}'.format(i + 1))
                i += 1
                csv_data.writerow({
                    'content': processor.flatten_text(
                        processed_text, delimiter=DELIMITER),
                })
    print('Done.')
def handle(self, *args, **options):
    """Export the processed text of every comment into a one-column CSV.

    Walks all non-comment Facebook statuses and writes each comment's
    flattened text as a single 'content' column.

    Options:
        exclude_from_path: optional CSV path; rows' 'comment_id' values
            name comments to skip.
        from_db: read the pre-processed content stored on the comment
            instead of the raw content.
        translate: pass the text through Google translation first.
    """
    print('Start.')
    file_name = 'content_only_{}.txt'.format(
        timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
    field_names = ['content']
    processor = TextProcessor()
    # A set gives O(1) membership tests; the original list made the
    # per-comment exclusion check O(n).
    excluded_ids = set()
    if options['exclude_from_path']:
        # NOTE(review): binary mode for csv is Python-2 style; on Python 3
        # the csv module requires text mode — confirm target interpreter.
        with open(options['exclude_from_path'], 'rb') as exclude_file:
            excluded_ids = {
                row['comment_id'] for row in DictReader(exclude_file)}
    # 'with' guarantees the output file is closed even if processing raises;
    # the original leaked the handle on any exception before f.close().
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        i = 0
        for status in Facebook_Status.objects_no_filters.filter(
                is_comment=False):
            for comment in status.comments.all():
                if comment.comment_id in excluded_ids:
                    continue
                if options['from_db']:
                    processed_text = comment.processed_content
                else:
                    processed_text = comment.content
                processed_text = processor.replace_mk_names(
                    text=processed_text, context_status=comment.parent)
                if options['translate']:
                    processed_text = \
                        processor.request_translated_text_from_google(
                            text=processed_text)
                processed_text = processor.replace_emojis_to_named_text(
                    text=processed_text)
                print('writing comment {}'.format(i + 1))
                i += 1
                csv_data.writerow({
                    'content': processor.flatten_text(
                        processed_text, delimiter=DELIMITER),
                })
    print('Done.')
def handle(self, *args, **options):
    """Export full per-comment feature rows to '<file_path>_full_data.csv'.

    Parses comments via self.parse_comments, then writes one row per
    comment using either the first-stage or second-stage feature schema.

    Options:
        file_path: input path; the output name is derived from it.
        second_stage: use the extended (second-stage) feature columns and
            append '_2nd_stage' to the output file name.
        translate: pass comment text through Google translation first.
    """
    print('Start.')
    comments = self.parse_comments(options)
    file_name = '{}_full_data.csv'.format(
        options['file_path'].split('.csv')[0])
    if options['second_stage']:
        base, ext = file_name.split('.')[0], file_name.split('.')[1]
        file_name = ''.join([base, '_2nd_stage.', ext])
    if options['second_stage']:
        field_names = [
            'comment_id', 'MK_ID', 'mk_name', 'post_status_id',
            'post_content', 'post_link', 'comment_link', 'comment_content',
            'comment_content_processed', 'comment_time_of_publication',
            'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'post_like_count', 'post_comment_count', 'post_share_count',
            'comment_like_count', 'comment_comment_count',
            'comment_main_language', 'POST_LEN_MESSAGE',
            'COMMENT_LEN_MESSAGE', 'COMMENTATOR_LIKED_POST',
            'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
            'IDS_OF_MKS_MENTIONED_IN_COMMENT',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST', 'COMMENTATOR_ID',
            'POLITICAL_WING_HATNUA_LEFT', 'POLITICAL_WING_HATNUA_CENTER',
            'IS_COALITION', 'PARTY_NAME', 'IS_FEMALE', 'AGE',
            'MK_POLITICAL_STATUS', 'MK_POLITICAL_SENIORITY',
            'IS_CURRENT_OR_PAST_PARTY_LEADER',
            'IS_CURRENT_OR_PAST_PM_CANDIDATE', 'IS_PM',
            'POST_PUBLICATION_TIMESTAMP', 'POST_PUBLICATION_DATE',
            'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'POST_WITH_PHOTO', 'POST_WITH_LINK', 'POST_WITH_VIDEO',
            'POST_WITH_STATUS', 'POST_WITH_TEXT_ONLY', 'POST_IN_HEBREW',
            'POST_IN_ENGLISH', 'POST_IN_ARABIC', 'POST_IN_OTHER',
            'DAYS_FROM_ELECTION', 'DAYS_FROM_THREE_TEENAGER_KIDNAP',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
            'DAYS_FROM_DUMA_ARSON_ATTACK',
            'DAYS_FROM_THIRD_INTIFADA_START_DATE', 'DAYS_FROM_MK_BIRTHDAY',
            'POST_PUBLISHED_ON_SATURDAY', 'COMMENT_PUBLISHED_ON_SATURDAY',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
        ]
    else:
        field_names = [
            'comment_id', 'mk_id', 'mk_name', 'parent_status_id',
            'parent_status_content', 'parent_status_link', 'comment_link',
            'content', 'content_processed', 'published', 'commentator_id',
            'commentator_also_liked_status', 'like_count', 'comment_count',
            'language',
        ]
    # 'with' guarantees the output file is closed even if a row raises;
    # the original leaked the handle on any exception before f.close().
    # NOTE(review): binary mode for csv is Python-2 style; on Python 3 the
    # csv module requires text mode — confirm target interpreter.
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        # writeheader() emits exactly the row the original built by hand
        # ({name: name for name in field_names}).
        csv_data.writeheader()
        processor = TextProcessor()
        # Hoisted out of the loop: in Django, QuerySet.count() issues a
        # fresh DB query on every call.
        total = comments.count()
        for i, comment in enumerate(comments):
            processed_text = processor.replace_mk_names(
                text=comment.content, context_status=comment.parent)
            if options['translate']:
                processed_text = \
                    processor.request_translated_text_from_google(
                        text=processed_text)
            processed_text = processor.replace_emojis_to_named_text(
                text=processed_text)
            print('writing comment {} of {}'.format(i + 1, total))
            if options['second_stage']:
                dict_row = self.get_second_stage_comment_features(
                    comment, processed_text, processor)
            else:
                dict_row = self.get_first_stage_comment_features(
                    comment, processed_text, processor)
            csv_data.writerow(dict_row)
    print('Done.')
def handle(self, *args, **options):
    """Export full per-comment feature rows to '<file_path>_full_data.csv'.

    Parses comments via self.parse_comments, then writes one row per
    comment using either the first-stage or second-stage feature schema.

    Options:
        file_path: input path; the output name is derived from it.
        second_stage: use the extended (second-stage) feature columns and
            append '_2nd_stage' to the output file name.
        translate: pass comment text through Google translation first.
    """
    print('Start.')
    comments = self.parse_comments(options)
    file_name = '{}_full_data.csv'.format(
        options['file_path'].split('.csv')[0])
    if options['second_stage']:
        base, ext = file_name.split('.')[0], file_name.split('.')[1]
        file_name = ''.join([base, '_2nd_stage.', ext])
    if options['second_stage']:
        field_names = [
            'comment_id', 'MK_ID', 'mk_name', 'post_status_id',
            'post_content', 'post_link', 'comment_link', 'comment_content',
            'comment_content_processed', 'comment_time_of_publication',
            'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'post_like_count', 'post_comment_count', 'post_share_count',
            'comment_like_count', 'comment_comment_count',
            'comment_main_language', 'POST_LEN_MESSAGE',
            'COMMENT_LEN_MESSAGE', 'COMMENTATOR_LIKED_POST',
            'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
            'IDS_OF_MKS_MENTIONED_IN_COMMENT',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST', 'COMMENTATOR_ID',
            'POLITICAL_WING_HATNUA_LEFT', 'POLITICAL_WING_HATNUA_CENTER',
            'IS_COALITION', 'PARTY_NAME', 'IS_FEMALE', 'AGE',
            'MK_POLITICAL_STATUS', 'MK_POLITICAL_SENIORITY',
            'IS_CURRENT_OR_PAST_PARTY_LEADER',
            'IS_CURRENT_OR_PAST_PM_CANDIDATE', 'IS_PM',
            'POST_PUBLICATION_TIMESTAMP', 'POST_PUBLICATION_DATE',
            'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'POST_WITH_PHOTO', 'POST_WITH_LINK', 'POST_WITH_VIDEO',
            'POST_WITH_STATUS', 'POST_WITH_TEXT_ONLY', 'POST_IN_HEBREW',
            'POST_IN_ENGLISH', 'POST_IN_ARABIC', 'POST_IN_OTHER',
            'DAYS_FROM_ELECTION', 'DAYS_FROM_THREE_TEENAGER_KIDNAP',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
            'DAYS_FROM_DUMA_ARSON_ATTACK',
            'DAYS_FROM_THIRD_INTIFADA_START_DATE', 'DAYS_FROM_MK_BIRTHDAY',
            'POST_PUBLISHED_ON_SATURDAY', 'COMMENT_PUBLISHED_ON_SATURDAY',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
        ]
    else:
        field_names = [
            'comment_id', 'mk_id', 'mk_name', 'parent_status_id',
            'parent_status_content', 'parent_status_link', 'comment_link',
            'content', 'content_processed', 'published', 'commentator_id',
            'commentator_also_liked_status', 'like_count', 'comment_count',
            'language',
        ]
    # 'with' guarantees the output file is closed even if a row raises;
    # the original leaked the handle on any exception before f.close().
    # NOTE(review): binary mode for csv is Python-2 style; on Python 3 the
    # csv module requires text mode — confirm target interpreter.
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        # writeheader() emits exactly the row the original built by hand
        # ({name: name for name in field_names}).
        csv_data.writeheader()
        processor = TextProcessor()
        # Hoisted out of the loop: in Django, QuerySet.count() issues a
        # fresh DB query on every call.
        total = comments.count()
        for i, comment in enumerate(comments):
            processed_text = processor.replace_mk_names(
                text=comment.content, context_status=comment.parent)
            if options['translate']:
                processed_text = \
                    processor.request_translated_text_from_google(
                        text=processed_text)
            processed_text = processor.replace_emojis_to_named_text(
                text=processed_text)
            print('writing comment {} of {}'.format(i + 1, total))
            if options['second_stage']:
                dict_row = self.get_second_stage_comment_features(
                    comment, processed_text, processor)
            else:
                dict_row = self.get_first_stage_comment_features(
                    comment, processed_text, processor)
            csv_data.writerow(dict_row)
    print('Done.')