    def write_party_name(self, csv_data, feed_ids):
        # Map each feed id to its flattened party name, pad the row with the
        # empty commentator columns, and write it as a single CSV row.
        processor = TextProcessor()
        feed_to_mk_id = {
            feed_id: processor.flatten_text(self.get_party_name_if_exists(feed_id),
                                            delimiter=DELIMITER)
            for feed_id in feed_ids
        }
        feed_to_mk_id = self.add_empty_commentator_rows(feed_to_mk_id,
                                                        commentator_id='MK_ID')
        csv_data.writerow(feed_to_mk_id)
Example No. 2
    def handle(self, *args, **options):
        """
        Executes processcommentcontent manage.py command.
        Receives one or more status ids.
        takes all comments for status(es) and saves processed_content field after text manipulation.
        """

        list_of_statuses = self.parse_statuses(args, options)
        processor = TextProcessor()
        # Without the 'workers' option, process every comment sequentially.
        i = 1
        if not options['workers']:
            for status in list_of_statuses:
                for comment in status.comments.all():
                    self.worker(i, comment, status, processor)
                    i += 1
        else:
            # With 'workers' set, fan the per-comment work out to a thread pool.
            with futures.ThreadPoolExecutor(
                    max_workers=options['workers']) as executor:
                for status in list_of_statuses:
                    for comment in status.comments.all():
                        executor.submit(self.worker, i, comment, status,
                                        processor)
                        i += 1
        info_msg = 'Successfully saved all statuses to db.'
        logger = logging.getLogger('django')
        logger.info(info_msg)
        self.stdout.write(info_msg)
    def handle(self, *args, **options):
        print('Start.')

        comments = self.parse_comments(options)
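        # Derive the output file name from the input path, e.g. foo.csv -> foo_full_data.csv.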
        f = open('{}_full_data.csv'.format(options['file_path'].split('.csv')[0]), 'wb')
        field_names = [
            'comment_id',
            'mk_id',
            'mk_name',
            'parent_status_id',
            'parent_status_content',
            'parent_status_link',
            'comment_link',
            'content',
            'content_processed',
            'published',
            'commentator_id',
            'commentator_also_liked_status',
            'like_count',
            'comment_count',
        ]
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
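        # Write the header row (equivalent to DictWriter.writeheader()).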
        headers = {field_name: field_name for field_name in field_names}
        csv_data.writerow(headers)

        processor = TextProcessor()

        for i, comment in enumerate(comments):
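            # Text pipeline: MK-name manipulation, optional translation, then emoji manipulation.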
            processed_text = processor.text_manipulation_mk_names(text=comment.content, context_status=comment.parent)
            if options['translate']:
                processed_text = processor.text_manipulation_translate_text(text=processed_text)
            processed_text = processor.text_manipulation_emojis(text=processed_text)
            print('writing comment {} of {}'.format(i + 1, comments.count()))
            dict_row = {
                'comment_id': comment.comment_id,
                'mk_id': comment.parent.feed.persona.content_object.id,
                'mk_name': processor.text_manipulation_flatten_text(comment.parent.feed.persona.content_object.name,
                                                                    delimiter=DELIMITER),
                'parent_status_id': comment.parent.status_id,
                'parent_status_content': processor.text_manipulation_flatten_text(comment.parent.content,
                                                                                  delimiter=DELIMITER),
                'parent_status_link': comment.parent.get_link,
                'comment_link': 'www.facebook.com/{}'.format(comment.comment_id),
                'content': processor.text_manipulation_flatten_text(comment.content, delimiter=DELIMITER),
                'content_processed': processor.text_manipulation_flatten_text(processed_text, delimiter=DELIMITER),
                'published': comment.published,
                'commentator_id': comment.comment_from.facebook_id,
                'commentator_also_liked_status': comment.comment_from.likes.filter(
                    status__status_id=comment.parent.status_id).exists(),
                'like_count': comment.like_count,
                'comment_count': comment.comment_count
            }
            csv_data.writerow(dict_row)

        f.close()
        print('Done.')
    def handle(self, *args, **options):
        print('Start.')

        file_name = 'content_only_{}.txt'.format(timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
        f = open(file_name, 'wb')
        field_names = [
            'content',
        ]
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        processor = TextProcessor()

        excluded_ids = []
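        # Optionally skip comment ids listed in the CSV given by 'exclude_from_path'.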
        if options['exclude_from_path']:
            with open(options['exclude_from_path'], 'rb') as g:
                r = DictReader(g)
                excluded_ids = [x['comment_id'] for x in r]

        i = 0
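        # Walk every top-level status (is_comment=False) and export each of its comments.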
        for status in Facebook_Status.objects_no_filters.filter(is_comment=False):
            for comment in status.comments.all():
                if comment.comment_id in excluded_ids:
                    continue
                if options['from_db']:
                    processed_text = comment.processed_content
                else:
                    processed_text = comment.content
                    processed_text = processor.replace_mk_names(text=processed_text,
                                                                context_status=comment.parent)
                if options['translate']:
                    processed_text = processor.request_translated_text_from_google(text=processed_text)
                processed_text = processor.replace_emojis_to_named_text(text=processed_text)
                print('writing comment {}'.format(i + 1))
                i += 1
                dict_row = {
                    'content': processor.flatten_text(processed_text, delimiter=DELIMITER),
                }
                csv_data.writerow(dict_row)

        f.close()
        print('Done.')
Example No. 7
    def handle(self, *args, **options):
        print('Start.')

        comments = self.parse_comments(options)
        file_name = '{}_full_data.csv'.format(options['file_path'].split('.csv')[0])
        if options['second_stage']:
            file_name = ''.join([file_name.split('.')[0], '_2nd_stage.', file_name.split('.')[1]])
        f = open(file_name, 'wb')
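        # Second-stage exports use the extended feature columns below; otherwise the raw comment fields are written.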
        if options['second_stage']:
            field_names = [
                'comment_id',
                'MK_ID',
                'mk_name',
                'post_status_id',
                'post_content',
                'post_link',
                'comment_link',
                'comment_content',
                'comment_content_processed',
                'comment_time_of_publication',
                'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'post_like_count',
                'post_comment_count',
                'post_share_count',
                'comment_like_count',
                'comment_comment_count',
                'comment_main_language',
                'POST_LEN_MESSAGE',
                'COMMENT_LEN_MESSAGE',
                'COMMENTATOR_LIKED_POST',
                'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
                'IDS_OF_MKS_MENTIONED_IN_COMMENT',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST',
                'COMMENTATOR_ID',
                'POLITICAL_WING_HATNUA_LEFT',
                'POLITICAL_WING_HATNUA_CENTER',
                'IS_COALITION',
                'PARTY_NAME',
                'IS_FEMALE',
                'AGE',
                'MK_POLITICAL_STATUS',
                'MK_POLITICAL_SENIORITY',
                'IS_CURRENT_OR_PAST_PARTY_LEADER',
                'IS_CURRENT_OR_PAST_PM_CANDIDATE',
                'IS_PM',
                'POST_PUBLICATION_TIMESTAMP',
                'POST_PUBLICATION_DATE',
                'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'POST_WITH_PHOTO',
                'POST_WITH_LINK',
                'POST_WITH_VIDEO',
                'POST_WITH_STATUS',
                'POST_WITH_TEXT_ONLY',
                'POST_IN_HEBREW',
                'POST_IN_ENGLISH',
                'POST_IN_ARABIC',
                'POST_IN_OTHER',
                'DAYS_FROM_ELECTION',
                'DAYS_FROM_THREE_TEENAGER_KIDNAP',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
                'DAYS_FROM_DUMA_ARSON_ATTACK',
                'DAYS_FROM_THIRD_INTIFADA_START_DATE',
                'DAYS_FROM_MK_BIRTHDAY',
                'POST_PUBLISHED_ON_SATURDAY',
                'COMMENT_PUBLISHED_ON_SATURDAY',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            ]

        else:
            field_names = [
                'comment_id',
                'mk_id',
                'mk_name',
                'parent_status_id',
                'parent_status_content',
                'parent_status_link',
                'comment_link',
                'content',
                'content_processed',
                'published',
                'commentator_id',
                'commentator_also_liked_status',
                'like_count',
                'comment_count',
                'language',
            ]
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        headers = {field_name: field_name for field_name in field_names}
        csv_data.writerow(headers)

        processor = TextProcessor()

        for i, comment in enumerate(comments):
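            # Process the comment text, then build either the second-stage or first-stage feature row.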
            processed_text = processor.replace_mk_names(text=comment.content, context_status=comment.parent)
            if options['translate']:
                processed_text = processor.request_translated_text_from_google(text=processed_text)
            processed_text = processor.replace_emojis_to_named_text(text=processed_text)
            print('writing comment {} of {}'.format(i + 1, comments.count()))
            if options['second_stage']:
                dict_row = self.get_second_stage_comment_features(comment, processed_text, processor)
            else:
                dict_row = self.get_first_stage_comment_features(comment, processed_text, processor)
            csv_data.writerow(dict_row)

        f.close()
        print('Done.')
Example No. 8
    def handle(self, *args, **options):
        print('Start.')

        comments_in_file = self.parse_comments(options)
        if options['all_comments']:
            file_name_part = 'all_comments'
        else:
            file_name_part = options['file_path'].split('.csv')[0]
        file_name = '{}_full_data.csv'.format(file_name_part)
        if options['second_stage']:
            file_name = ''.join([
                file_name.split('.')[0], '_2nd_stage.',
                file_name.split('.')[1]
            ])
        f = open(file_name, 'wb')
        if options['second_stage']:
            field_names = [
                'comment_id',
                'MK_ID',
                'mk_name',
                'post_status_id',
                'post_content',
                'post_link',
                'comment_link',
                'comment_content',
                'comment_content_processed',
                'comment_time_of_publication',
                'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'post_like_count',
                'post_comment_count',
                'post_share_count',
                'comment_like_count',
                'comment_comment_count',
                'comment_main_language',
                'POST_LEN_MESSAGE',
                'COMMENT_LEN_MESSAGE',
                'COMMENTATOR_LIKED_POST',
                'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
                'IDS_OF_MKS_MENTIONED_IN_COMMENT',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST',
                'COMMENTATOR_ID',
                'POLITICAL_WING_HATNUA_LEFT',
                'POLITICAL_WING_HATNUA_CENTER',
                'IS_COALITION',
                'PARTY_NAME',
                'IS_FEMALE',
                'AGE',
                'MK_POLITICAL_STATUS',
                'MK_POLITICAL_SENIORITY',
                'IS_CURRENT_OR_PAST_PARTY_LEADER',
                'IS_CURRENT_OR_PAST_PM_CANDIDATE',
                'IS_PM',
                'POST_PUBLICATION_TIMESTAMP',
                'POST_PUBLICATION_DATE',
                'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'POST_WITH_PHOTO',
                'POST_WITH_LINK',
                'POST_WITH_VIDEO',
                'POST_WITH_STATUS',
                'POST_WITH_TEXT_ONLY',
                'POST_IN_HEBREW',
                'POST_IN_ENGLISH',
                'POST_IN_ARABIC',
                'POST_IN_OTHER',
                'DAYS_FROM_ELECTION',
                'DAYS_FROM_THREE_TEENAGER_KIDNAP',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
                'DAYS_FROM_DUMA_ARSON_ATTACK',
                'DAYS_FROM_THIRD_INTIFADA_START_DATE',
                'DAYS_FROM_MK_BIRTHDAY',
                'POST_PUBLISHED_ON_SATURDAY',
                'COMMENT_PUBLISHED_ON_SATURDAY',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'is_train',
            ]

        else:
            field_names = [
                'comment_id',
                'mk_id',
                'mk_name',
                'parent_status_id',
                'parent_status_content',
                'parent_status_link',
                'comment_link',
                'content',
                'content_processed',
                'published',
                'commentator_id',
                'commentator_also_liked_status',
                'like_count',
                'comment_count',
                'language',
                'is_train',
            ]
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        headers = {field_name: field_name for field_name in field_names}
        csv_data.writerow(headers)

        processor = TextProcessor()

        if options['all_comments']:
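            # Walk every comment of every top-level status in the database.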
            i = 0
            for status in Facebook_Status.objects_no_filters.filter(
                    is_comment=False):
                for comment in status.comments.all():
                    print('writing comment {}'.format(i + 1))
                    i += 1
                    self.handle_comment(comment, comments_in_file, csv_data,
                                        options, processor)
        else:
            for i, comment in enumerate(comments_in_file):
                print('writing comment {} '
                      'of {}'.format(i + 1, comments_in_file.count()))
                self.handle_comment(comment, comments_in_file, csv_data,
                                    options, processor)

        f.close()
        print('Done.')