# Example #1
def process_comment_urls(udb, ulimit=100000, number_of_processes=4):
    """Extract URLs from comment bodies using a pool of worker processes.

    Repeatedly selects comments whose ``number_urls`` field is NULL, farms
    the ``(id, body)`` pairs out to ``url_worker`` child processes via a pair
    of queues, then stores the per-comment URL count and the Comment<->Url
    link rows returned by the workers.

    Args:
        udb: peewee database handle used for atomic transactions.
        ulimit: maximum number of comments to process; ``0`` means no limit.
        number_of_processes: number of ``url_worker`` children per batch.
    """
    print('---EXTRACTING COMMENT URLS')
    totalcompleted = 0
    if ulimit == 0:
        ulimit = None  # 0 is the caller's way of saying "process everything"
    total_to_process = Comment.select().where(
        Comment.number_urls.is_null()).count()
    if ulimit is not None and total_to_process > ulimit:
        total_to_process = ulimit
    with tqdm(total=total_to_process) as pbar:
        while totalcompleted < total_to_process:
            # Snapshot the next batch of unprocessed comments.
            with udb.atomic():
                queue_tasks = [(comment.id, comment.body)
                               for comment in Comment.select().where(
                                   Comment.number_urls.is_null()).limit(ulimit)
                               ]
            task_queue = Queue()
            done_queue = Queue()

            # Submit tasks before starting workers so they find work waiting.
            for task in queue_tasks:
                task_queue.put(task)

            # BUGFIX: keep references to the workers so they can be told to
            # stop and be joined even if the result loop raises — the original
            # discarded the Process objects and only sent STOP sentinels after
            # the loop, orphaning live children on KeyboardInterrupt/quit().
            workers = [
                Process(target=url_worker, args=(task_queue, done_queue))
                for _ in range(number_of_processes)
            ]
            for worker in workers:
                worker.start()

            try:
                for _ in range(len(queue_tasks)):
                    comment_id, url_set = done_queue.get()
                    try:
                        with udb.atomic():
                            Comment.update(number_urls=len(url_set)).where(
                                Comment.id == comment_id).execute()
                            for url in url_set:
                                url, urlcreated = Url.get_or_create(link=url)
                                try:
                                    CommentLinks.insert(
                                        comment=comment_id,
                                        url=url.id).on_conflict_ignore().execute()
                                except SQLError:
                                    # Log the offending pair before re-raising.
                                    print(comment_id, url.id)
                                    raise
                    except KeyboardInterrupt:
                        quit()

                    pbar.update(1)
                    totalcompleted += 1
            finally:
                # Always deliver one STOP sentinel per worker and wait for
                # them to exit so no child processes are left behind.
                for _ in range(number_of_processes):
                    task_queue.put('STOP')
                for worker in workers:
                    worker.join()
    """
Пример #2
0
def reddit_comment_update(appcfg, update_length=604800):
    """Refresh recently-retrieved comments with live data from the Reddit API.

    Comments whose ``retrieved_on`` timestamp falls within ``update_length``
    seconds of their ``created_utc`` are re-fetched in batches of 100
    fullnames via ``r.info`` and have their score and deleted flag updated.

    Args:
        appcfg: application config exposing a peewee ``database`` handle.
        update_length: staleness window in seconds; defaults to one week
            (604800 s).
    """
    print('     ---UPDATING COMMENTS WITH DATA FROM THE REDDIT API')
    totalnumber = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length).count()
    needs_update_list = list()
    needs_update = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length)
    print(
        '         ---Building Task List.  This could take a while for large subreddits'
    )

    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as nbar:
        for dbcomment in needs_update:
            # A Reddit comment "fullname" is its id prefixed with t1_.
            fullname = "t1_{}".format(dbcomment.comment_id)
            needs_update_list.append(fullname)
            nbar.update(1)
    # r.info() accepts at most 100 fullnames per request.
    needs_update_list = list(chunks(needs_update_list, 100))
    print(
        '         ---Accessing data from Reddit API and entering into database'
    )
    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as pbar:
        for nlist in needs_update_list:
            try:
                rd_comments = list(r.info(nlist))
            except RequestException:
                print("Connection Error to Reddit API. Exiting...")
                return
            with appcfg.database.atomic():
                for rdcomment in rd_comments:
                    updatedtime = arrow.now().timestamp
                    # BUGFIX: removed an orphaned triple-quoted string that
                    # held dead elif branches; it executed as a no-op
                    # expression statement inside the first branch and
                    # obscured the real two-way control flow below.
                    if rdcomment.author is None and rdcomment.body == '[deleted]':
                        # Author gone AND body scrubbed => truly deleted.
                        Comment.update(
                            score=rdcomment.score,
                            retrieved_on=updatedtime,
                            deleted=True).where(
                                Comment.comment_id == rdcomment.id).execute()
                    else:
                        Comment.update(
                            score=rdcomment.score,
                            retrieved_on=updatedtime,
                            deleted=False).where(
                                Comment.comment_id == rdcomment.id).execute()
                    pbar.update(1)
# Example #3
def process_comments(appcfg):
    """Harvest a subreddit's comments from the pushshift.io API.

    Runs two passes: newest-first from the latest comment already stored,
    then oldest-first below the earliest stored comment (to resume an
    interrupted crawl), and finally normalizes each comment's ``deleted``
    flag against the ``[deleted]`` author record.

    Args:
        appcfg: application config with ``subreddit``, ``newestdate`` and
            ``oldestdate`` attributes.

    Returns:
        The set of reddit.com permalink lines gathered, or ``None`` if the
        very first pushshift request failed at the connection level.
    """
    # Get newest comments with two week overlap
    print('   PROCESSING NEWEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)

    try:
        newest_utc = int(
            Comment.select(fn.MAX(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        # Empty table (scalar() is None) => start from the configured date.
        newest_utc = None
    if newest_utc is not None:
        oldestdate = newest_utc  # - 1209600  # two weeks overlap, in seconds
    else:
        oldestdate = appcfg.oldestdate

    try:
        comment_id_set = get_push_comments(appcfg, appcfg.newestdate,
                                           oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        comment_id_set = None
        print("     Connection Error for Pushshift API.  Quitting...")
        return comment_id_set

    # Get oldest comments in case progress was interrupted, with two week overlap
    try:
        oldest_utc = int(
            Comment.select(fn.MIN(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        oldest_utc = None
    if oldest_utc is not None:
        newestdate = oldest_utc  # + 1209600  # two weeks overlap, in seconds
    else:
        newestdate = appcfg.newestdate
    print('   PROCESSING OLDEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)

    try:
        old_comment_id_set = get_push_comments(appcfg, newestdate,
                                               appcfg.oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        old_comment_id_set = None
        print("     Connection Error for Pushshift API.  Quitting...")
        return old_comment_id_set
    comment_id_set |= old_comment_id_set
    filedate = arrow.now().timestamp
    basedir = "/rpa" if os.environ.get('DOCKER', '0') == '1' else '.'
    # Path computed for the (currently disabled) permalink dump below.
    coutput_file_path = "{basedir}/{subreddit}_comments_{timestamp}.txt".format(
        basedir=basedir, subreddit=appcfg.subreddit, timestamp=filedate)

    # with open(coutput_file_path, 'w', encoding='UTF-8') as comment_file:
    #     comment_file.writelines(comment_id_set)
    print("     Total comments submitted to", appcfg.subreddit, "in set:",
          len(comment_id_set))
    deleted = Author.get_or_none(name='[deleted]')
    if deleted is not None:
        # BUGFIX: the original used Python `or` between two peewee expression
        # nodes; expression nodes are always truthy, so `or` short-circuited
        # and `Comment.deleted == 0` never reached the generated SQL. Peewee
        # predicates must be combined with the `|` operator.
        cupdatet = Comment.update(deleted=True).where(
            (Comment.author == deleted.id)
            & (Comment.deleted.is_null() | (Comment.deleted == 0))).execute()
        print(
            '     Updated deleted field in comments.  Set deleted = True for',
            cupdatet, 'records.')
        cupdatef = Comment.update(
            deleted=False).where((Comment.author != deleted.id)
                                 & (Comment.deleted.is_null())).execute()
        print(
            '     Updated deleted field in comments.  Set deleted = False for',
            cupdatef, 'records.')
# Example #4
def get_push_comments(appcfg, newestdate, oldestdate):
    """Page through the pushshift.io comment API for one subreddit window.

    Walks backwards in time from ``newestdate`` towards ``oldestdate`` in
    pages of up to 500 comments, inserting each comment into the database
    (duplicates ignored) and collecting a reddit.com permalink line for it.

    Args:
        appcfg: application config with ``subreddit`` and a peewee
            ``database`` handle.
        newestdate: upper bound (epoch seconds) of the window; mutated
            locally to drive pagination.
        oldestdate: lower bound (epoch seconds) of the window.

    Returns:
        Set of ``https://www.reddit.com/comments/.../.json`` permalink
        strings (newline-terminated), possibly empty on API errors.
    """
    subnumber = 1
    sub, subcreated = Subreddit.get_or_create(name=appcfg.subreddit)
    sub_id = sub.id
    totalsubnumber = 0
    push_comment_id_set = set()
    # aggs=subreddit&size=0 returns only an aggregate document count, which
    # is used to size the progress bar.
    total_available = "https://api.pushshift.io/reddit/search/comment/?subreddit={subreddit}" \
                      "&after={oldestdate}&before={newestdate}&aggs=subreddit&size=0"
    turl = total_available.format(subreddit=appcfg.subreddit,
                                  oldestdate=oldestdate,
                                  newestdate=newestdate)
    # newestdate = appcfg.newestdate
    with requests.get(turl) as tp:
        if tp.status_code != 200:
            print("Connection Error for Pushshift API, quitting...")
            # quit()
            return push_comment_id_set
        tpush = tp.json()
    try:
        total_comments = tpush['aggs']['subreddit'][0]['doc_count']
    except (IndexError, KeyError):
        # No aggregate entry means nothing new in this window.
        print("     No new comments to process from pushshift API for",
              appcfg.subreddit)
        return push_comment_id_set
    linktemplate = "https://api.pushshift.io/reddit/search/comment/?subreddit={subreddit}" \
                   "&after={oldestdate}&before={newestdate}&sort=desc&size=500"
    with tqdm(total=total_comments, ncols=100, dynamic_ncols=False) as pbar:
        # Pagination: each page is sorted descending; `newestdate` is pulled
        # down to the oldest timestamp seen, so the next request fetches the
        # page below it. An empty page (subnumber == 0) ends the loop.
        while subnumber > 0:
            url = linktemplate.format(subreddit=appcfg.subreddit,
                                      oldestdate=oldestdate,
                                      newestdate=newestdate)
            with requests.get(url) as rp:
                try:
                    push = rp.json()
                except JSONDecodeError:
                    # Transient API hiccup: back off and retry the same page.
                    print("     JSON DECODE ERROR on Pushshift API Comments",
                          url)
                    time.sleep(10)
                    continue
                    # return push_comment_id_set
            subnumber = len(push['data'])
            totalsubnumber += subnumber
            commentlinktemplate = 'https://www.reddit.com/comments/{link_id}/_/{comment_id}/.json\n'
            with appcfg.database.atomic():
                for item in push['data']:
                    if 'id' not in item.keys():
                        print('The following item has no primary comment ID:',
                              item)
                        continue
                    else:
                        # Rename pushshift's 'id' to the model's 'comment_id'.
                        item['comment_id'] = item.pop('id')
                    try:
                        # Strip the t3_ fullname prefix to get the bare
                        # submission id for the permalink.
                        link_id = item['link_id']
                        item['link_id'] = link_id.replace('t3_', '')
                        commentlink = commentlinktemplate.format(
                            link_id=item['link_id'],
                            comment_id=item['comment_id'])
                        push_comment_id_set.add(commentlink)
                    except KeyError:
                        print('The following item has no submission link ID:',
                              item)
                        continue
                    # Advance the paging cursor to the oldest timestamp seen.
                    if item['created_utc'] < newestdate:
                        newestdate = item['created_utc']
                    item['subreddit'] = sub_id
                    if 'author_flair_text' in item.keys(
                    ) and item['author_flair_text'] is not None:
                        author_flair, author_flaircreated = AuthorFlair.get_or_create(
                            text=item['author_flair_text'])
                        item['author_flair'] = author_flair.id
                    else:
                        item['author_flair'] = None
                    author, author_created = Author.get_or_create(
                        name=item['author'])
                    item['author'] = author.id
                    # Keep only keys that map to Comment model fields.
                    itemfields = Comment._meta.fields.keys()
                    insertdict = dict()
                    for key in item.keys():
                        if key in itemfields:
                            insertdict[key] = item[key]
                    Comment.insert(insertdict).on_conflict_ignore().execute()
            pbar.update(subnumber)
    return push_comment_id_set