def getHNData(verbose=False, limit=100, sub="showstories"):
    """Fetch up to *limit* lower-cased story titles from a Hacker News listing.

    Traffic is recorded/replayed through hoverpy using a per-listing
    capture DB ("data/hn.<sub>.db").

    Parameters
    ----------
    verbose : bool
        When True, print each title as it is fetched.
    limit : int
        Maximum number of stories to request from the listing.
    sub : str
        Listing name: "showstories", "askstories", "jobstories"
        or "topstories".

    Returns
    -------
    list of str
        The fetched story titles, lower-cased.

    Raises
    ------
    KeyError
        If *sub* is not one of the supported listing names.
    """
    from hackernews import HackerNews
    from hackernews import settings
    import hoverpy
    import time
    dbpath = "data/hn.%s.db" % sub
    with hoverpy.HoverPy(recordMode="once", dbpath=dbpath) as hp:
        # When replaying (not capturing), point the client at plain HTTP
        # so hoverpy can serve the recorded responses.
        if hp.mode() != "capture":
            settings.supported_api_versions[
                "v0"] = "http://hacker-news.firebaseio.com/v0/"
        hn = HackerNews()
        titles = []
        print("GETTING HACKERNEWS %s DATA" % sub)
        # Map listing name to the corresponding client method.
        subs = {
            "showstories": hn.show_stories,
            "askstories": hn.ask_stories,
            "jobstories": hn.job_stories,
            "topstories": hn.top_stories
        }
        start = time.time()
        for story_id in subs[sub](limit=limit):
            story = hn.get_item(story_id)
            if verbose:
                print(story.title.lower())
            titles.append(story.title.lower())
        print("got %i hackernews titles in %f seconds" %
              (len(titles), time.time() - start))
        return titles
def getRedditData(verbose=False, comments=True, limit=100, sub="all"):
    """Fetch hot-submission text from a subreddit through a hoverpy proxy.

    Traffic is recorded/replayed via a per-subreddit capture DB
    ("data/reddit.<sub>.db").

    Parameters
    ----------
    verbose : bool
        When True, print each submission's accumulated text.
    comments : bool
        When True, append the body of every comment to the title text.
    limit : int
        Maximum number of hot submissions to fetch.
    sub : str
        Subreddit name.

    Returns
    -------
    list of str
        One entry per submission: lower-cased title, optionally followed
        by all comment bodies.
    """
    import hoverpy
    import praw
    import time
    dbpath = "data/reddit.%s.db" % sub
    with hoverpy.HoverPy(recordMode='once', dbpath=dbpath,
                         httpsToHttp=True) as hp:
        titles = []
        # print() with a single argument behaves identically on Python 2
        # and 3; the original `print "..."` statement was a SyntaxError
        # on Python 3 and inconsistent with the rest of the file.
        print("GETTING REDDIT r/%s DATA" % sub)
        r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_",
                        http_proxy=hp.httpProxy(),
                        https_proxy=hp.httpProxy(),
                        validate_certs="off")
        if hp.mode() != "capture":
            # Replaying from the local DB: no need to rate-limit.
            r.config.api_request_delay = 0
        subreddit = r.get_subreddit(sub)
        for submission in subreddit.get_hot(limit=limit):
            text = submission.title.lower()
            if comments:
                flat_comments = praw.helpers.flatten_tree(submission.comments)
                for comment in flat_comments:
                    # MoreComments placeholders have no 'body' attribute.
                    if hasattr(comment, 'body'):
                        text += comment.body + " "
            if verbose:
                print(text)
            titles.append(text)
        return titles
# Exemplo n.º 3
# 0
import time
import hoverpy
import requests
import os

# First run (no capture DB yet) records over HTTPS; once "hn.db" exists,
# replay against plain HTTP so hoverpy can intercept the traffic.
prot = "https"
if os.path.isfile("hn.db"):
    prot = "http"

with hoverpy.HoverPy(recordMode='once', dbpath='hn.db') as hp:
    print("started hoverpy in %s mode" % hp.mode())
    start = time.time()
    base = "%s://hacker-news.firebaseio.com/v0" % prot
    # Fetch the top-story id list, then each story item in turn.
    top = requests.get(base + "/topstories.json")
    for story_id in top.json():
        item = requests.get("%s/item/%i.json" % (base, story_id)).json()
        print(item["title"])
    print("got articles in %f seconds" % (time.time() - start))
# Exemplo n.º 4
# 0
import hoverpy
import praw
import os
import time

sub = "python"
db = ("%s.db" % sub)
# First run (no DB yet) captures live traffic; later runs replay it.
capture = not os.path.isfile(db)

with hoverpy.HoverPy(dbpath=db, recordMode='once') as hp:
    start = time.time()
    titles = []
    # print() with one argument works on both Python 2 and 3; the
    # original `print "..."` statement was Python-2-only syntax.
    print("GETTING REDDIT r/%s DATA" % sub)
    r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_",
                    http_proxy=hp.httpProxy(),
                    https_proxy=hp.httpsProxy(),
                    validate_certs="off")
    if not capture:
        # Replaying from the local DB: no need to rate-limit requests.
        r.config.api_request_delay = 0
    subreddit = r.get_subreddit(sub)
    for submission in subreddit.get_hot(limit=100):
        text = submission.title.lower()
        print(text)
        for comment in praw.helpers.flatten_tree(submission.comments):
            # MoreComments placeholders lack a 'body' attribute.
            if hasattr(comment, 'body'):
                text += comment.body + " "
        # BUG FIX: append inside the loop so every submission is kept.
        # Previously the append sat outside the loop, so only the last
        # submission's text was stored and the count below was always 1.
        titles.append(text)
    print("got %i %s in %f" % (len(titles), sub, time.time() - start))
# Exemplo n.º 5
# 0
import time
import hoverpy
import requests

# First page of the Read the Docs v1 project listing.
rtd = "http://readthedocs.org/api/v1/project/?limit=50&offset=0&format=json"

with hoverpy.HoverPy(recordMode='once'):
    start = time.time()
    # Fetch the listing, then probe each project's detail URL in order.
    projects = requests.get(rtd).json()['objects']
    for project in projects:
        link = "http://readthedocs.org" + project['resource_uri']
        response = requests.get(link)
        print("url: %s, status code: %s" % (link, response.status_code))
    print("Time taken: %f" % (time.time() - start))