Пример #1
0
def checkLogin():
    """Verify that the stored reddit session is still valid.

    Loads cookie/modhash pairs from the ``session`` table, attaches them
    as headers on the shared opener, then polls reddit's me.json endpoint
    (with up to _['http_retries'] attempts) to confirm the session works.
    Returns early on missing session, hard HTTP errors (401/403/404),
    exhausted retries, or any exception while contacting reddit.
    """
    global isLoggedIn

    cur.execute("select cookie, modhash from session")
    # BUG FIX: "is 0" compares object identity, not value; use "==".
    if cur.rowcount == 0:
        # No stored session: nothing to check.
        return
    for s in cur.fetchall():
        opener.addheaders.append(('Cookie', 'reddit_session=%s' % s[0]))
        opener.addheaders.append(('X-Modhash', s[1]))

    try:
        success = False
        for i in range(_['http_retries']):
            f = opener.open('http://www.reddit.com/api/me.json')
            if f.getcode() == 200:
                success = True
                break
            else:
                log.write('Error %d for login status check attempt' % f.getcode(), 'error')
                # Auth/permission/not-found errors will not improve on retry.
                if f.getcode() in [401, 403, 404]:
                    return
                time.sleep(_['sleep'])

        if not success:
            log.write('Retries exhausted for login status check', 'error')
            return

        time.sleep(_['sleep'])

    except Exception as e:
        # BUG FIX: '%e' is the float (exponent) format specifier and raises
        # a TypeError on an exception object; '%s' is what was intended.
        log.write('Error checking login status: %s' % e, 'error')
        return
Пример #2
0
def getCommentTree(nodes, url, linkid, commentid, args, depth):
    """Recursively walk a parsed reddit comment tree and persist it.

    nodes     -- list of parsed JSON comment-tree nodes (dicts) to process
    url       -- base crawl URL, reused when traversing siblings/children
    linkid    -- id of the submission these comments belong to
    commentid -- id of the focal comment ("" when crawling a whole thread)
    args      -- query-string arguments forwarded to get()
    depth     -- current traversal depth, incremented on child traversal

    't1' nodes are upserted into table t1 and their replies recursed into;
    'Listing' nodes are unwrapped; 'more' stubs are expanded either via the
    morechildren API (small counts) or by re-crawling siblings/children,
    depending on the thresholds configured in _.  Side effects: commits or
    rolls back on the shared db connection, sleeps _['sleep'] between HTTP
    requests, and increments the global counter ccount per stored comment.
    """
    global ccount

    for node in nodes:
        try:
            if node is None:
                break

            elif node['kind'] == 't1':
                try:
                    # REPLACE acts as insert-or-update keyed on id, which is
                    # the base36-decoded comment id.
                    cur.execute("""replace into t1 (
                                    id,
                                    link_id,
                                    parent_id,
                                    body, 
                                    author,
                                    created,
                                    last_seen
                                ) values (%s, %s, %s, %s, %s, %s, now())""", (
                                    lib.base36decode(node['data']['id']), 
                                    node['data']['link_id'],
                                    node['data']['parent_id'], 
                                    node['data']['body'], 
                                    node['data']['author'], 
                                    datetime.datetime.fromtimestamp(node['data']['created_utc'])
                                ))
                    db.commit()
                    ccount += 1

                    # Skip recursion when 'replies' is the empty string
                    # (the code treats "" as "no children").
                    if node['data']['replies'] != "":
                        getCommentTree([node['data']['replies']], url, linkid, commentid, args, depth)

                except Exception, e:
                    log.write('Error storing t1_' + node['data']['id'] + ': %s' % e, 'error')
                    db.rollback()

            elif node['kind'] == "Listing":
                # A Listing merely wraps its children; recurse at same depth.
                getCommentTree(node['data']['children'], url, linkid, commentid, args, depth)

            elif node['kind'] == "more":
                # Small batches can be fetched in one morechildren API call.
                if _['autoget_lte_20'] and node['data']['count'] <= 20 and node['data']['count'] >= _['autoget_threshold']:
                    children = ",".join(node['data']['children'])
                    time.sleep(_['sleep'])
                    get('http://www.reddit.com/api/morechildren/', linkid, "", "api_type=json&depth=8&link_id=%s&children=%s" % (linkid, children), 0, True)

                elif node['data']['count'] >= _['comment_traverse_threshold']:
                    if node['data']['parent_id'] == linkid or node['data']['parent_id'] == commentid:
                        # Sibling traversal: fetch up to comment_siblings_total
                        # of the collapsed siblings individually.
                        breadth = 0
                        for child in node['data']['children']:
                            if breadth >= _['comment_siblings_total']:
                                break
                            time.sleep(_['sleep'])
                            get(url, linkid, child, args, depth)
                            breadth += 1
                    else:
                        # Child traversal: re-crawl from the parent one level
                        # deeper; parent_id is a fullname ('t1_xxx'), [3:]
                        # strips the type prefix.
                        time.sleep(_['sleep'])
                        get(url, linkid, node['data']['parent_id'][3:], args, depth + 1)
Пример #3
0
def _table_count(table):
    """Return the total row count of the given (internal, constant) table name."""
    cur.execute("select count(*) from " + table)
    return cur.fetchone()[0]


def _timing_summary(times):
    """Summarize a {'counts': [...], 'times': [...]} timing dict.

    Returns (locations, count, elapsed, rate) where rate is elapsed/count,
    guarded against division by zero when nothing was crawled.
    """
    locations = len(times['counts'])
    count = sum(times['counts'])
    elapsed = sum(times['times'])
    rate = elapsed / count if count else 0
    return locations, count, elapsed, rate


def printStats():
    """Log crawl statistics: per-category rates plus database totals."""
    locationTotal = _table_count("crawl_locations")

    linkLocations, linkCount, linkElapsedTime, linkRate = _timing_summary(linkTimes)
    linkTotal = _table_count("t3")

    commentLocations, commentCount, commentElapsedTime, commentRate = _timing_summary(commentTimes)
    commentTotal = _table_count("t1")

    responseLocations, responseCount, responseElapsedTime, responseRate = _timing_summary(responseTimes)
    responseTotal = _table_count("responses")

    # Wall-clock time since the crawler started (startTime is module-level).
    totalElapsed = time.time() - startTime

    log.write("%d link(s) / %f sec. (%f sec. ea.) in %d location(s)" % (linkCount, linkElapsedTime, linkRate, linkLocations), 'stat')
    log.write("%d comment(s) / %f sec. (%f sec. ea.) in %d thread(s)" % (commentCount, commentElapsedTime, commentRate, commentLocations), 'stat')
    log.write("%d response(s) / %f sec. (%f sec. ea.) in %d thread(s)" % (responseCount, responseElapsedTime, responseRate, responseLocations), 'stat')
    log.write("%d location(s) / %d link(s) / %d comment(s) / %d responses currently in database" % (locationTotal, linkTotal, commentTotal, responseTotal), 'stat')
    log.write("Execution took %f sec. (%f minutes)" % (totalElapsed, totalElapsed / 60), 'stat')
Пример #4
0
def build(crawl_subreddits, crawl_urls):
    """Build and persist the list of crawl locations.

    crawl_subreddits -- list of dicts with 'subreddit' and 'sort' keys;
                        each (subreddit, sort) pair yields one listing URL
                        ("all" maps to the subreddit's default listing).
    crawl_urls       -- extra URLs crawled verbatim (".json" is appended).

    For every URL: bumps last_seen if it already exists in crawl_locations,
    otherwise inserts a fresh row with last_crawled = 0.  Commits per URL;
    logs and rolls back on error.
    """
    urls = []

    log.write("Building location list...", 'message')

    for subreddit in crawl_subreddits:
        for sort in subreddit['sort']:
            if sort == "all": sort = ""
            urls.append("http://www.reddit.com/r/" + subreddit['subreddit'] + "/" + sort + ".json")

    for url in crawl_urls:
        urls.append(url + ".json")

    for url in urls:
        try:
            cur.execute("select id from crawl_locations where url = %s", (url,))
            if cur.rowcount > 0:
                cur.execute("update crawl_locations set last_seen = now() where url = %s", (url,))
            else:
                # BUG FIX: parameters must be a tuple -- "(url)" is just a
                # parenthesized string, not a one-element tuple, so the
                # driver receives a bare string instead of a parameter list.
                cur.execute("""insert into crawl_locations (
                                url,
                                last_seen,
                                last_crawled
                            ) values (%s, now(), 0)""", (url,))
            db.commit()

        except Exception as e:
            log.write('Error storing location: ' + url + ': %s' % e, 'error')
            db.rollback()
            
Пример #5
0
from conf import _

from init import db, cur
import comments
import lib
import links
import locations
import log
import respond
import stats
import user

# Delete old links and comments
# NOTE(review): `argv` is presumably sys.argv imported earlier in the file —
# confirm; it is not imported in this excerpt.
if 'runall' in argv or 'cleanup' in argv:
    # Retention settings of -1 mean "never delete".
    if _['delete_links_after'] > -1: cur.execute("delete from t3 where created < date_sub(now(), interval %s second)", (_['delete_links_after'],))
    if _['delete_comments_after'] > -1: cur.execute("delete from t1 where created < date_sub(now(), interval %s second)", (_['delete_comments_after'],))
    db.commit();

# Build/store locations to retrieve links
if 'runall' in argv or 'locations' in argv:
    locations.build(_['crawl_subreddits'], _['crawl_urls'])

# Crawls URLS from locations
if 'runall' in argv or 'links' in argv:
    # Only re-crawl locations whose last crawl is older than find_links_after
    # seconds; mark each as crawled immediately after fetching it.
    cur.execute("select id, url from crawl_locations where last_crawled < date_sub(now(), interval %s second)", (_['find_links_after'],))
    for l in cur.fetchall():
        links.get("%s?limit=%d" % (l[1], _['links_per_page']))
        cur.execute("update crawl_locations set last_crawled = now() where id = %s", (l[0],))
        db.commit()
Пример #6
0
        rJSON = f.read()
        f.close()

        try: links = json.loads(rJSON)
        except Exception, e:
            log.write('Error parsing links url: %s - %s' % (finalUrl, e), 'error')
            return

        after = links['data']['after']

        for l in links['data']['children']:
            try:
                if l['kind'] == 't3':
                    try:
                        cur.execute("select id from t3 where id = %s", (lib.base36decode(l['data']['id']),))
                        if cur.rowcount > 0:
                            cur.execute("update t3 set last_seen = now() where id = %s", (lib.base36decode(l['data']['id']),))
                        else:
                            if l['data']['is_self']: content = l['data']['selftext']
                            else: content = None;
                            
                            cur.execute("""insert into t3 (
                                            id, 
                                            title, 
                                            url, 
                                            permalink, 
                                            content,
                                            author,
                                            created,
                                            last_seen,
Пример #7
0
    except Exception, e:
        log.write('Error checking login status: %e' %e, 'error')
        return

    rJSON = f.read()
    f.close()

    try: res = json.loads(rJSON)
    except Exception, e:
        log.write('Error parsing login status response: %s' % e, 'error')
        return

    if 'data' in res:
        opener.addheaders.append(('X-Modhash', res['data']['modhash']))
        try:
            cur.execute("update session set modhash = %s", (res['data']['modhash'],))
            db.commit()
        except Exception, e:
            log.write('Error updating modhash: %s' % e, 'error')
            return
        isLoggedIn = True


def login():
    global isLoggedIn
    
    log.write('Logging in user %s' % _['reddit_username'], 'message')

    try: 
        success = False
        for i in range(_['http_retries']):
Пример #8
0
from conf import _

from init import db, cur, opener
import lib
import log
import stats
import user
import userfunctions

# Matches whole quoted lines ("&gt; ..." — reddit's HTML-escaped markdown
# quote prefix), one per line, case-insensitively.
# NOTE(review): assumes `re` is imported earlier in the file — confirm; the
# import is not visible in this excerpt.
quotedRE = re.compile("^&gt;.*$", re.I | re.M)

# Pre-compile each configured rule's regex once so per-comment matching
# does not recompile in the hot path.
for i in range(len(_["rules"])):
    if "regex" in _["rules"][i]:
        _["rules"][i]["re"] = re.compile(_["rules"][i]["regex"], re.I | re.M)

# Things already responded to, fetched up front so nothing is answered twice.
cur.execute("select distinct thing_id from responses")
responses = cur.fetchall()

# Running count of responses produced during this run.
rcount = 0


def processComment(cid, body, author):
    for rule in _["rules"]:
        if "flags" in rule and "ignoreQuotedText" in rule["flags"]:
            body = re.sub(quotedRE, "", body)

        if "flags" not in rule or ("flags" in rule and "selftextOnly" not in rule["flags"]):
            if "user_function" in rule:
                try:
                    getattr(userfunctions, rule["user_function"])(cid, body, author)
                except Exception, e: