Пример #1
0
"""Heroshi URL server WSGI application."""

from base64 import b64encode
import eventlet, eventlet.pools, eventlet.wsgi
import hashlib
try:
    import yajl as json
except ImportError:
    import json
import webob
import webob.exc

from heroshi import get_logger
log = get_logger("manager.server")
from heroshi.conf import settings
from heroshi.misc import gzip_string
from heroshi.wsgi import method_dispatcher
from .manager import Manager


# Name of the HTTP header used for authentication.
# NOTE(review): presumably compared against the API key sent by clients — confirm in the handlers.
AUTH_HEADER = "X-Heroshi-Auth"
# NOTE(review): presumably the minimum response body length (in bytes) worth
# passing through `gzip_string` — confirm where it is used.
MIN_COMPRESS_LENGTH = 400
# Pool holding at most one item; its `create` hook is replaced with the
# `Manager` class, so the pool builds a `Manager()` when it needs a new item.
manager_pool = eventlet.pools.Pool(max_size=1)
manager_pool.create = Manager


class Response(webob.Response):
    """`webob.Response` with project-wide defaults.

    Overrides two class-level defaults: the content type becomes
    ``text/plain`` and conditional responses are switched on.
    """

    default_conditional_response = True
    default_content_type = 'text/plain'

Пример #2
0
# coding: utf-8
"""Heroshi crawl reporter entry-point.

Reads \\n-separated list of JSON crawl results on stdin."""

import json
import logging, sys
from optparse import OptionParser

import heroshi, heroshi.api
from heroshi.log import update_loggers_level
log = heroshi.get_logger("cli_report")


def parse_params():
    """Parse the reporter's command-line options.

    Builds an `OptionParser` that knows ``-q/--quiet`` and ``-v/--verbose``
    (both boolean flags) plus the automatic ``--version`` option, then parses
    the process command line.

    Defaults are ``verbose=False``, ``quiet=False`` and ``forever=False``
    (the last one has no matching command-line switch here).

    Returns the ``(options, args)`` pair produced by `OptionParser.parse_args`.
    """
    parser = OptionParser(
        u"Usage: %prog [OPTION...]",
        version=u"Heroshi/" + heroshi.__version__,
    )
    parser.set_defaults(verbose=False, quiet=False, forever=False)

    # Register the two boolean switches.
    flags = (
        ('-q', '--quiet', u"Be quiet, don't generate any output"),
        ('-v', '--verbose', u"Be verbose, print detailed information"),
    )
    for short_name, long_name, description in flags:
        parser.add_option(short_name, long_name, action="store_true",
                          help=description)

    parsed_options, positional = parser.parse_args()
    return parsed_options, positional

def main():
    options, args = parse_params()

    # set up logging
    if options.quiet:
Пример #3
0
# coding: utf-8
"""Heroshi worker: IO-worker interaction.
"""
import errno
from eventlet import sleep, with_timeout
from eventlet.queue import Event
import json
import subprocess

from heroshi import error, get_logger
log = get_logger("worker.io")


class IoWorkerDead(error.Error):
    """Marker exception; adds nothing to `error.Error`.

    NOTE(review): by its name, presumably raised when the io-worker
    subprocess is gone — confirm against the raising code.
    """


class Worker(object):
    """IO worker.
    """

    def __init__(self, is_closed):
        self.is_closed = is_closed
        self.results = {}
        self.worker = None

    def run_loop(self):
        """Runs io-worker until it dies.

        You SHOULD spawn this function (so it runs in separate thread).
        """
        args = ["io-worker/io-worker", "-skip-robots"]
Пример #4
0
"""Heroshi URL server implementation main module."""

__all__ = ['Manager']

import datetime
import dateutil.parser
import eventlet, eventlet.pools, eventlet.queue
from eventlet import greenthread, spawn, sleep, Queue
eventlet.monkey_patch(all=False, socket=True, select=True, psycopg=True)
try:
    import yajl as json
except ImportError:
    import json

from heroshi import TIME_FORMAT, get_logger, log_exceptions
log = get_logger("manager")
from heroshi.conf import settings
from heroshi.data import Cache
from heroshi.misc import reraise_errors
from heroshi.profile import Profile
from heroshi.storage.postgres import StorageConnection


class Manager(object):
    """Class encapsulating Heroshi URL server state."""

    def __init__(self):
        self.active = False

        self.prefetch_queue = Queue(settings.prefetch['queue_size'])
        self.prefetch_thread = spawn(self.prefetch_worker)
Пример #5
0
"""Custom profiler implementation."""

import time

from heroshi import get_logger
log = get_logger("profile")


class Profile(object):
    """Wall-clock timer usable as a context manager or a decorator factory.

    Every measurement is written to the module logger at INFO level in the
    form ``"<name>: <N> ms"``.

    As a context manager the start time is kept on the instance
    (`start_time`), so a single instance should not be entered twice at the
    same time. Functions wrapped via `decorate` keep their start time in a
    local variable instead, so recursive or concurrent calls of a decorated
    function do not disturb each other's measurements.
    """

    def __init__(self, name):
        """Create a timer.

        :param name: label that prefixes every logged measurement.
        """
        self.name = name
        # Filled in by `__enter__`; stays None until the context is entered.
        self.start_time = None

    def __enter__(self):
        """Record the start time and return the timer itself.

        Returning `self` makes ``with Profile("x") as p:`` bind the timer
        rather than None.
        """
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        """Log the elapsed time; exceptions from the body are not suppressed."""
        self._report(self.start_time)
        return False

    def _report(self, started):
        """Log the time elapsed since `started` (a `time.time()` value)."""
        end = time.time()
        time_passed = end - started
        log.info(u"%s: %d ms", self.name, time_passed * 1000)

    def decorate(self):
        """Return a decorator that times every call of the wrapped function.

        The duration is logged whether the call returns or raises; the
        return value or exception of the wrapped function is passed through
        unchanged.
        """
        # Local import keeps this block self-contained.
        import functools

        def wrapper(func):
            # Preserve the wrapped function's name and docstring.
            @functools.wraps(func)
            def wrapped(*args, **kwargs):
                # Per-call start time: nested or concurrent calls must not
                # overwrite one another's measurement.
                started = time.time()
                try:
                    return func(*args, **kwargs)
                finally:
                    self._report(started)
            return wrapped
        return wrapper

Пример #6
0
sends crawl info back to queue server."""

from datetime import datetime
import eventlet
from eventlet import GreenPool, greenthread, sleep, spawn, with_timeout
from eventlet.queue import Empty, Queue
import httplib2
import json
import random, time, urllib, urlparse
import robotparser
import sys

from heroshi import TIME_FORMAT
from heroshi import api, error, get_logger

log = get_logger("worker.Crawler")
from heroshi.conf import settings
from heroshi.data import PoolMap
from heroshi.error import ApiError, CrawlError, FetchError, RobotsError
from heroshi.misc import reraise_errors
from heroshi.worker import io

eventlet.monkey_patch(all=False, os=True, socket=True, select=True)


class Stop(error.Error):
    """Marker exception; adds nothing to `error.Error`.

    NOTE(review): by its name, presumably used to signal that crawling
    should stop — confirm against the raising code.
    """


class Crawler(object):
    def __init__(self, max_connections, input_is_plain):
Пример #7
0
"""PostgreSQL storage backend for Heroshi."""

__all__ = ['StorageConnection']

import base64
from datetime import datetime
from itertools import imap
from functools import partial
import hashlib
import json
import psycopg2
import psycopg2.extensions
psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)

from heroshi import TIME_FORMAT, get_logger
log = get_logger("storage.postgres")
from heroshi.conf import settings
from heroshi.error import StorageError
from . import dbhelpers, sql


# NOTE(review): usage is not visible in this excerpt; presumably a factor used
# when randomizing which rows are selected — confirm in the queries below.
RANDOMIZER_K = 50
# NOTE(review): looks like a PostgreSQL interval literal; presumably the age
# after which a URL becomes due for re-checking — confirm.
RECHECK_INTERVAL = '2 days'
# NOTE(review): presumably the name of the table this backend works with — confirm.
TABLE = 'metadata'


def row_factory(columns, values):
    """Turn one result row into a dict with its JSON columns decoded.

    The row is first built by `dbhelpers.dict_factory(columns, values)`.
    Then:

    * the ``'var'`` entry is removed, parsed as JSON (a falsy value counts
      as ``"{}"``) and its keys are merged into the row;
    * the ``'headers'`` entry is replaced by its parsed JSON value (again a
      falsy value counts as ``"{}"``).

    Both ``'var'`` and ``'headers'`` must be present, otherwise `KeyError`
    is raised.
    """
    record = dbhelpers.dict_factory(columns, values)

    # Merge the extra attributes stored as JSON under 'var' into the row.
    raw_var = record.pop('var')
    extra = json.loads(raw_var or "{}")
    record.update(extra)

    # Read 'headers' only after the merge above, as keys from 'var' may
    # have overwritten it.
    raw_headers = record['headers']
    record['headers'] = json.loads(raw_headers or "{}")
    return record
Пример #8
0
Crawler uses these helpers to communicate with URL server."""

from eventlet.pools import Pool
import httplib2

try:
    import yajl as json
except ImportError:
    import json

import socket
from urllib import urlencode

from heroshi import get_logger
log = get_logger("api")
from heroshi.conf import settings
from heroshi.error import ApiError


# Pool holding at most two items; its `create` hook is replaced so that each
# new item is an `httplib2.Http` client constructed with timeout=20.
manager_connections = Pool(max_size=2)
manager_connections.create = lambda: httplib2.Http(timeout=20)


def request_manager(resource, method, data=None, headers=None):
    use_headers = {
        'User-Agent': settings.identity['user_agent'],
        'X-Heroshi-Auth': settings.api_key,
        'Expect': '', # a try to fix result: 100 not-ok problem
    }
    if headers is not None:
Пример #9
0
"""Heroshi Postgres database helpers."""

from psycopg2 import DatabaseError, IntegrityError

from heroshi import get_logger
log = get_logger("storage.dbhelpers")
from heroshi.error import StorageError
from . import sql


class DbRow(object):
    """Plain attribute container for a single database row.

    The class defines no fields of its own; `obj_factory` fills instances by
    replacing their ``__dict__`` with a column-name -> value mapping.
    """

    def __repr__(self):
        """Return ``<DbRow name=value, ...>`` listing every instance attribute."""
        # `.items()` rather than the Python 2-only `.iteritems()`, so the
        # repr works on both Python 2 and Python 3.
        pairs = ", ".join("%s=%s" % (n, v) for (n, v) in self.__dict__.items())
        return u"<DbRow %s>" % pairs


def obj_factory(columns, row):
    """Build a `DbRow` whose attributes are the row's columns.

    The column -> value mapping comes from `dict_factory(columns, row)` and
    becomes the new object's ``__dict__``.
    """
    instance = DbRow()
    attributes = dict_factory(columns, row)
    instance.__dict__ = attributes
    return instance

def dict_factory(columns, row):
    """Map each column name to the value at the same position in `row`.

    `row` is accessed by index, so it must have at least as many items as
    `columns` (otherwise `IndexError` is raised); surplus values are
    ignored. If a column name repeats, the last occurrence wins.
    """
    mapping = {}
    for position, name in enumerate(columns):
        mapping[name] = row[position]
    return mapping

def fetch(cursor, factory=None):
    """Fetches all results from cursor as *list* of `factory()` items.

    Default factory is `dict_factory`, it makes dict
    with column names as keys.

    You can also use `obj_factory`, it makes `DbRow` objects