Example #1
class JsonEncoderTestCase(unittest.TestCase):

    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"

        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds])]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
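The test above is shown without its imports; a minimal set it appears to rely on (standard Scrapy/Twisted module paths) would be:

import datetime
import json
import unittest
from decimal import Decimal

from twisted.internet import defer

from scrapy.http import Request, Response
from scrapy.utils.serialize import ScrapyJSONEncoder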
Example #2
class JsonEncoderTestCase(unittest.TestCase):
    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"

        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds])]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
Example #3
class KafkaPipeline(object):
    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)

        # Key is null, value is a JSON object, e.g.:
        # (null, {
        #     "category": "international",
        #     "title": "Its Charter Expired, Export-Import Bank Will Keep the Doors Open",
        #     "author": "By JACKIE CALMES",
        #     "spider": "NYtimes",
        #     "link": "http://www.nytimes.com/2015/07/01/business/international/though-charter-is-expiring-export-import-bank-will-keep-its-doors-open.html",
        #     "date": "June 30, 2015",
        #     "article": ["Advertisemen.."]
        # })

        self.producer.send(self.topic, msg)
        # Return the item so any later pipeline stages still receive it.
        return item

    @classmethod
    def from_settings(cls, settings):
        kafka_hosts = settings.get('SCRAPY_KAFKA_HOSTS')
        topic = settings['SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC']

        producer = KafkaProducer(bootstrap_servers=kafka_hosts)
        return cls(producer, topic)
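To wire the pipeline above into a project, the settings its from_settings reads would look roughly like this; the module path and topic name are placeholders, not from the original source:

# settings.py (sketch; module path and topic name are hypothetical)
SCRAPY_KAFKA_HOSTS = 'localhost:9092'
SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC = 'scraped-items'
ITEM_PIPELINES = {
    'myproject.pipelines.KafkaPipeline': 300,
}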
Example #4
class RedisPipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        ip_key = self.item_ip_key(item, spider)
        data = self.encoder.encode(item)
        self.server.lpush(key, data)
        self.server.lpush(ip_key, data)
        return item

    def item_key(self, item, spider):
        return "%s:items:all" % spider.name

    def item_ip_key(self, item, spider):
        return "%s:items:%s" % (spider.name, item['ip'])
Example #5
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, host, port, queue_type):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()
        self.queue_type = queue_type

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        queue_type = settings.get('QUEUE_TYPE', 'FIFO')
        return cls(host, port, queue_type)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        if self.queue_type == 'LIFO':
            self.server.lpush(key, data)
        else:
            self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
Example #6
class RedisPipeline(object):
  """Pushes serialized item into a redis list/queue"""

  def __init__(self, host, port):
    self.server = redis.Redis(host, port)
    self.encoder = ScrapyJSONEncoder()

  @classmethod
  def from_settings(cls, settings):
    host = settings.get('REDIS_HOST', 'localhost')
    port = settings.get('REDIS_PORT', 6379)
    return cls(host, port)

  def process_item(self, item, spider):
    return deferToThread(self._process_item, item, spider)

  def _process_item(self, item, spider):
    key = self.item_key(item, spider)
    data = self.encoder.encode(dict(item))
    self.server.rpush(key, data)
    return item

  def item_key(self, item, spider):
    """Returns redis key based on given spider"""
    return "%s:items" % spider.name
Example #7
class DockerhubExtension(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        self.job_path = crawler.settings.get('JOB_PATH')
        if not self.job_path:
            raise NotConfigured('no JOB_PATH set')

        self.json_encoder = ScrapyJSONEncoder()
        self.looping_call = LoopingCall(self.store_job_info)
        self.looping_call.start(5)
        crawler.signals.connect(self.store_job_info,
                                signal=signals.spider_closed)

    def store_job_info(self):
        with open(self.job_path, 'w') as f:
            stats = self.crawler.stats.get_stats()
            job_info = {
                'stats': stats
            }
            job_info_json = self.json_encoder.encode(job_info)
            f.write(job_info_json)
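Since the extension raises NotConfigured without JOB_PATH, enabling it takes something like the following; the path and module path are placeholders:

# settings.py (sketch)
JOB_PATH = '/tmp/job_info.json'   # hypothetical path
EXTENSIONS = {
    'myproject.extensions.DockerhubExtension': 500,
}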
Example #8
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
Example #9
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server, exchange_name):
        self.server = server
        self.exchange_name = exchange_name
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server, redis_server = connection.from_settings(settings)
        exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
        return cls(server, exchange_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange=self.exchange_name,
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
Example #10
class RabbitMQItemPublisherPipeline(object):
    def __init__(self, connect_url, exchange_name, routing_key, queue_name):
        self.connect_url = connect_url
        self.connection = RabbitMQConnection(connect_url,
                                             exchange_name=exchange_name,
                                             routing_key=routing_key,
                                             queue_name=queue_name)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            connect_url=crawler.settings.get("RABBITMQ_RESULT_URI"),
            exchange_name=crawler.settings.get("RABBITMQ_RESULT_EXCHANGE"),
            routing_key=crawler.settings.get("RABBITMQ_RESULT_ROUTING_KEY"),
            queue_name=crawler.settings.get("RABBITMQ_RESULT_QUEUE"),
        )

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        data = self.encoder.encode(item)
        self.connection.publish(body=data,
                                headers={'model': item.get('model', None)},
                                routing_key=getattr(spider,
                                                    'amqp_result_routing_key',
                                                    None))
        return item
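The from_crawler above expects the following settings; the values below are placeholders for illustration only:

# settings.py (sketch; all values are placeholders)
RABBITMQ_RESULT_URI = 'amqp://guest:guest@localhost:5672/%2F'
RABBITMQ_RESULT_EXCHANGE = 'results'
RABBITMQ_RESULT_ROUTING_KEY = 'items'
RABBITMQ_RESULT_QUEUE = 'items'
ITEM_PIPELINES = {
    'myproject.pipelines.RabbitMQItemPublisherPipeline': 300,
}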
Example #11
class RedisPipeline(object):
    """
    Pushes serialized item into a redis.
    Specific for SocialSpiders
    """

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.set(key, data.decode('utf-8'))
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "{}_{}".format(spider.name, item['search_name'])
Example #12
class ScrapyLearningPipeline(object):
    def __init__(self):
        kafka_ip_port = settings.KAFKA_IP_PORT
        kafka_topic = settings.KAFKA_TOPIC
        if len(kafka_ip_port) == 1:
            kafka_ip_port = kafka_ip_port[0]
        else:
            if isinstance(kafka_ip_port, list):
                kafka_ip_port = ",".join(kafka_ip_port)
            else:
                kafka_ip_port = kafka_ip_port
        self._client = KafkaClient(hosts=kafka_ip_port)
        self._producer = self._client.topics[kafka_topic.encode(
            encoding="UTF-8")].get_producer()
        self._encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self._encoder.encode(item)
        print(msg)
        self._producer.produce(msg.encode(encoding="UTF-8"))
        # self._producer.produce(item['url'].encode(encoding="UTF-8"))
        return item

    def close_spider(self, spider):
        self._producer.stop()
Example #13
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # There is a small difference between the behaviour of JsonItemExporter.indent
        # and ScrapyJSONEncoder.indent: ScrapyJSONEncoder.indent=None is needed to prevent
        # the addition of newlines everywhere.
        json_indent = self.indent if self.indent is not None and self.indent > 0 else None
        kwargs.setdefault('indent', json_indent)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def _beautify_newline(self):
        if self.indent is not None:
            self.file.write(b'\n')

    def start_exporting(self):
        self.file.write(b"[")
        self._beautify_newline()

    def finish_exporting(self):
        self._beautify_newline()
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
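The exporter writes bytes, so it can be exercised directly with an in-memory buffer; a minimal usage sketch, assuming dict items are accepted in place of Scrapy items (as in recent Scrapy versions):

from io import BytesIO

buf = BytesIO()
exporter = JsonItemExporter(buf, indent=2)
exporter.start_exporting()
exporter.export_item({'name': 'foo', 'price': 10})
exporter.export_item({'name': 'bar', 'price': 20})
exporter.finish_exporting()
print(buf.getvalue().decode('utf-8'))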
Example #14
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""
    def __init__(self, server, exchange_name):
        self.server = server
        self.exchange_name = exchange_name
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = from_settings(settings)
        exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
        return cls(server, exchange_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange=self.exchange_name,
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
Example #15
class RedisPipeline(object):
    """Pushes serialized item into a scrapy_redis list/queue"""

    def __init__(self, host, port):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", "localhost")
        port = settings.get("REDIS_PORT", 6379)
        return cls(host, port)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns scrapy_redis key based on given spider"""
        return "%s:items" % spider.name
Example #16
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange='',
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
Example #17
class APICallerPipeline(object):
    def __init__(self):
        pass

    def open_spider(self, spider):
        self.encoder = ScrapyJSONEncoder()

    def close_spider(self, spider):
        return True

    def process_item(self, item, spider):

        url = "http://127.0.0.1:5000/books"
        headers = {'Content-Type': 'application/json'}
        response_data = requests.request("POST",
                                         url,
                                         data=self.encoder.encode(item),
                                         headers=headers)

        if response_data.status_code == 201:
            print("The book '{}' has been saved into the database".format(
                item['title']))
        return item

    def checkIfAlreadyExists(self, item):
        pass
Example #18
class RedisPipeline(object):
    """
    Pushes serialized item into a redis.
    Specific for SocialSpiders
    """
    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.set(key, data.decode('utf-8'))
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "{}_{}".format(spider.name, item['search_name'])
Example #19
class JsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # There is a small difference between the behaviour of JsonItemExporter.indent
        # and ScrapyJSONEncoder.indent: ScrapyJSONEncoder.indent=None is needed to prevent
        # the addition of newlines everywhere.
        json_indent = self.indent if self.indent is not None and self.indent > 0 else None
        kwargs.setdefault('indent', json_indent)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def _beautify_newline(self):
        if self.indent is not None:
            self.file.write(b'\n')

    def start_exporting(self):
        self.file.write(b"[")
        self._beautify_newline()

    def finish_exporting(self):
        self._beautify_newline()
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
Example #20
class HadoopExporter(BaseItemExporter):
    def __init__(self, hadoop, **kwargs):
        #self.con = file_write.Connection()
        #self.con.connect(hadoop.ip, hadoop.port)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        #self.seq = file_write.SeqFileSaver(self.con, '/common/crawler/%s/' % hadoop.username.replace(".", "/"),
        #                                   1, '%s' % hadoop.username.replace(".", "_"))
        self.encoding = 'utf-8'
        self.fields_to_export = None
        self.export_empty_fields = False
        self.writer = SeqWriter(os.path.join(Utils.settings['SEQFILE_DIR'], hadoop.username.replace(".", "/")),
                                hadoop.username.replace(".", "_"))

    def close_file(self):
        print "close"
        self.writer.close()
        #self.seq.set_is_end()
        #self.con.close()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass

    def export_item(self, item):
        value = self.encoder.encode(dict(self._get_serialized_fields(item)))
        self.writer.writeData(
            item['key'] if 'key' in item else item['url'],
            value
        )
Example #21
class SQLitePipeline(object):
    """Pushes serialized item into a SQLite table"""
    def __init__(self, conn):
        self.conn = conn
        self.encoder = ScrapyJSONEncoder()
        #TODO: ensure table exists
        # c.execute('CREATE TABLE ? (data TEXT NULL)', (table,))

    @classmethod
    def from_crawler(cls, crawler):
        conn = connection.from_crawler(crawler)
        return cls(conn)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        table = self.items_table(item, spider)
        data = self.encoder.encode(item)
        self.conn.execute('INSERT INTO "%s" VALUES (?)' % table, (data, ))
        return item

    def items_table(self, item, spider):
        """Returns SQLite table name based on given spider"""
        return "%s_items" % spider.name
Example #22
class RedisStoragePipeline(object):
    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.encoder.encode(item)
        if isinstance(item, GubaPostListItem):
            key = self.item_key_list(item, spider)
        if isinstance(item, GubaPostDetailItem):
            key = self.item_key_detail(item, spider)
        self.server.rpush(key, data)

        return item

    def item_key_list(self, item, spider):
        stock_id = item['stock_id']
        return "%s:list_items" % stock_id

    def item_key_detail(self, item, spider):
        stock_id = item['stock_id']
        return "%s:detail_items" % stock_id
Example #23
class RestApiExporter(BaseItemExporter):

    def __init__(self, api_url, api_key, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.api_url = api_url
        self.headers = {"x-api-key": api_key}
        self._kwargs.setdefault("ensure_ascii", not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)

    def start_exporting(self):
        logger.debug(f"Start exporting to {self.api_url}")

    def finish_exporting(self):
        logger.debug(f"Done exporting")

    def export_item(self, item):
        item_dict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(item_dict) + "\n"
        logger.debug(f"PUT {self.api_url} - {data}")
        try:
            r = requests.put(url=self.api_url, headers=self.headers, data=data)
            logger.debug(f"Response: {r.headers}")
            r.raise_for_status()
        except requests.exceptions.HTTPError as err:
            logger.error(err)
Example #24
class RedisPipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        return "%s:items" % spider.name
Example #25
class RedisStoragePipeline(object):
    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.encoder.encode(item)
        if isinstance(item, GubaPostListItem):
            key = self.item_key_list(item, spider)
        if isinstance(item, GubaPostDetailItem):
            key = self.item_key_detail(item, spider)
        self.server.rpush(key, data)

        return item

    def item_key_list(self, item, spider):
        stock_id = item['stock_id']
        return "%s:list_items" % stock_id

    def item_key_detail(self, item, spider):
        stock_id = item['stock_id']
        return "%s:detail_items" % stock_id
Example #26
File: pipelines.py Project: stipid/ecolect
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""
    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
Example #27
class KafkaPipeline:

    stats_name = 'KafkaPipeline'

    def __init__(self, settings, stats):
        from pykafka.client import KafkaClient
        self.stats = stats
        self.settings = settings
        self.encoder = ScrapyJSONEncoder()
        self.kafka = KafkaClient(hosts=self.settings.get('KAFKA_HOST') + ":" +
                                 str(self.settings.get('KAFKA_PORT')))
        self.producer = self.kafka.topics[
            self.settings['KAFKA_TOPIC']].get_sync_producer(
                min_queued_messages=1)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler.stats)

    def process_item(self, item, spider):
        itemval = item if isinstance(item, dict) else dict(item)
        itemval['spider'] = spider.name
        msg = self.encoder.encode(itemval)
        self.producer.produce(msg)
        self.stats.inc_value('{}/produce'.format(self.stats_name),
                             spider=spider)
        logger.debug("Item sent to Kafka")
        return itemval
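A hedged settings sketch for the pykafka variant above; the values and module path are placeholders:

# settings.py (sketch; values are placeholders)
KAFKA_HOST = 'localhost'
KAFKA_PORT = 9092
KAFKA_TOPIC = b'scraped-items'   # bytes keeps older pykafka releases happy
ITEM_PIPELINES = {
    'myproject.pipelines.KafkaPipeline': 300,
}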
Example #28
File: pipelines.py Project: jinuoA/zzh_v3
class KafkaPipeline(object):
    """
    Publishes a serialized item into a Kafka topic
    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer
    :param topic: The Kafka topic being used
    :type topic: str or unicode
    """
    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()
        self.tmp_list = []
        self.time_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.agent_ip_port = self.getHostIp()

    def process_item(self, item, spider):
        """
        Overriden method to process the item
        :param item: Item being passed
        :type item: scrapy.item.Item
        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        item = dict(item)
        item['time_str'] = self.time_str
        item['agent_ip_port'] = self.getHostIp()
        item_title = item['item_title']
        # logger.info(item_title)
        if item_title:
            print(item_title)
            msg = self.encoder.encode(item)
            # msg = msg.encode('utf-8')
            # self.producer.send_messages(self.topic, msg)
        return item

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings
        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', '127.0.0.1:9092')
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'data-topic')
        client = SimpleClient(k_hosts)
        producer = SimpleProducer(client)
        return cls(producer, topic)

    def getHostIp(self):
        try:
            hostName = socket.getfqdn(socket.gethostname())
            hostAddr = socket.gethostbyname(hostName)
        finally:
            pass
        return hostAddr
Example #29
class RabbitMQItemPublisherPipeline(object):
    def __init__(self, host, port, user, password, virtual_host, exchange,
                 routing_key, queue):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.virtual_host = virtual_host
        credentials = pika.PlainCredentials(self.user, self.password)
        parameters = pika.ConnectionParameters(self.host, self.port,
                                               self.virtual_host, credentials)
        # Connecting to RabbitMQ
        self.connection = pika.BlockingConnection(parameters=parameters)
        self.channel = self.connection.channel()
        self.exchange = exchange
        self.routing_key = routing_key
        self.queue = queue
        # Declaring RabbitMQ exchange
        self.channel.exchange_declare(exchange=exchange,
                                      exchange_type="direct",
                                      durable=True)
        # Decaling RabbitMQ queue
        self.channel.queue_declare(queue=queue, durable=True)
        # Binding exchange + routing_key = queue
        self.channel.queue_bind(exchange=exchange,
                                routing_key=routing_key,
                                queue=queue)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_crawler(cls, crawler):
        # Creating a RabbitMQItemPublisherPipeline
        return cls(
            host=crawler.settings.get("RABBITMQ_HOST"),
            port=crawler.settings.get("RABBITMQ_PORT"),
            user=crawler.settings.get("RABBITMQ_USER"),
            password=crawler.settings.get("RABBITMQ_PASSWORD"),
            virtual_host=crawler.settings.get("RABBITMQ_VIRTUAL_HOST"),
            exchange=crawler.settings.get("RABBITMQ_EXCHANGE"),
            routing_key=crawler.settings.get("RABBITMQ_ROUTING_KEY"),
            queue=crawler.settings.get("RABBITMQ_QUEUE"),
        )

    def close_spider(self, spider):
        # Closing RabbitMQ channel and connection
        self.channel.close()
        self.connection.close()

    def process_item(self, item, spider):
        # Encoding item dict using Scrapy JSON Encoder
        data = self.encoder.encode(item)
        # Publishing item to exchange + routing_key = queue
        self.channel.basic_publish(
            exchange=self.exchange,
            routing_key=self.routing_key,
            body=data,
        )
        # Returning item to be processed
        return item
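The pika-based pipeline above pulls all of its connection details from settings; a sketch with placeholder values:

# settings.py (sketch; all values are placeholders)
RABBITMQ_HOST = 'localhost'
RABBITMQ_PORT = 5672
RABBITMQ_USER = 'guest'
RABBITMQ_PASSWORD = 'guest'
RABBITMQ_VIRTUAL_HOST = '/'
RABBITMQ_EXCHANGE = 'scrapy'
RABBITMQ_ROUTING_KEY = 'items'
RABBITMQ_QUEUE = 'items'
ITEM_PIPELINES = {
    'myproject.pipelines.RabbitMQItemPublisherPipeline': 300,
}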
Example #30
    def process_item(self, item, spider):
        url = "http://localhost:9200/articles/%s" % (item["publication"].lower())
        encoder = ScrapyJSONEncoder()
        json_body = encoder.encode(item)
        resp = requests.post(url, data=json_body)
        log.msg("Item added to elasticSearch node. Response: " + resp.text)

        return item
Example #31
File: exporters.py Project: voith/scrapy
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict) + "\n"))
Example #32
    def _persist_stats(self, stats, spider):
        encoder = ScrapyJSONEncoder()
        with open("stats.json", "w") as file:
            data = encoder.encode(stats)
            # simplejson.dump(data, file, indent=4)
            file.write(
                simplejson.dumps(simplejson.loads(data),
                                 indent=4,
                                 sort_keys=True))
Example #33
File: __init__.py Project: xacprod/ve1
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + '\n')
Example #34
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Example #35
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault("ensure_ascii", not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + "\n"
        self.file.write(to_bytes(data, self.encoding))
Example #36
class LetMeShopApiExporter(BaseItemExporter):
    api_end_point = ''
    method = 'POST'

    def __init__(self, api_base_url, auth_token, *args, **kwargs):
        super(LetMeShopApiExporter, self).__init__(*args, export_empty_fields=True, **kwargs)
        self.api_base_url = api_base_url
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.headers = {'Authorization': 'Token %s' % auth_token}

    def _fill_missing_fields(self, item, default_value=None):
        if self.fields_to_export is None:
            missing_keys = frozenset(item.fields.iterkeys()).difference(item.iterkeys())
        else:
            missing_keys = frozenset(self.fields_to_export).difference(item.iterkeys())

        for missing_key in missing_keys:
            item[missing_key] = item.fields[missing_key].get('default_value', default_value)

        return item

    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        if include_empty is None:
            include_empty = self.export_empty_fields

        if include_empty:
            item = self._fill_missing_fields(item, default_value)

        return super(LetMeShopApiExporter, self)._get_serialized_fields(item, default_value, include_empty)

    @property
    def request_url(self):
        return urljoin(self.api_base_url, self.api_end_point)

    def export_item(self, item_or_items):
        if isinstance(item_or_items, (list, tuple)):
            item_list = item_or_items
            serialized = [dict(self._get_serialized_fields(item)) for item in item_list]
        else:
            item = item_or_items
            serialized = dict(self._get_serialized_fields(item))

        serialized = snake_case_to_camel_case(serialized)
        payload = self.encoder.encode(serialized)

        r = requests.request(self.method, self.request_url, data=payload, headers=self.headers)
        r.raise_for_status()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass
Example #37
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Example #38
class KafkaPipeline(object):

    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode

    """

    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic

        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        """
        Overriden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        # put spider name in item
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        conn = SimpleProducer(kafka)
        return cls(conn, topic)
Example #39
class KafkaPipeline(object):
    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode

    """
    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic

        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        """
        Overriden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        # put spider name in item
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC',
                             'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        conn = SimpleProducer(kafka)
        return cls(conn, topic)
Example #40
File: all.py Project: kent013/football
    def insert_feed_job(self, session, job_type, feed, offset):
        job_count = session.query(CrawlerJobs).filter(CrawlerJobs.type == job_type, CrawlerJobs.feed_id == feed.id, CrawlerJobs.started_at == None).count()

        if job_count:
            return

        job = self.clone_job()
        job.type = job_type
        job.priority = 100
        job.target = feed.feed_url
        job.feed_id = feed.id
        encoder = ScrapyJSONEncoder()
        job.instruction = encoder.encode({'feed_id': feed.id, 'offset': offset})
        session.add(job)
Example #41
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    """ Allows exporting to JSON directly as Unicode. """
    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        kwargs["ensure_ascii"] = False
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + u"\n")

    def serialize_field(self, field, name, value):
        return value # DON'T call super version, this encodes the Unicode.
Example #42
File: exporters.py Project: bf96163/scrapy
class JsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(
            **self._kwargs)  # handles types plain json cannot, e.g. datetimes and sets

    def export_item(self, item):
        # serialize the selected fields and turn the item into a dict
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        # write to the file
        self.file.write(to_bytes(data, self.encoding))
Example #43
class JsonLinesItemSplitFileExporter(BaseItemExporter):

    """An item exporter to organize json lines into separate folders.

    Attributes:
        _configure (func): Uses to configure the Item Exporter by setting the options dictionary.
        encoder (ScrapyJSONEncoder): Encoder used to convert scrapy items into a json format line.

    """

    def __init__(self, **kwargs):
        """Initialize the configuration dictionary and encoder.

        Args:
            **kwargs: Arbitrary keyword arguments for the options dictionary.
        """
        # If dont_fail is set, it won't raise an exception on unexpected options
        self._configure(kwargs, dont_fail=True)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder()
        super(JsonLinesItemSplitFileExporter, self).__init__()

    def export_item(self, item):
        """Export Scrapy items to specific files based on the article_type.

        Args:
            item (scrapy.Item): A Scrapy item that contains a complete scraped information for an article/product.

        """
        # Serialize the item, and perform encoding to create a python dictionary
        item_dict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(item_dict) + os.linesep

        # If there is only one item in article_type, then the path (folders) would just be
        # scraped_data/spider.name/article_type. Otherwise we would combine all the article_type list except the last
        # item into a path, such as scraped_data/spider.name/article_type[0]/article_type[1], then the item would be
        # a json line placed in scraped_data/spider.name/article_type[0]/article_type[1]/article_type[2].jl.
        if len(item['article_type']) == 1:
            path = os.path.join("scraped_data", item["spider_name"])
            item_path = os.path.join(path, item['article_type'][0]) + ".jl"
        else:
            path = os.path.join(os.path.join("scraped_data", item["spider_name"]),
                                (os.path.join(*item['article_type'][:-1])))
            item_path = os.path.join(path, item['article_type'][-1]) + ".jl"
        if not os.path.exists(path):
            os.makedirs(path)

        # Write in append and byte mode, closing the file afterwards
        with open(item_path, 'a+b') as item_file:
            item_file.write(to_bytes(data, self.encoding))
Example #44
async def get_quotes(request, modo, orden, query):
    # CORS
    request.setHeader('Access-Control-Allow-Origin', '*')
    request.setHeader('Access-Control-Allow-Methods', 'GET')
    request.setHeader('Access-Control-Allow-Headers',
                      'x-prototype-version,x-requested-with')
    request.setHeader('Access-Control-Max-Age', "2520")

    runner = SpiderRunner()
    output_data = []
    filtros = None
    rango_minimo, rango_maximo = 0, math.inf
    sitios_a_buscar = SITES_TO_SEARCH

    if b"filtro" in request.args:
        filtros = [filtro.decode("utf-8")
                   for filtro in request.args[b"filtro"]]

    if b"rango" in request.args:
        rango_minimo, rango_maximo = request.args[b"rango"]
        rango_minimo = int(rango_minimo.decode("utf-8"))
        rango_maximo = int(rango_maximo.decode("utf-8"))
        rango_maximo = math.inf if rango_maximo == -1 else rango_maximo

    if filtros is not None:
        sitios_a_buscar = [
            sitio for sitio in sitios_a_buscar if sitio not in filtros]

    _encoder = ScrapyJSONEncoder(ensure_ascii=True)
    for site in sitios_a_buscar:
        if site == "steampowered":
            results = await runner.crawl(SteamSpider, modo=modo, query=query, url_search=CONFIG_SITE[site]["url_search"], rango=(rango_minimo, rango_maximo))
        elif site == "nuuvem":
            results = await runner.crawl(NuuvemSpider, modo=modo, query=query, url_search=CONFIG_SITE[site]["url_search"], rango=(rango_minimo, rango_maximo))
        elif site == "gog":
            results = await runner.crawl(GOGSpider, modo=modo, query=query, url_search=CONFIG_SITE[site]["url_search"], rango=(rango_minimo, rango_maximo))
        elif site == "gamesplanet":
            results = await runner.crawl(GamesPlantetSpider, modo=modo, query=query, url_search=CONFIG_SITE[site]["url_search"], rango=(rango_minimo, rango_maximo))
        output = return_spider_output(results, output_data, site)
        output_data = output
    if orden != "default":
        if orden != "relevancia":
            tipo_orden, indice_orden = orden.split("_")
        else: 
            tipo_orden = orden
            indice_orden = ""
        output_data = sort_data(output_data, tipo_orden, indice_orden, query)

    return _encoder.encode(output_data)
Example #45
class KafkaPipeline(object):

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)
        return item

    @classmethod
    def from_settings(cls, settings):
        # The body is truncated in the source; a sketch following examples #38/#39:
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        producer = SimpleProducer(kafka)
        return cls(producer, topic)
Example #46
class SortedJsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('indent', 4)
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)
        self.items = []

    def export_item(self, item):
        self.items.append(dict(self._get_serialized_fields(item)))

    def finish_exporting(self):
        data = self.encoder.encode(sorted(self.items, key=sort_key))
        self.file.write(to_bytes(data, self.encoding))
Example #47
class AddItemPipeline(object):
    """ Pushes serialized item into a RQ """

    def __init__(self, host, port, db, queue_name, store_id):
        self.encoder = ScrapyJSONEncoder()
        self.store_id = store_id
        self.queue_name = queue_name

        self.server = redis.Redis(host, port, db)
        self.queue = rq.Queue(queue_name, connection=self.server)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        db = settings.get('REDIS_DB', 0)
        queue_name = settings.get('RQ_QUEUE', 'default')
        store_id = int(settings.get('STORE', 0))
        return cls(host, port, db, queue_name, store_id)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        ''' ''' # {{{

        ## get global Store URL mapping
        store_id = self.store_id
        if store_id == 0:
            raise DropItem('Not set the store and no Store URL mapping')

        ## assign queue parameters
        item['store_id'] = store_id
        callback = 'worker.save_product_to_db'
        event = self.encoder.encode(dict(queue=self.queue_name, value=item, time=time.time()))

        ## push item to redis queue
        self.queue.enqueue(callback, event)
        
        return item
Example #48
class SendToBrokerPipeline(object):

	def __init__(self):
		self.publisher = Publisher('data_distributor')
		self.encoder = ScrapyJSONEncoder()
	

	def process_item(self, item, spider):
		#Runs sending broker in separate thread to prevent it from blocking
		#on single items
		return deferToThread(self._process_item, item, spider)

	def _process_item(self, item, spider):

		item_dict = dict(item)

		data = self.encoder.encode(item_dict)
		self.publisher.send_message(data,'articles')
		return item
Example #49
File: exporters.py Project: voith/scrapy
class JsonItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[")

    def finish_exporting(self):
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b",\n")
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict)))
Example #50
class JsonItemExporter(JsonLinesItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write("[")

    def finish_exporting(self):
        self.file.write("]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict))
Example #51
    def parse(self, response):

        items = []
        for row in response.xpath('//table[@class="TblDataRecs"]/tr'):
            item = mylanguageexchangeItem()
            name = row.xpath('td[@class="userdata"]//a//b/text()').extract()
            item["name"] = [x.strip() for x in name]
            country = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[1]/td/text()').extract()
            item["country"] = [x.strip() for x in country]
            city = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[2]/td/text()').extract()
            item["city"] = [x.strip().strip('()') for x in city]
            native = row.xpath('td[@class="userdata"][@data-th="Native Language"]//td/text()').extract()
            item["native"] = [x.strip() for x in native]
            practicing = row.xpath('td[@class="userdata"][@data-th="Practicing Language"]//td/text()').extract()
            item["practicing"] = [x.strip() for x in practicing]
            desc = row.xpath('td[@class="userdata"][@data-th="Description"]//td/text()').extract()
            item["desc"] = [x.strip() for x in desc]
            items.append(item)

        _encoder = ScrapyJSONEncoder()
        with open('mylanguageexchange_crawled.json', 'w') as outfile:
            outfile.write(_encoder.encode(items))
Example #52
File: exporters.py Project: 0daybug/scrapy
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[\n")

    def finish_exporting(self):
        self.file.write(b"\n]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
Example #53
class StatusMailer(object):
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        files = []

        for name, compressed in self.files.items():
            files.append((name + compressed.extension, compressed.mimetype, compressed))

        try:
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0

        body='''Crawl statistics:

 - Spider name: {0}
 - Spider finished at: {1}
 - Number of items scraped: {2}
 - Number of errors: {3}
 - Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
Example #54
class ShopwareRESTPipeline(object):

	def __init__(self):
		"""
		Connect to Shopware REST Api using HTTP digest authentication.
		We need an ADMIN role with sufficient access to insert articles.
		Shopware4 (german) API Guide: http://wiki.shopware.de/_detail_861_487.html
		"""
		self.name         = settings['SHOPWARE_SERVICE_NAME']
		self.api_url      = settings['SHOPWARE_API_BASE']
		self.access_token = settings['SHOPWARE_TOKEN_KEY']

		self.request_headers = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'application/json'}
		self.encoder = ScrapyJSONEncoder()

		self.node = {}

		# shopware minimal default item
		self.default_item = RestItem({
			'taxId': 1,
			#'tax': 19,
			'name': 'nexus',
			'mainDetail': {
				'number': 'nex24',
				'prices': [{
					'customerGroupKey': 'EK',
					'basePrice': 16,
					'price': 20, # shop will add VAT (if configured that way)
				}],
			#	'attribute': {
			#		'supplier_url': 'http://example.net',
			#		'supplierUrl': 'http://example.net',
			#	#	'attr19': 'http://example.net',
			#	},
			},
			'active': True,
			'supplier': 'example com',
			'categories': [
				{'id': 5,},
				{'id': 3,},
			],
			'images': [{
				#'id': '1', ## this one is bugged in shopware (doesnt add image to article)
				#'mediaId': '1',
				# needs deduplication on update
				'link': 'http://shopware.local/templates/_emotion/frontend/_resources/images/logo.jpg',
			}],
			'attribute': {
				'attr19': 'http://example.net',
			},
			'description': 'Some Article',
			'descriptionLong': 'Some Article Description',
		})

	def open_spider(self, spider):
		# TODO: test with async / multiple spiders open
		# (attach session to spider? self.session[spider])
#		token = self.access_token, self.access_token_secret
#		self.session = self.oauth.get_session(token=token)
		self.requests = requests.Session()
		self.requests.auth = HTTPDigestAuth(self.name, self.access_token)

		# category mapping
		self.node[spider] = {}
		self.root = 4  # our root category id (could be read from a SHOPWARE_ROOT_NODE setting)
		r = self.requests.get(
			'%s/categories' % (self.api_url),
			headers=self.request_headers,
			allow_redirects=False,
		)
		if r.status_code != 200:
			raise CloseSpider('API not available')
		resp = json.loads(r.content)
		if not resp['success']:
			raise CloseSpider('API returned failure')

		# prefer an explicit spider.ident attribute, fall back to the spider name
		ident = getattr(spider, 'ident', None) or spider.name

		for cat in resp['data']:
			if cat['parentId'] != self.root:
				continue
			if cat['name'] == ident:
				self.node[spider] = cat

		if not self.node[spider]:
			# no matching category below the root node yet -- create one for this spider
		#	description = '[SCRAPER LOG] Category created at: %s' % datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
			payload = {'name': ident, 'active': False, 'parentId': self.root}
			r = self.requests.post('%s/categories' % (self.api_url), headers=self.request_headers, data=json.dumps(payload))
			resp = json.loads(r.content)
			r = self.requests.get(resp['data']['location'], headers=self.request_headers)
			resp = json.loads(r.content)
			self.node[spider] = resp['data']

#		print self.node[spider]['id']

#		# starting run...
#		description = '[SCRAPER LOG] Starting crawler run at: %s\n' % datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
#		description += self.api.catalog_category.info(self.node[spider].get('category_id'), 0, ['description']).get('description')
#		self.api.catalog_category.update(self.node[spider].get('category_id'), {'description': description, 'available_sort_by': 'name'})

	def close_spider(self, spider):
		# TODO: test with async / multiple spiders open
	#	self.session.close()

		# finished run
	#	description = self.api.catalog_category.info(self.node[spider].get('category_id'), 0, ['description']).get('description')
	#	description += '\n[SCRAPER] Finished crawler run at: %s' % datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
	#	#description += '\n[SCRAPER] -- Run statistics:\n%s' % crawler_output
	#	self.api.catalog_category.update(self.node[spider].get('category_id'), {'description': description, 'available_sort_by': 'name'})

		# drop this spider's category mapping (even if it stayed empty)
		self.node.pop(spider, None)

#	def create_item(self, item, spider):
#	def update_item(self, item, spider):

	def process_item(self, item, spider):
		"""
		Push scraped items straight into Shopware via the REST API --
		no CsvItemExporter detour, no direct-SQL shortcuts.
		"""
		# try an update (PUT on the article number) first; if it fails because the
		# article does not exist yet, fall back to an insert (POST).
		# (todo) extend the Shopware API to allow 'upserts' (POST creates _or_ updates if it exists)
		# NOTE: this currently serializes only the static default_item demo data;
		# merge the scraped item in (as in the commented line) to push real articles.
		itemdict = dict(self._get_serialized_fields(self.default_item))
#		itemdict = dict(self._get_serialized_fields(dict(item.items() + self.default_item.items())))
		payload = self.encoder.encode(itemdict)
		print(payload)  # debug: dump the JSON payload that will be sent

		# set proxies, user-agent?
		try:
			log.msg('API CALL: updating product', spider=spider, level=log.DEBUG)
			r = self.requests.put(
				'%s/articles/%s?useNumberAsId=1' % (self.api_url, itemdict['mainDetail']['number']), # API Endpoint
				headers=self.request_headers,
			#	proxies=proxies,
				allow_redirects=False,
				data=payload,
			)

			resp = r.content
			if r.status_code != 302:
				try:
					message = json.loads(r.content)['message']
				except (ValueError, KeyError, TypeError):
					message = r.content
				raise Fault('{0}: Update failed: {1}'.format(self.__class__.__name__, message))

			log.msg('{0}: Product updated: {1}: {2}'.format(self.__class__.__name__, itemdict['mainDetail']['number'], resp), spider=spider, level=log.INFO)
		except Fault as fault:
			#if fault.faultCode == 620: # wrong
			#	log.msg(fault.faultString, spider=spider, level=log.DEBUG)
			# Update failed: Article by id nex21 not found
			log.msg('API CALL: inserting product', spider=spider, level=log.DEBUG)
			r = self.requests.post(
				'%s/articles?useNumberAsId=1' % (self.api_url), # API Endpoint
				headers=self.request_headers,
			#	proxies=proxies,
				allow_redirects=False,
				data=payload,
			)

			resp = json.loads(r.content)
			if r.status_code == 201 and isinstance(resp, dict) and resp.get('success'):
				log.msg('{0}: Product inserted: "{1}" as id {2} at {3}'.format(self.__class__.__name__, itemdict['mainDetail']['number'], resp['data']['id'], resp['data']['location']), spider=spider, level=log.INFO)
			else:
				# insert failed as well -- log the API error message (error handling / DropItem could go here)
				message = resp.get('message', r.content) if isinstance(resp, dict) else r.content
				log.msg('{0}: Insert failed ({1}): {2}'.format(self.__class__.__name__, r.status_code, message), spider=spider, level=log.ERROR)

		# non-blocking async version
	#	f = self.requests.post
	#	d = threads.deferToThread(f,
	#		'%s/articles' % self.api_url, # API Endpoint
	#		headers=self.request_headers,
	#		allow_redirects=False,
	#		data=payload,
	#	) # (function, *args, **kwargs)
	#	d.addCallback(self.persist_test, info)
	#	d.addErrback(log.err, self.__class__.__name__ + '.image_downloaded')

		#r = requests.head(url)
		# messages:
		# {"success":true,"data":{"id":2,"location":"http:\/\/shopware.local\/api\/articles\/2"}}
		# {"success":false,"message":"Resource not found"}
		# {"success":false,"message":"Validation error","errors":["tax: This value should not be blank","mainDetail.number: This value should not be blank"]}
		# {"success":false,"message":"Tax by taxrate 1 not found"}
		# {"success":false,"message":"Errormesage: SQLSTATE[23000]: Integrity constraint violation: 1062 Duplicate entry 'sku' for key 'ordernumber'"}
		# -> retry as PUT (update)
		# {"success":false,"message":"Customer Group by key  not found"}

		return item

	# exporter-style defaults (these helpers mirror Scrapy's old BaseItemExporter)
	export_empty_fields = False
	fields_to_export = None
	encoding = 'utf-8'

	def _get_serialized_fields(self, item, default_value=None, include_empty=None):
		"""Return the fields to export as an iterable of tuples (name,
		serialized_value)
		"""
		if include_empty is None:
			include_empty = self.export_empty_fields
		if self.fields_to_export is None:
			if include_empty:
				field_iter = item.fields.iterkeys()
			else:
				field_iter = item.iterkeys()
		else:
			if include_empty:
				field_iter = self.fields_to_export
			else:
				nonempty_fields = set(item.keys())
				field_iter = (x for x in self.fields_to_export if x in nonempty_fields)
		for field_name in field_iter:
			if field_name in item:
				field = item.fields[field_name]
				value = self.serialize_field(field, field_name, item[field_name])
			else:
				value = default_value

			yield field_name, value

	def serialize_field(self, field, name, value):
		serializer = field.get('serializer', self._to_str_if_unicode)
		return serializer(value)

	def _to_str_if_unicode(self, value):
		return value.encode(self.encoding) if isinstance(value, unicode) else value
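
To actually use the pipeline above, it has to be wired into the project settings together with the three SHOPWARE_* values read in __init__. A minimal sketch with placeholder values (module path and credentials are assumptions):

# settings.py -- hypothetical wiring for ShopwareRESTPipeline
ITEM_PIPELINES = {
    'myproject.pipelines.ShopwareRESTPipeline': 300,   # placeholder module path
}

SHOPWARE_SERVICE_NAME = 'scrapy_api_user'           # Shopware backend user with API access
SHOPWARE_API_BASE = 'http://shopware.local/api'     # base URL of the REST API
SHOPWARE_TOKEN_KEY = 'replace-with-api-key'         # that user's API key (digest auth password)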