class JsonEncoderTestCase(unittest.TestCase):

    def setUp(self):
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"
        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds])]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)
class KafkaPipeline(object):

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        # Key is null, value is a JSON object, e.g.:
        # (null, {
        #     "category": "international",
        #     "title": "Its Charter Expired, Export-Import Bank Will Keep the Doors Open",
        #     "author": "By JACKIE CALMES",
        #     "spider": "NYtimes",
        #     "link": "http://www.nytimes.com/2015/07/01/business/international/though-charter-is-expiring-export-import-bank-will-keep-its-doors-open.html",
        #     "date": "June 30, 2015",
        #     "article": ["Advertisemen.."]
        # })
        self.producer.send(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        kafka_hosts = settings.get('SCRAPY_KAFKA_HOSTS')
        topic = settings['SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC']
        producer = KafkaProducer(bootstrap_servers=kafka_hosts)
        return cls(producer, topic)
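# Settings sketch for wiring up the KafkaPipeline above. The setting names are
# the ones its from_settings() reads; the pipeline path, hosts and topic are
# placeholder values, not taken from the original project.
ITEM_PIPELINES = {
    'myproject.pipelines.KafkaPipeline': 300,
}
SCRAPY_KAFKA_HOSTS = ['localhost:9092']
SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC = 'scraped-items'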
class RedisPipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        ip_key = self.item_ip_key(item, spider)
        data = self.encoder.encode(item)
        self.server.lpush(key, data)
        self.server.lpush(ip_key, data)
        return item

    def item_key(self, item, spider):
        return "%s:items:all" % spider.name

    def item_ip_key(self, item, spider):
        return "%s:items:%s" % (spider.name, item['ip'])
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, host, port, queue_type):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()
        self.queue_type = queue_type

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        queue_type = settings.get('QUEUE_TYPE', 'FIFO')
        return cls(host, port, queue_type)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        # LIFO pushes and pops from the same end of the list; FIFO pushes to the tail.
        if self.queue_type == 'LIFO':
            self.server.lpush(key, data)
        else:
            self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, host, port):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        return cls(host, port)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
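# Settings sketch for the plain Redis pipelines above. REDIS_HOST and
# REDIS_PORT are the keys read by from_settings(); the pipeline path is a
# placeholder.
ITEM_PIPELINES = {
    'myproject.pipelines.RedisPipeline': 300,
}
REDIS_HOST = 'localhost'
REDIS_PORT = 6379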
class DockerhubExtension(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        self.job_path = crawler.settings.get('JOB_PATH')
        if not self.job_path:
            raise NotConfigured('no JOB_PATH set')
        self.json_encoder = ScrapyJSONEncoder()
        self.looping_call = LoopingCall(self.store_job_info)
        self.looping_call.start(5)
        crawler.signals.connect(self.store_job_info,
                                signal=signals.spider_closed)

    def store_job_info(self):
        with open(self.job_path, 'w') as f:
            stats = self.crawler.stats.get_stats()
            job_info = {
                'stats': stats,
            }
            job_info_json = self.json_encoder.encode(job_info)
            f.write(job_info_json)
class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server, exchange_name):
        self.server = server
        self.exchange_name = exchange_name
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server, redis_server = connection.from_settings(settings)
        exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
        return cls(server, exchange_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange=self.exchange_name,
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
class RabbitMQItemPublisherPipeline(object):

    def __init__(self, connect_url, exchange_name, routing_key, queue_name):
        self.connect_url = connect_url
        self.connection = RabbitMQConnection(connect_url,
                                             exchange_name=exchange_name,
                                             routing_key=routing_key,
                                             queue_name=queue_name)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            connect_url=crawler.settings.get("RABBITMQ_RESULT_URI"),
            exchange_name=crawler.settings.get("RABBITMQ_RESULT_EXCHANGE"),
            routing_key=crawler.settings.get("RABBITMQ_RESULT_ROUTING_KEY"),
            queue_name=crawler.settings.get("RABBITMQ_RESULT_QUEUE"),
        )

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        data = self.encoder.encode(item)
        self.connection.publish(
            body=data,
            headers={'model': item.get('model', None)},
            routing_key=getattr(spider, 'amqp_result_routing_key', None))
        return item
class RedisPipeline(object):
    """
    Pushes serialized item into a redis. Specific for SocialSpiders
    """

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.set(key, data.decode('utf-8'))
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "{}_{}".format(spider.name, item['search_name'])
class ScrapyLearningPipeline(object):

    def __init__(self):
        kafka_ip_port = settings.KAFKA_IP_PORT
        kafka_topic = settings.KAFKA_TOPIC
        if len(kafka_ip_port) == 1:
            kafka_ip_port = kafka_ip_port[0]
        elif isinstance(kafka_ip_port, list):
            kafka_ip_port = ",".join(kafka_ip_port)
        self._client = KafkaClient(hosts=kafka_ip_port)
        self._producer = self._client.topics[
            kafka_topic.encode(encoding="UTF-8")].get_producer()
        self._encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self._encoder.encode(item)
        print(msg)
        self._producer.produce(msg.encode(encoding="UTF-8"))
        # self._producer.produce(item['url'].encode(encoding="UTF-8"))
        return item

    def close_spider(self, spider):
        self._producer.stop()
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # There is a small difference between the behaviour of JsonItemExporter.indent
        # and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to
        # prevent the addition of newlines everywhere.
        json_indent = self.indent if self.indent is not None and self.indent > 0 else None
        kwargs.setdefault('indent', json_indent)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def _beautify_newline(self):
        if self.indent is not None:
            self.file.write(b'\n')

    def start_exporting(self):
        self.file.write(b"[")
        self._beautify_newline()

    def finish_exporting(self):
        self._beautify_newline()
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
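# Minimal usage sketch for the JsonItemExporter above: it expects a file-like
# object opened in binary mode. The file name and item are placeholders, and
# passing a plain dict assumes a Scrapy version whose BaseItemExporter accepts
# dict items.
with open('items.json', 'wb') as f:
    exporter = JsonItemExporter(f, indent=2)
    exporter.start_exporting()
    exporter.export_item({'title': 'example', 'price': '9.99'})
    exporter.finish_exporting()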
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server, exchange_name):
        self.server = server
        self.exchange_name = exchange_name
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = from_settings(settings)
        exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
        return cls(server, exchange_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange=self.exchange_name,
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
class RedisPipeline(object):
    """Pushes serialized item into a scrapy_redis list/queue"""

    def __init__(self, host, port):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        host = settings.get("REDIS_HOST", "localhost")
        port = settings.get("REDIS_PORT", 6379)
        return cls(host, port)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns scrapy_redis key based on given spider"""
        return "%s:items" % spider.name
class RabbitMQPipeline(object):
    """Pushes serialized item into a RabbitMQ list/queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange='',
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns RabbitMQ key based on given spider"""
        return "%s:items" % spider.name
class APICallerPipeline(object):

    def __init__(self):
        pass

    def open_spider(self, spider):
        self.encoder = ScrapyJSONEncoder()

    def close_spider(self, spider):
        return True

    def process_item(self, item, spider):
        url = "http://127.0.0.1:5000/books"
        headers = {'Content-Type': 'application/json'}
        response_data = requests.request("POST", url,
                                         data=self.encoder.encode(item),
                                         headers=headers)
        if response_data.status_code == 201:
            print("The book '{}' has been saved into the database".format(
                item['title']))
        return item

    def checkIfAlreadyExists(self, item):
        pass
class HadoopExporter(BaseItemExporter):

    def __init__(self, hadoop, **kwargs):
        # self.con = file_write.Connection()
        # self.con.connect(hadoop.ip, hadoop.port)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        # self.seq = file_write.SeqFileSaver(self.con,
        #     '/common/crawler/%s/' % hadoop.username.replace(".", "/"),
        #     1, '%s' % hadoop.username.replace(".", "_"))
        self.encoding = 'utf-8'
        self.fields_to_export = None
        self.export_empty_fields = False
        self.writer = SeqWriter(
            os.path.join(Utils.settings['SEQFILE_DIR'],
                         hadoop.username.replace(".", "/")),
            hadoop.username.replace(".", "_"))

    def close_file(self):
        print("close")
        self.writer.close()
        # self.seq.set_is_end()
        # self.con.close()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass

    def export_item(self, item):
        value = self.encoder.encode(dict(self._get_serialized_fields(item)))
        self.writer.writeData(
            item['key'] if 'key' in item else item['url'],
            value
        )
class SQLitePipeline(object):
    """Pushes serialized item into a SQLite table"""

    def __init__(self, conn):
        self.conn = conn
        self.encoder = ScrapyJSONEncoder()
        # TODO: ensure table exists
        # c.execute('CREATE TABLE ? (data TEXT NULL)', (table,))

    @classmethod
    def from_crawler(cls, crawler):
        conn = connection.from_crawler(crawler)
        return cls(conn)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        table = self.item_table(item, spider)
        data = self.encoder.encode(item)
        self.conn.execute('INSERT INTO "%s" VALUES (?)' % table, (data, ))
        return item

    def item_table(self, item, spider):
        """Returns SQLite table name based on given spider"""
        return "%s_items" % spider.name
class RedisStoragePipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.encoder.encode(item)
        if isinstance(item, GubaPostListItem):
            key = self.item_key_list(item, spider)
        if isinstance(item, GubaPostDetailItem):
            key = self.item_key_detail(item, spider)
        self.server.rpush(key, data)
        return item

    def item_key_list(self, item, spider):
        stock_id = item['stock_id']
        return "%s:list_items" % stock_id

    def item_key_detail(self, item, spider):
        stock_id = item['stock_id']
        return "%s:detail_items" % stock_id
class RestApiExporter(BaseItemExporter):

    def __init__(self, api_url, api_key, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.api_url = api_url
        self.headers = {"x-api-key": api_key}
        self._kwargs.setdefault("ensure_ascii", not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)

    def start_exporting(self):
        logger.debug(f"Start exporting to {self.api_url}")

    def finish_exporting(self):
        logger.debug("Done exporting")

    def export_item(self, item):
        item_dict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(item_dict) + "\n"
        logger.debug(f"PUT {self.api_url} - {data}")
        try:
            r = requests.put(url=self.api_url, headers=self.headers, data=data)
            logger.debug(f"Response: {r.headers}")
            r.raise_for_status()
        except requests.exceptions.HTTPError as err:
            logger.error(err)
class RedisPipeline(object):

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        return "%s:items" % spider.name
class KafkaPipeline:
    stats_name = 'KafkaPipeline'

    def __init__(self, settings, stats):
        from pykafka.client import KafkaClient
        self.stats = stats
        self.settings = settings
        self.encoder = ScrapyJSONEncoder()
        self.kafka = KafkaClient(hosts=self.settings.get('KAFKA_HOST') + ":" +
                                 str(self.settings.get('KAFKA_PORT')))
        self.producer = self.kafka.topics[
            self.settings['KAFKA_TOPIC']].get_sync_producer(
                min_queued_messages=1)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler.stats)

    def process_item(self, item, spider):
        itemval = item if isinstance(item, dict) else dict(item)
        itemval['spider'] = spider.name
        msg = self.encoder.encode(itemval)
        self.producer.produce(msg)
        self.stats.inc_value('{}/produce'.format(self.stats_name),
                             spider=spider)
        logger.msg("Item sent to Kafka", logger.DEBUG)
        return itemval
class KafkaPipeline(object):
    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode
    """

    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()
        self.tmp_list = []
        self.time_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.agent_ip_port = self.getHostIp()

    def process_item(self, item, spider):
        """
        Overridden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        item = dict(item)
        item['time_str'] = self.time_str
        item['agent_ip_port'] = self.getHostIp()
        item_title = item['item_title']
        # logger.info(item_title)
        if item_title:
            print(item_title)
        msg = self.encoder.encode(item)
        # msg = msg.encode('utf-8')
        # self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', '127.0.0.1:9092')
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'data-topic')
        client = SimpleClient(k_hosts)
        producer = SimpleProducer(client)
        return cls(producer, topic)

    def getHostIp(self):
        try:
            hostName = socket.getfqdn(socket.gethostname())
            hostAddr = socket.gethostbyname(hostName)
        finally:
            pass
        return hostAddr
class RabbitMQItemPublisherPipeline(object):

    def __init__(self, host, port, user, password, virtual_host, exchange,
                 routing_key, queue):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.virtual_host = virtual_host
        credentials = pika.PlainCredentials(self.user, self.password)
        parameters = pika.ConnectionParameters(self.host, self.port,
                                               self.virtual_host, credentials)
        # Connecting to RabbitMQ
        self.connection = pika.BlockingConnection(parameters=parameters)
        self.channel = self.connection.channel()
        self.exchange = exchange
        self.routing_key = routing_key
        self.queue = queue
        # Declaring RabbitMQ exchange
        self.channel.exchange_declare(exchange=exchange,
                                      exchange_type="direct",
                                      durable=True)
        # Declaring RabbitMQ queue
        self.channel.queue_declare(queue=queue, durable=True)
        # Binding exchange + routing_key = queue
        self.channel.queue_bind(exchange=exchange,
                                routing_key=routing_key,
                                queue=queue)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_crawler(cls, crawler):
        # Creating a RabbitMQItemPublisherPipeline
        return cls(
            host=crawler.settings.get("RABBITMQ_HOST"),
            port=crawler.settings.get("RABBITMQ_PORT"),
            user=crawler.settings.get("RABBITMQ_USER"),
            password=crawler.settings.get("RABBITMQ_PASSWORD"),
            virtual_host=crawler.settings.get("RABBITMQ_VIRTUAL_HOST"),
            exchange=crawler.settings.get("RABBITMQ_EXCHANGE"),
            routing_key=crawler.settings.get("RABBITMQ_ROUTING_KEY"),
            queue=crawler.settings.get("RABBITMQ_QUEUE"),
        )

    def close_spider(self, spider):
        # Closing RabbitMQ channel and connection
        self.channel.close()
        self.connection.close()

    def process_item(self, item, spider):
        # Encoding item dict using Scrapy JSON Encoder
        data = self.encoder.encode(item)
        # Publishing item to exchange + routing_key = queue
        self.channel.basic_publish(
            exchange=self.exchange,
            routing_key=self.routing_key,
            body=data,
        )
        # Returning item to be processed
        return item
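# Settings sketch matching the RABBITMQ_* keys read by from_crawler() above;
# the values are placeholders for a local broker with default credentials.
RABBITMQ_HOST = 'localhost'
RABBITMQ_PORT = 5672
RABBITMQ_USER = 'guest'
RABBITMQ_PASSWORD = 'guest'
RABBITMQ_VIRTUAL_HOST = '/'
RABBITMQ_EXCHANGE = 'scrapy'
RABBITMQ_ROUTING_KEY = 'items'
RABBITMQ_QUEUE = 'scrapy_items'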
def process_item(self, item, spider):
    url = "http://localhost:9200/articles/%s" % (item["publication"].lower())
    encoder = ScrapyJSONEncoder()
    json_body = encoder.encode(item)
    resp = requests.post(url, data=json_body)
    log.msg("Item added to elasticSearch node. Response: " + resp.text)
    return item
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict) + "\n"))
def _persist_stats(self, stats, spider):
    encoder = ScrapyJSONEncoder()
    with open("stats.json", "w") as file:
        data = encoder.encode(stats)
        # simplejson.dump(data, file, indent=4)
        file.write(
            simplejson.dumps(simplejson.loads(data), indent=4, sort_keys=True))
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + '\n')
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
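# Usage sketch for the JSON-lines exporters above: one JSON object per line,
# written to a file opened in binary mode. The file name and item are
# placeholders; passing a plain dict assumes a Scrapy version that accepts it.
with open('items.jl', 'wb') as f:
    exporter = JsonLinesItemExporter(f)
    exporter.start_exporting()
    exporter.export_item({'title': 'example'})
    exporter.finish_exporting()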
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault("ensure_ascii", not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + "\n"
        self.file.write(to_bytes(data, self.encoding))
class LetMeShopApiExporter(BaseItemExporter):
    api_end_point = ''
    method = 'POST'

    def __init__(self, api_base_url, auth_token, *args, **kwargs):
        super(LetMeShopApiExporter, self).__init__(*args,
                                                   export_empty_fields=True,
                                                   **kwargs)
        self.api_base_url = api_base_url
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.headers = {'Authorization': 'Token %s' % auth_token}

    def _fill_missing_fields(self, item, default_value=None):
        if self.fields_to_export is None:
            missing_keys = frozenset(item.fields.iterkeys()).difference(item.iterkeys())
        else:
            missing_keys = frozenset(self.fields_to_export).difference(item.iterkeys())
        for missing_key in missing_keys:
            item[missing_key] = item.fields[missing_key].get('default_value',
                                                             default_value)
        return item

    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        if include_empty is None:
            include_empty = self.export_empty_fields
        if include_empty:
            item = self._fill_missing_fields(item, default_value)
        return super(LetMeShopApiExporter, self)._get_serialized_fields(
            item, default_value, include_empty)

    @property
    def request_url(self):
        return urljoin(self.api_base_url, self.api_end_point)

    def export_item(self, item_or_items):
        if isinstance(item_or_items, (list, tuple)):
            item_list = item_or_items
            serialized = [dict(self._get_serialized_fields(item)) for item in item_list]
        else:
            item = item_or_items
            serialized = dict(self._get_serialized_fields(item))
        serialized = snake_case_to_camel_case(serialized)
        payload = self.encoder.encode(serialized)
        r = requests.request(self.method, self.request_url, data=payload,
                             headers=self.headers)
        r.raise_for_status()

    def start_exporting(self):
        pass

    def finish_exporting(self):
        pass
class KafkaPipeline(object):
    """
    Publishes a serialized item into a Kafka topic

    :param producer: The Kafka producer
    :type producer: kafka.producer.Producer

    :param topic: The Kafka topic being used
    :type topic: str or unicode
    """

    def __init__(self, producer, topic):
        """
        :type producer: kafka.producer.Producer
        :type topic: str or unicode
        """
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        """
        Overridden method to process the item

        :param item: Item being passed
        :type item: scrapy.item.Item

        :param spider: The current spider being used
        :type spider: scrapy.spider.Spider
        """
        # put spider name in item
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_messages(self.topic, msg)

    @classmethod
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings

        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC',
                             'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        conn = SimpleProducer(kafka)
        return cls(conn, topic)
def insert_feed_job(self, session, job_type, feed, offset):
    job_count = session.query(CrawlerJobs).filter(
        CrawlerJobs.type == job_type,
        CrawlerJobs.feed_id == feed.id,
        CrawlerJobs.started_at == None).count()
    if job_count:
        return
    job = self.clone_job()
    job.type = job_type
    job.priority = 100
    job.target = feed.feed_url
    job.feed_id = feed.id
    encoder = ScrapyJSONEncoder()
    job.instruction = encoder.encode({'feed_id': feed.id, 'offset': offset})
    session.add(job)
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    """
    Allows exporting to JSON directly as Unicode.
    """

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        kwargs["ensure_ascii"] = False
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + u"\n")

    def serialize_field(self, field, name, value):
        return value  # DON'T call super version, this encodes the Unicode.
class JsonLinesItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        # ScrapyJSONEncoder handles the data types that plain JSON cannot,
        # such as datetimes and sets.
        self.encoder = ScrapyJSONEncoder(**self._kwargs)

    def export_item(self, item):
        # Serialize the selected fields and turn them into a dict.
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict) + '\n'
        # Write the line to the file.
        self.file.write(to_bytes(data, self.encoding))
class JsonLinesItemSplitFileExporter(BaseItemExporter):
    """An item exporter to organize json lines into separate folders.

    Attributes:
        _configure (func): Used to configure the Item Exporter by setting the
            options dictionary.
        encoder (ScrapyJSONEncoder): Encoder used to convert scrapy items into
            a json format line.
    """

    def __init__(self, **kwargs):
        """Initialize the configuration dictionary and encoder.

        Args:
            **kwargs: Arbitrary keyword arguments for the options dictionary.
        """
        # If dont_fail is set, it won't raise an exception on unexpected options
        self._configure(kwargs, dont_fail=True)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder()
        super(JsonLinesItemSplitFileExporter, self).__init__()

    def export_item(self, item):
        """Export Scrapy items to specific files based on the article_type.

        Args:
            item (scrapy.Item): A Scrapy item that contains the complete
                scraped information for an article/product.
        """
        # Serialize the item, and perform encoding to create a python dictionary
        item_dict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(item_dict) + os.linesep
        # If there is only one entry in article_type, the path (folders) is just
        # scraped_data/spider.name/article_type. Otherwise all article_type
        # entries except the last are combined into a path, such as
        # scraped_data/spider.name/article_type[0]/article_type[1], and the item
        # becomes a json line in
        # scraped_data/spider.name/article_type[0]/article_type[1]/article_type[2].jl.
        if len(item['article_type']) == 1:
            path = os.path.join("scraped_data", item["spider_name"])
            item_path = os.path.join(path, item['article_type'][0]) + ".jl"
        else:
            path = os.path.join(
                os.path.join("scraped_data", item["spider_name"]),
                (os.path.join(*item['article_type'][:-1])))
            item_path = os.path.join(path, item['article_type'][-1]) + ".jl"
        if not os.path.exists(path):
            os.makedirs(path)
        # Write in append and byte mode
        open(item_path, 'a+b').write(to_bytes(data, self.encoding))
async def get_quotes(request, modo, orden, query):
    # CORS
    request.setHeader('Access-Control-Allow-Origin', '*')
    request.setHeader('Access-Control-Allow-Methods', 'GET')
    request.setHeader('Access-Control-Allow-Headers',
                      'x-prototype-version,x-requested-with')
    request.setHeader('Access-Control-Max-Age', "2520")
    runner = SpiderRunner()
    output_data = []
    filtros = None
    rango_minimo, rango_maximo = 0, math.inf
    sitios_a_buscar = SITES_TO_SEARCH
    if b"filtro" in request.args:
        filtros = [filtro.decode("utf-8") for filtro in request.args[b"filtro"]]
    if b"rango" in request.args:
        rango_minimo, rango_maximo = request.args[b"rango"]
        rango_minimo = int(rango_minimo.decode("utf-8"))
        rango_maximo = int(rango_maximo.decode("utf-8"))
        rango_maximo = math.inf if rango_maximo == -1 else rango_maximo
    if filtros is not None:
        sitios_a_buscar = [
            sitio for sitio in sitios_a_buscar if sitio not in filtros]
    _encoder = ScrapyJSONEncoder(ensure_ascii=True)
    for site in sitios_a_buscar:
        if site == "steampowered":
            results = await runner.crawl(SteamSpider, modo=modo, query=query,
                                         url_search=CONFIG_SITE[site]["url_search"],
                                         rango=(rango_minimo, rango_maximo))
        elif site == "nuuvem":
            results = await runner.crawl(NuuvemSpider, modo=modo, query=query,
                                         url_search=CONFIG_SITE[site]["url_search"],
                                         rango=(rango_minimo, rango_maximo))
        elif site == "gog":
            results = await runner.crawl(GOGSpider, modo=modo, query=query,
                                         url_search=CONFIG_SITE[site]["url_search"],
                                         rango=(rango_minimo, rango_maximo))
        elif site == "gamesplanet":
            results = await runner.crawl(GamesPlantetSpider, modo=modo, query=query,
                                         url_search=CONFIG_SITE[site]["url_search"],
                                         rango=(rango_minimo, rango_maximo))
        output = return_spider_output(results, output_data, site)
        output_data = output
    if orden != "default":
        if orden != "relevancia":
            tipo_orden, indice_orden = orden.split("_")
        else:
            tipo_orden = orden
            indice_orden = ""
        output_data = sort_data(output_data, tipo_orden, indice_orden, query)
    return _encoder.encode(output_data)
class KafkaPipeline(object):

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        item = dict(item)
        item['spider'] = spider.name
        msg = self.encoder.encode(item)
        self.producer.send_message(self.topic, msg)
        return item

    @classmethod
    def from_settings(cls, settings):
        # Sketch of a body for this classmethod: the setting names and producer
        # construction are assumed to follow the other Kafka pipelines in this
        # collection, not taken from the original source.
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC',
                             'scrapy_kafka_item')
        kafka = KafkaClient(k_hosts)
        producer = SimpleProducer(kafka)
        return cls(producer, topic)
class SortedJsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('indent', 4)
        self._kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**self._kwargs)
        self.items = []

    def export_item(self, item):
        self.items.append(dict(self._get_serialized_fields(item)))

    def finish_exporting(self):
        data = self.encoder.encode(sorted(self.items, key=sort_key))
        self.file.write(to_bytes(data, self.encoding))
class AddItemPipeline(object):
    """
    Pushes serialized item into a RQ
    """

    def __init__(self, host, port, db, queue_name, store_id):
        self.encoder = ScrapyJSONEncoder()
        self.store_id = store_id
        self.queue_name = queue_name
        self.server = redis.Redis(host, port, db)
        self.queue = rq.Queue(queue_name, connection=self.server)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        db = settings.get('REDIS_DB', 0)
        queue_name = settings.get('RQ_QUEUE', 'default')
        store_id = int(settings.get('STORE', 0))
        return cls(host, port, db, queue_name, store_id)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        # get global Store URL mapping
        store_id = self.store_id
        if store_id == 0:
            raise DropItem('Not set the store and no Store URL mapping')
        # assign queue parameters
        item['store_id'] = store_id
        callback = 'worker.save_product_to_db'
        event = self.encoder.encode(dict(queue=self.queue_name,
                                         value=item,
                                         time=time.time()))
        # push item to redis queue
        self.queue.enqueue(callback, event)
        return item
class SendToBrokerPipeline(object):

    def __init__(self):
        self.publisher = Publisher('data_distributor')
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        # Run the broker send in a separate thread to prevent it from blocking
        # on single items.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        item_dict = dict(item)
        data = self.encoder.encode(item_dict)
        self.publisher.send_message(data, 'articles')
        return item
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[")

    def finish_exporting(self):
        self.file.write(b"]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b",\n")
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(self.encoder.encode(itemdict)))
class JsonItemExporter(JsonLinesItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write("[")

    def finish_exporting(self):
        self.file.write("]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(',\n')
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict))
def parse(self, response):
    items = []
    for row in response.xpath('//table[@class="TblDataRecs"]/tr'):
        item = mylanguageexchangeItem()
        name = row.xpath('td[@class="userdata"]//a//b/text()').extract()
        item["name"] = [x.strip() for x in name]
        country = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[1]/td/text()').extract()
        item["country"] = [x.strip() for x in country]
        city = row.xpath('td[@class="userdata"][@data-th="Country(City)"]//tr[2]/td/text()').extract()
        item["city"] = [x.strip().strip('()') for x in city]
        native = row.xpath('td[@class="userdata"][@data-th="Native Language"]//td/text()').extract()
        item["native"] = [x.strip() for x in native]
        practicing = row.xpath('td[@class="userdata"][@data-th="Practicing Language"]//td/text()').extract()
        item["practicing"] = [x.strip() for x in practicing]
        desc = row.xpath('td[@class="userdata"][@data-th="Description"]//td/text()').extract()
        item["desc"] = [x.strip() for x in desc]
        items.append(item)
    _encoder = ScrapyJSONEncoder()
    with open('mylanguageexchange_crawled.json', 'w') as outfile:
        outfile.write(_encoder.encode(items))
class JsonItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        self.first_item = True

    def start_exporting(self):
        self.file.write(b"[\n")

    def finish_exporting(self):
        self.file.write(b"\n]")

    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        itemdict = dict(self._get_serialized_fields(item))
        data = self.encoder.encode(itemdict)
        self.file.write(to_bytes(data, self.encoding))
class StatusMailer(object):

    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        self.files = defaultdict(compressor)
        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')
        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured
        if not recipients:
            raise NotConfigured
        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)
        crawler.signals.connect(instance.item_scraped,
                                signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error,
                                signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed,
                                signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received,
                                signal=signals.request_received)
        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        files = []
        for name, compressed in self.files.items():
            files.append((name + compressed.extension,
                          compressed.mimetype, compressed))
        try:
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0
        body = '''Crawl statistics:

 - Spider name: {0}
 - Spider finished at: {1}
 - Number of items scraped: {2}
 - Number of errors: {3}
 - Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )
        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
class ShopwareRESTPipeline(object):

    def __init__(self):
        """
        Connect to the Shopware REST API using HTTP digest authentication.
        We need an ADMIN role with sufficient access to insert articles.
        Shopware4 (german) API Guide: http://wiki.shopware.de/_detail_861_487.html
        """
        self.name = settings['SHOPWARE_SERVICE_NAME']
        self.api_url = settings['SHOPWARE_API_BASE']
        self.access_token = settings['SHOPWARE_TOKEN_KEY']
        self.request_headers = {'Content-Type': 'application/json; charset=utf-8',
                                'Accept': 'application/json'}
        self.encoder = ScrapyJSONEncoder()
        self.node = {}
        # shopware minimal default item
        self.default_item = RestItem({
            'taxId': 1,
            # 'tax': 19,
            'name': 'nexus',
            'mainDetail': {
                'number': 'nex24',
                'prices': [{
                    'customerGroupKey': 'EK',
                    'basePrice': 16,
                    'price': 20,  # shop will add VAT (if configured that way)
                }],
                # 'attribute': {
                #     'supplier_url': 'http://example.net',
                #     'supplierUrl': 'http://example.net',
                #     # 'attr19': 'http://example.net',
                # },
            },
            'active': True,
            'supplier': 'example com',
            'categories': [
                {'id': 5},
                {'id': 3},
            ],
            'images': [{
                # 'id': '1',  # this one is bugged in shopware (doesnt add image to article)
                # 'mediaId': '1',  # needs deduplication on update
                'link': 'http://shopware.local/templates/_emotion/frontend/_resources/images/logo.jpg',
            }],
            'attribute': {
                'attr19': 'http://example.net',
            },
            'description': 'Some Article',
            'descriptionLong': 'Some Article Description',
        })

    def open_spider(self, spider):
        # TODO: test with async / multiple spiders open
        # (attach session to spider? self.session[spider])
        # token = self.access_token, self.access_token_secret
        # self.session = self.oauth.get_session(token=token)
        self.requests = requests.Session()
        self.requests.auth = HTTPDigestAuth(self.name, self.access_token)

        # category mapping
        self.node[spider] = {}
        self.root = 4  # our! root category # self.SHOPWARE_ROOT_NODE
        r = self.requests.get(
            '%s/categories' % (self.api_url),
            headers=self.request_headers,
            allow_redirects=False,
        )
        if r.status_code != 200:
            raise CloseSpider('API not available')
        resp = json.loads(r.content)
        if resp['success'] != True:
            raise CloseSpider('API returned failure')
        ident = spider.name
        if spider.ident:
            ident = spider.ident
        for cat in resp['data']:
            if cat['parentId'] != self.root:
                continue
            if cat['name'] == ident:
                self.node[spider] = cat
        if 'name' not in self.node[spider] or ident not in self.node[spider]['name']:
            # create spider category node
            # description = '[SCRAPER LOG] Category created at: %s' % datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            payload = {'name': ident, 'active': False, 'parentId': self.root}
            r = self.requests.post('%s/categories' % (self.api_url),
                                   headers=self.request_headers,
                                   data=json.dumps(payload))
            resp = json.loads(r.content)
            r = self.requests.get(resp['data']['location'],
                                  headers=self.request_headers)
            resp = json.loads(r.content)
            self.node[spider] = resp['data']
            # print self.node[spider]['id']

        # starting run...
        # description = '[SCRAPER LOG] Starting crawler run at: %s\n' % datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        # description += self.api.catalog_category.info(self.node[spider].get('category_id'), 0, ['description']).get('description')
        # self.api.catalog_category.update(self.node[spider].get('category_id'), {'description': description, 'available_sort_by': 'name'})

    def close_spider(self, spider):
        # TODO: test with async / multiple spiders open
        # self.session.close()
        # finished run
        # description = self.api.catalog_category.info(self.node[spider].get('category_id'), 0, ['description']).get('description')
        # description += '\n[SCRAPER] Finished crawler run at: %s' % datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        # #description += '\n[SCRAPER] -- Run statistics:\n%s' % crawler_output
        # self.api.catalog_category.update(self.node[spider].get('category_id'), {'description': description, 'available_sort_by': 'name'})
        if self.node[spider]:
            del self.node[spider]

    # def create_item(self, item, spider):
    # def update_item(self, item, spider):

    def process_item(self, item, spider):
        """
        Push scraped items into Shopware.
        Do not go over CsvItemExporter, do not collect direct-sql speedups.
        """
        # try insert first. if it fails because it exists (check error msg), do an update on :id
        # (todo) try to extend magento api to allow 'upserts' (POST create _or_ update if exists)
        itemdict = dict(self._get_serialized_fields(self.default_item))
        # itemdict = dict(self._get_serialized_fields(dict(item.items() + self.default_item.items())))
        payload = self.encoder.encode(itemdict)
        print payload
        # set proxies, user-agent?
        try:
            log.msg('API CALL: updating product', spider=spider, level=log.DEBUG)
            r = self.requests.put(
                '%s/articles/%s?useNumberAsId=1' % (self.api_url, itemdict['mainDetail']['number']),  # API Endpoint
                headers=self.request_headers,
                # proxies=proxies,
                allow_redirects=False,
                data=payload,
            )
            resp = r.content
            if r.status_code != 302:
                message = r.status_code
                resp = json.loads(r.content)
                if resp:
                    message = resp['message']
                else:
                    message = r.content
                raise Fault('{0}: Update failed: {1}'.format(self.__class__.__name__, message))
            log.msg('{0}: Product updated: {1}: {2}'.format(self.__class__.__name__,
                                                            itemdict['mainDetail']['number'],
                                                            resp),
                    spider=spider, level=log.INFO)
        except Fault as fault:
            # if fault.faultCode == 620:  # wrong
            # log.msg(fault.faultString, spider=spider, level=log.DEBUG)  # Update failed: Article by id nex21 not found
            log.msg('API CALL: inserting product', spider=spider, level=log.DEBUG)
            r = self.requests.post(
                '%s/articles?useNumberAsId=1' % (self.api_url),  # API Endpoint
                headers=self.request_headers,
                # proxies=proxies,
                allow_redirects=False,
                data=payload,
            )
            if r.status_code != 201 or json.loads(r.content) == None:
                message = r.status_code
                resp = r.content
                message = 'unknown error'
            resp = json.loads(r.content)
            # if r.status_code == 201:
            if resp['success'] == True:
                data = json.loads(r.content)
                message = resp['data']
            if resp['success'] == False:
                message = resp['message']
            # insert error checking here, Exception on errors
            log.msg('{0}: Product inserted: "{1}" as id {2} at {3}'.format(
                        self.__class__.__name__,
                        itemdict['mainDetail']['number'],
                        resp['data']['id'],
                        resp['data']['location']),
                    spider=spider, level=log.INFO)

            # non-blocking async version
            # f = self.requests.post
            # d = threads.deferToThread(f,
            #     '%s/articles' % self.api_url,  # API Endpoint
            #     headers=self.request_headers,
            #     allow_redirects=False,
            #     data=payload,
            # )  # (function, *args, **kwargs)
            # d.addCallback(self.persist_test, info)
            # d.addErrback(log.err, self.__class__.__name__ + '.image_downloaded')
            # r = requests.head(url)

        # messages:
        # {"success":true,"data":{"id":2,"location":"http:\/\/shopware.local\/api\/articles\/2"}}
        # {"success":false,"message":"Resource not found"}
        # {"success":false,"message":"Validation error","errors":["tax: This value should not be blank","mainDetail.number: This value should not be blank"]}
        # {"success":false,"message":"Tax by taxrate 1 not found"}
        # {"success":false,"message":"Errormesage: SQLSTATE[23000]: Integrity constraint violation: 1062 Duplicate entry 'sku' for key 'ordernumber'"}
        #   -> retry as PUT (update)
        # {"success":false,"message":"Customer Group by key not found"}
        return item

    export_empty_fields = False
    fields_to_export = None
    encoding = 'utf-8'

    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        """Return the fields to export as an iterable of tuples (name, serialized_value)"""
        if include_empty is None:
            include_empty = self.export_empty_fields
        if self.fields_to_export is None:
            if include_empty:
                field_iter = item.fields.iterkeys()
            else:
                field_iter = item.iterkeys()
        else:
            if include_empty:
                field_iter = self.fields_to_export
            else:
                nonempty_fields = set(item.keys())
                field_iter = (x for x in self.fields_to_export if x in nonempty_fields)
        for field_name in field_iter:
            if field_name in item:
                field = item.fields[field_name]
                value = self.serialize_field(field, field_name, item[field_name])
            else:
                value = default_value
            yield field_name, value

    def serialize_field(self, field, name, value):
        serializer = field.get('serializer', self._to_str_if_unicode)
        return serializer(value)

    def _to_str_if_unicode(self, value):
        return value.encode(self.encoding) if isinstance(value, unicode) else value