def test_export_binary(self):
    """binary=True must encode field names and values to UTF-8 bytes."""
    with catch_warnings():
        # The binary flag is deprecated; silence the warning for this test.
        filterwarnings('ignore', category=ScrapyDeprecationWarning)
        exporter = PythonItemExporter(binary=True)
        item = self.item_class(name='John\xa3', age='22')
        self.assertEqual(
            {b'name': b'John\xc2\xa3', b'age': b'22'},
            exporter.export_item(item),
        )
def __init__(self, crawler):
    """Bind the extension to the hubstorage ref and the item pipe writer."""
    self.hsref = hsref.hsref
    self.pipe_writer = pipe_writer
    self.crawler = crawler
    self._write_item = self.pipe_writer.write_item
    # https://github.com/scrapy/scrapy/commit/c76190d491fca9f35b6758bdc06c34d77f5d9be9
    if IS_PYTHON2:
        exporter_kwargs = {}
    else:
        exporter_kwargs = {'binary': False}
    with ignore_warnings(category=ScrapyDeprecationWarning):
        self.exporter = PythonItemExporter(**exporter_kwargs)
def __init__(self):
    """Prepare the OAuth2 session, endpoint URLs and item exporter."""
    base_url = settings.STATNUTS_URL
    self.client_id = settings.STATNUTS_CLIENT_ID
    self.client_secret = settings.STATNUTS_SECRET
    self.sn_store_url = base_url + 'scrap/teams/'
    self.token_url = base_url + 'o/token/'
    self.access_token = None
    client = LegacyApplicationClient(client_id=self.client_id)
    self.oauth = OAuth2Session(client=client)
    self.exporter = PythonItemExporter(binary=False)
class ScrapynutsPostTeamStatnutsPipeline(object):
    """Scrapy item pipeline that POSTs scraped team items to the Statnuts API.

    An OAuth2 access token is fetched lazily on the first item and reused
    for the rest of the crawl.
    """

    def __init__(self):
        self.client_id = settings.STATNUTS_CLIENT_ID
        self.client_secret = settings.STATNUTS_SECRET
        self.sn_store_url = settings.STATNUTS_URL + 'scrap/teams/'
        self.token_url = settings.STATNUTS_URL + 'o/token/'
        self.access_token = None  # fetched on first process_item() call
        self.oauth = OAuth2Session(client=LegacyApplicationClient(
            client_id=self.client_id))
        self.exporter = PythonItemExporter(binary=False)

    def _get_access_token(self):
        """Fetch an OAuth2 token via the resource-owner password grant.

        SECURITY NOTE(review): verify=False disables TLS certificate
        verification for the token request — confirm this is intentional.
        """
        token = self.oauth.fetch_token(
            token_url=self.token_url,
            client_id=self.client_id,
            verify=False,
            client_secret=self.client_secret,
            username='******',
            password='******')
        return token

    def process_item(self, item, spider):
        """Export *item* to a plain dict, POST it to Statnuts, and return it.

        Returning the item is required by the Scrapy pipeline contract;
        the original returned None, which would feed None into any later
        pipeline stage.
        """
        if self.access_token is None:
            self.access_token = self._get_access_token()
        item_json = self.exporter.export_item(item)
        team_name = item_json.get('name') + '/'
        self.oauth.post(urljoin(self.sn_store_url, team_name), json=item_json)
        print('Item stored with hash = %s' % item['name'])
        return item
class HubstorageExtension(object):
    """Extension to write scraped items to HubStorage"""

    def __init__(self, crawler):
        self.hsref = hsref.hsref
        self.pipe_writer = pipe_writer
        self.crawler = crawler
        self._write_item = self.pipe_writer.write_item
        # https://github.com/scrapy/scrapy/commit/c76190d491fca9f35b6758bdc06c34d77f5d9be9
        if IS_PYTHON2:
            exporter_kwargs = {}
        else:
            exporter_kwargs = {'binary': False}
        with ignore_warnings(category=ScrapyDeprecationWarning):
            self.exporter = PythonItemExporter(**exporter_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and hook it up to crawler signals."""
        ext = cls(crawler)
        crawler.signals.connect(ext.item_scraped, signals.item_scraped)
        crawler.signals.connect(ext.spider_closed, signals.spider_closed)
        return ext

    def item_scraped(self, item, spider):
        """Export a scraped item, tag its original type, and write it."""
        if not isinstance(item, (dict, BaseItem)):
            log.msg("Wrong item type: %s" % item, level=logging.ERROR)
            return
        item_type = type(item).__name__
        exported = self.exporter.export_item(item)
        exported.setdefault("_type", item_type)
        self._write_item(exported)

    def spider_closed(self, spider, reason):
        """Record the close reason as the job outcome."""
        self.pipe_writer.set_outcome(reason)
class HubstorageExtension(object):
    """Extension to write scraped items to HubStorage"""

    def __init__(self, crawler):
        self.hsref = hsref.hsref
        if not self.hsref.enabled:
            raise NotConfigured
        self.crawler = crawler
        job_items = self.hsref.job.items
        self._write_item = job_items.write
        self.exporter = PythonItemExporter(binary=False)
        log.msg("HubStorage: writing items to %s" % job_items.url)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and hook it up to crawler signals."""
        ext = cls(crawler)
        crawler.signals.connect(ext.item_scraped, signals.item_scraped)
        crawler.signals.connect(ext.spider_closed, signals.spider_closed)
        return ext

    def item_scraped(self, item, spider):
        """Export a scraped item, tag its original type, and write it."""
        item_type = type(item).__name__
        exported = self.exporter.export_item(item)
        exported.setdefault("_type", item_type)
        self._write_item(exported)

    def spider_closed(self, spider, reason):
        # flush item writer
        self.hsref.job.items.flush()
        # update outcome
        self.hsref.job.metadata.update(close_reason=reason)
        self.hsref.job.metadata.save()
def __init__(self, crawler):
    """Fail fast when hubstorage is disabled, then set up the item writer."""
    self.hsref = hsref.hsref
    if not self.hsref.enabled:
        raise NotConfigured
    self.crawler = crawler
    job_items = self.hsref.job.items
    self._write_item = job_items.write
    self.exporter = PythonItemExporter(binary=False)
    log.msg("HubStorage: writing items to %s" % job_items.url)
def spider_handler(latitude, longitude, max_number, q):
    """Crawl TripAdvisor restaurants near the coordinates and put the
    exported items (as a list) onto the queue *q*."""
    start_link = get_link_for_tripadvisor(latitude, longitude)
    collected = []
    exporter = PythonItemExporter(binary=False)

    def on_item_scraped(signal, sender, item, response, spider):
        # Collect every scraped item as a plain exported structure.
        collected.append(exporter.export_item(item))

    dispatcher.connect(on_item_scraped, signal=signals.item_scraped)
    crawler = CrawlerProcess({
        "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    })
    crawler.crawl(RestaurantSpider, start_url=start_link,
                  max_restaurants=max_number)
    crawler.start()
    q.put(collected)
class ScrapyKafkaTopicWriter(KafkaTopicWriter):
    """
    Kafka writer which knows how to handle Scrapy items: they are serialized
    to JSON, and "_id" field is used as Kafka key if present.
    """

    def __init__(self, *args, **kwargs):
        self._exporter = PythonItemExporter(binary=False)
        self._encoder = ScrapyJSONEncoder()
        # Default serializer can still be overridden by the caller.
        kwargs.setdefault('value_serializer', self._serialize_value)
        super(ScrapyKafkaTopicWriter, self).__init__(*args, **kwargs)

    def write_item(self, item):
        """Export the item and write it keyed by its "_id" (if any)."""
        return self.write(item.get('_id'), self._exporter.export_item(item))

    def _serialize_value(self, value):
        """JSON-encode a value to UTF-8 bytes for Kafka."""
        encoded = self._encoder.encode(value)
        return encoded.encode('utf8')
def _serialize(self, item, **kwargs):
    """Convert *item* to a plain Python structure via PythonItemExporter."""
    exporter = PythonItemExporter(binary=False, **kwargs)
    return exporter.export_item(item)
def __init__(self, *args, **kwargs):
    """Set up the exporter/encoder pair and install the default JSON
    value serializer unless the caller supplied their own."""
    self._exporter = PythonItemExporter(binary=False)
    self._encoder = ScrapyJSONEncoder()
    kwargs.setdefault('value_serializer', self._serialize_value)
    super(ScrapyKafkaTopicWriter, self).__init__(*args, **kwargs)
def test_export_binary(self):
    """binary=True must encode field names and values to UTF-8 bytes."""
    exporter = PythonItemExporter(binary=True)
    item = TestItem(name=u'John\xa3', age=u'22')
    self.assertEqual(
        {b'name': b'John\xc2\xa3', b'age': b'22'},
        exporter.export_item(item),
    )
def test_invalid_option(self):
    """PythonItemExporter must reject unknown keyword options with TypeError."""
    # assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
    # assertRaisesRegex is the supported spelling.
    with self.assertRaisesRegex(TypeError, "Unexpected options: invalid_option"):
        PythonItemExporter(invalid_option='something')
def _get_exporter(self, **kwargs):
    """Build a non-binary PythonItemExporter with any extra options applied."""
    options = kwargs
    return PythonItemExporter(binary=False, **options)
def test_export_binary(self):
    """binary=True must encode field names and values to UTF-8 bytes."""
    exporter = PythonItemExporter(binary=True)
    item = TestItem(name=u"John\xa3", age=u"22")
    self.assertEqual(
        {b"name": b"John\xc2\xa3", b"age": b"22"},
        exporter.export_item(item),
    )
class FileHandler(object):
    """Collects scraped candidate items per category and writes them to one
    json/csv file per category under ``eol_spider/<fmt>/<spider.name>/``.

    NOTE(review): ``data`` is a class-level (shared) dict, so all instances
    share the same state — confirm only one FileHandler is ever created.
    """

    # One entry per output file; 'header' tracks whether the csv header row
    # has been written yet.  'item', 'path' and 'f' are filled in at runtime.
    data = {
        "candidate_basic": {"header": None},
        "candidate_education": {"header": None},
        "candidate_research": {"header": None},
        "candidate_publications": {"header": None},
        "candidate_courses": {"header": None},
        "candidate_workexperience": {"header": None},
    }

    def __init__(self):
        self.json_exporter = PythonItemExporter()

    @staticmethod
    def generate_id(string):
        """Return the hex SHA-1 digest of *string*.

        BUG FIX: hashlib requires bytes — the original crashed on Python 3
        when given a str.  str input is now UTF-8 encoded first; bytes
        input hashes exactly as before.
        """
        if isinstance(string, str):
            string = string.encode('utf-8')
        sha1 = hashlib.sha1()
        sha1.update(string)
        return sha1.hexdigest()

    def cleanup_data(self, spider, fmt):
        """(Re)create the output directory for *spider* and open one file per
        category.  Files stay open until close() is called."""
        directory = "eol_spider/%s/%s" % (fmt, spider.name)
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.mkdir(directory)
        for key in self.data:
            entry = self.data[key]
            entry['path'] = "%s/%s.%s" % (directory, key, fmt)
            entry['f'] = open(entry['path'], "w+")

    def write(self, fmt="json"):
        """Write each category's pending item(s) in the given format, emitting
        the csv header row once per file."""
        for key in self.data:
            entry = self.data[key]
            items = entry["item"]
            if not isinstance(items, list):
                items = [items]
            for item in items:
                if fmt == "csv" and not entry["header"]:
                    header = self.build_csv_header(item)
                    entry["f"].write(header)
                    entry["header"] = header
                entry["f"].write(self.build_content(item, fmt))

    def build_content(self, item, fmt):
        """Serialize one item as a json line or a tab-separated csv row.

        Raises ValueError for an unknown format (the original fell through
        and crashed with UnboundLocalError).
        """
        if fmt == "json":
            return json.dumps(self.json_exporter.export_item(item)) + "\n"
        if fmt == "csv":
            return "\t".join(item[key] for key in item) + "\n"
        raise ValueError("Unknown format: %s" % fmt)

    @staticmethod
    def build_csv_header(item):
        """Tab-separated header row built from the item's field names."""
        return "\t".join(item) + "\n"

    def close(self):
        """Close every per-category output file."""
        for key in self.data.keys():
            self.data[key]["f"].close()
def __init__(self):
    # Exporter used to turn scraped items into plain Python structures
    # before json.dumps serialization.
    # NOTE(review): no binary= argument is passed here, unlike sibling
    # snippets that pass binary=False — confirm the intended output mode
    # for the Scrapy version in use.
    self.json_exporter = PythonItemExporter()
def _get_exporter(self, **kwargs):
    """Create a PythonItemExporter configured with the given options."""
    exporter = PythonItemExporter(**kwargs)
    return exporter
def test_export_binary(self):
    """binary=True must encode field names and values to UTF-8 bytes."""
    exporter = PythonItemExporter(binary=True)
    item = self.item_class(name="John\xa3", age="22")
    self.assertEqual(
        {b"name": b"John\xc2\xa3", b"age": b"22"},
        exporter.export_item(item),
    )