class PickleExportPipeline(object):
    """Item pipeline that pickles every processed item to ``<spider.name>.pickle``.

    The output file is opened when the ``spider_opened`` signal fires and
    closed when ``spider_closed`` fires. Open file handles are tracked per
    spider in ``self.files``; a single exporter is kept in ``self.exporter``.
    """

    def __init__(self):
        # No exporter exists until a spider has been opened.
        self.exporter = None
        # Maps spider -> open file object.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it up to the crawler's open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the spider's pickle file and start an export on it."""
        output_path = '%s.pickle' % spider.name
        handle = self.files[spider] = open(output_path, 'w+b')
        self.exporter = PickleItemExporter(handle)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export, then close and forget the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Hand ``item`` to the exporter and return it unchanged."""
        self.exporter.export_item(item)
        return item
class PickleExporterPipeline(object):
    """Item pipeline that pickles items into the file named by ``file_name``.

    ``from_crawler`` reads the file name from the ``PICKLE_PATH`` setting.
    The file is opened in ``open_spider`` and closed in ``close_spider``.
    """

    def __init__(self, file_name):
        # The handle stays None until open_spider() runs.
        self.file_handle = None
        # Path of the output file.
        self.file_name = file_name

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline using the crawler's ``PICKLE_PATH`` setting."""
        pickle_path = crawler.settings.get('PICKLE_PATH')
        return cls(file_name=pickle_path)

    def open_spider(self, spider):
        """Open the output file for binary writing and start the export."""
        print('Custom export opened')
        self.file_handle = open(self.file_name, 'wb')
        self.exporter = PickleItemExporter(self.file_handle)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        print('Custom Exporter closed')
        self.exporter.finish_exporting()
        self.file_handle.close()

    def process_item(self, item, spider):
        """Pass ``item`` to the PickleItemExporter and return it unchanged."""
        self.exporter.export_item(item)
        return item
def test_nonstring_types_item(self):
    """Exporting the non-string-types item and unpickling the output yields an equal item."""
    expected = self._get_nonstring_types_item()
    buffer = BytesIO()

    exporter = PickleItemExporter(buffer)
    exporter.start_exporting()
    exporter.export_item(expected)
    exporter.finish_exporting()

    restored = pickle.loads(buffer.getvalue())
    self.assertEqual(restored, expected)
def test_export_multiple_items(self):
    """Two exported items can be read back, in order, with successive ``pickle.load`` calls."""
    first = TestItem(name='hello', age='world')
    second = TestItem(name='bye', age='world')
    stream = BytesIO()

    exporter = PickleItemExporter(stream)
    exporter.start_exporting()
    for exported in (first, second):
        exporter.export_item(exported)
    exporter.finish_exporting()

    # Rewind so loading starts at the first exported item.
    stream.seek(0)
    for expected in (first, second):
        self.assertEqual(pickle.load(stream), expected)
def test_export_multiple_items(self):
    """Two exported items round-trip, in order, when each loaded payload is fed back to ``item_class``."""
    first = self.item_class(name="hello", age="world")
    second = self.item_class(name="bye", age="world")
    stream = BytesIO()

    exporter = PickleItemExporter(stream)
    exporter.start_exporting()
    for exported in (first, second):
        exporter.export_item(exported)
    exporter.finish_exporting()

    # Rewind so loading starts at the first exported item.
    stream.seek(0)
    for expected in (first, second):
        # Each loaded payload is expanded as keyword arguments to rebuild the item.
        rebuilt = self.item_class(**pickle.load(stream))
        self.assertEqual(rebuilt, expected)
class PickleWriterPipeline(object):
    """Item pipeline that pickles items into ``class.pickle``.

    The file lives one directory above the directory containing this module.
    It is opened, and the export started, as soon as the pipeline is
    constructed; ``close_spider`` finishes the export and closes the file.
    """

    def __init__(self):
        self.file_name = "class.pickle"
        # Parent of the directory that holds this module.
        parent_dir = dirname(dirname(abspath(__file__)))
        self.file_path = join(parent_dir, self.file_name)
        self.file = open(self.file_path, 'wb')
        self.exporter = PickleItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Hand ``item`` to the exporter and return it unchanged."""
        self.exporter.export_item(item)
        return item
class ProxyPipeline(object):
    """Item pipeline that validates proxy items and pickles the ones that pass.

    Each item is expected to carry ``type``, ``ip`` and ``port`` keys. The
    proxy is exercised with a live GET through ``rq.get``; items that pass
    get ``speed`` and ``lastcheck`` fields and are exported to
    ``data/<spider.name>_Items.p``.
    """

    def __init__(self):
        # Maps spider -> open output file object.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it up to the crawler's open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the spider's output file and start an export on it."""
        file = open('data/%s_Items.p' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = PickleItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export, then close and forget the spider's file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Check the proxy described by ``item`` and export it if it is usable.

        Raises:
            DropItem: if the proxy type is ``'transparent'``, the request
                through the proxy fails, the response status is not 2xx, or
                the response took longer than 5 seconds.

        Returns:
            The item, with ``speed`` (the response's elapsed time) and
            ``lastcheck`` (today's date) filled in.
        """
        if item['type'] == 'transparent':
            raise DropItem("Transparent Proxy Dropped")

        # Only the URL construction and the request itself are guarded.
        # DropItem is an Exception, so keeping the status/latency checks
        # inside this try would rewrite their specific reasons into
        # "Cannot Connect".
        try:
            socket = "http://{0}:{1}".format(item['ip'], item['port'])
            proxy_dict = {"http": socket}
            response = rq.get('http://www.google.com', proxies=proxy_dict, timeout=2)
        except Exception:
            raise DropItem("Cannot Connect")

        elapsed = response.elapsed
        spider.logger.info('Socket{0}\tElapsed{1}'.format(socket, elapsed))

        if not (200 <= response.status_code < 300):
            raise DropItem("Not valid respose")
        if elapsed > timedelta(seconds=5):
            raise DropItem("Slow connection")

        item["speed"] = elapsed
        item["lastcheck"] = date.today()
        self.exporter.export_item(item)
        return item