class FinancePipeline(object):
    """Export stock items to '<spider.code>_stock.csv' in a fixed column order."""

    def __init__(self):
        # One open output file per spider instance.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_stock.csv' % spider.code, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(
            out,
            fields_to_export=['date', 'Open', 'High', 'Low',
                              'Close', 'Volume', 'AdjClose'])
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Write team items to '<spider.name>_items.csv' with three fixed columns."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and register for spider lifecycle signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        self.exporter.fields_to_export = ['team_year', 'track', 'region']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        handle = self.files.pop(spider)
        handle.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """Append each item to a per-feed CSV file under the spider's output dir.

    Fixes over the original: `os.makedirs` creates intermediate directories
    (bare `os.mkdir` failed when the parent was missing), and one exporter is
    cached per output path so the CSV header is written once per file instead
    of once per exported item.
    """

    def __init__(self):
        # filename -> (file handle, CsvItemExporter) cache.
        self.exporters = {}

    def process_item(self, item, spider):
        outputdir = '%s%s/%s' % (settings['ADAPTFM_OUTPUT_PATH'],
                                 spider.folder, item['brandCategory'][0])
        name = item['brandFeed'][0].replace('http://', '') \
                                   .replace('/', '_').replace('.xml', '')
        filename = '%s/%s.csv' % (outputdir, name)
        dirname = os.path.dirname(filename)
        if not os.path.isdir(dirname):
            os.makedirs(dirname)  # create the full directory tree
        if filename not in self.exporters:
            handle = open(filename, 'a+b')
            exporter = CsvItemExporter(handle)
            exporter.start_exporting()
            self.exporters[filename] = (handle, exporter)
        handle, exporter = self.exporters[filename]
        exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Flush and close every cached output file (Scrapy calls this hook)."""
        for handle, exporter in self.exporters.values():
            exporter.finish_exporting()
            handle.close()
        self.exporters = {}
class CsvExportPipeline(object):
    """Export finance items to a timestamped CSV, choosing columns by spider name."""

    # Column sets keyed by a substring of the spider name.
    _OPTION_FIELDS = ['date', 'instrument', 'option_symbol', 'symbol',
                      'expiration', 'type', 'strike', 'last', 'change',
                      'bid', 'ask', 'volume', 'open_int']
    _PRICE_FIELDS = ['date', 'open', 'high', 'low', 'close',
                     'volume', 'adj_close']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        out = open('%s_%s.csv' % (spider.name, int(time.time())), 'w+b')
        self.files[spider] = out
        if 'yopt' in spider.name:
            self.exporter = CsvItemExporter(
                out, fields_to_export=self._OPTION_FIELDS, dialect='excel')
        elif 'prices' in spider.name:
            self.exporter = CsvItemExporter(
                out, fields_to_export=self._PRICE_FIELDS, dialect='excel')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        if item is None:
            raise DropItem("None")
        self.exporter.export_item(item)
        return item
class EaCOpenListBotPipeline(object):
    """Export items to '<spider.name>_<spider.category>_items.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and wire up the spider lifecycle signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_%s_items.csv' % (spider.name, spider.category), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = ['vendor', 'product', 'default']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ChainxyPipeline(object):
    """Export scraped listings to a date-stamped '<spider.name>_YYYYMMDD.csv'."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate and connect spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        today = datetime.datetime.now().strftime('%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, today), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = ['title', 'save', 'desc',
                                          'long_desc', 'image']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ParkingCrawlerPipeline(object):
    """Log parking-lot items to /tmp/<spider.name>_log.csv without a header row.

    Fixes over the original: the first (pass-through) ``process_item``
    definition was dead code shadowed by a second one and is removed, and
    ``include_headers_line`` is set to the boolean ``False`` — the original
    assigned the string ``'false'``, which is truthy, so the header line was
    emitted anyway.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        f = open('/tmp/%s_log.csv' % spider.name, 'w+b')
        self.files[spider] = f
        self.exporter = CsvItemExporter(f)
        self.exporter.fields_to_export = ['_id', 'name', 'count', 'free',
                                          'timestamp', 'lat', 'lon', 'url']
        self.exporter.include_headers_line = False
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class TutorialPipeline(object):
    """Export company/contact items to a fixed file 'mediabase.csv'."""

    #: Output column order for the CSV.
    _FIELDS = ["Type", "Area", "PlaceName", "Web", "Tel", "Address", "Zip",
               "Town", "Hours", "CompanyName", "OrganizationNo", "Turnover",
               "Employed", "LastName", "FirstName", "Telephone",
               "AllabolagUrl", "EniroUrl"]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and connect spider lifecycle signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open("mediabase.csv", 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._FIELDS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export calendar-event items to '<spider.name>_items.csv'."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and register for open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            'Subject', 'Start_Date', 'Start_Time',
            'End_Date', 'End_Time', 'Location', 'All_Day_Event',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class TutorialPipeline(object):
    """Export movie items to '<spider.name>_items.csv'.

    Fix over the original: the local variable holding the column list was
    named ``list``, shadowing the builtin; renamed to ``columns``.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        columns = ['id', 'title', 'time', 'director', 'year', 'star', 'cost']
        self.exporter.fields_to_export = columns
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export items to ``spider.output_file`` using EXPORT_FIELDS from settings.

    Fix over the original: when the output file could not be opened,
    ``spider_closed`` still ran ``self.files.pop(spider)`` and raised
    ``KeyError``; the pop/close is now guarded on the file having been
    registered, and ``process_item`` is a no-op while no exporter exists.
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        try:
            fo = open(spider.output_file, 'w+b')
        except IOError as e:
            # Abort the crawl cleanly if the CSV file cannot be created.
            spider.crawler.engine.close_spider(
                spider, "ERROR: Can't create CSV file: " + str(e))
            return
        self.files[spider] = fo
        self.exporter = CsvItemExporter(fo)
        self.exporter.fields_to_export = settings.getlist("EXPORT_FIELDS")
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        if self.exporter is not None:
            self.exporter.finish_exporting()
        # Only close what was actually opened (open may have failed above).
        f = self.files.pop(spider, None)
        if f is not None:
            f.close()

    def process_item(self, item, spider):
        if self.exporter is not None:
            self.exporter.export_item(item)
        return item
def assertExportResult(self, item, expected, **kwargs):
    """Export *item* through a CsvItemExporter and compare the CSV bytes
    produced to *expected* (via assertCsvEqual)."""
    buf = BytesIO()
    exporter = CsvItemExporter(buf, **kwargs)
    exporter.start_exporting()
    exporter.export_item(item)
    exporter.finish_exporting()
    self.assertCsvEqual(buf.getvalue(), expected)
class CSVPipeline(object):
    """Export product items to '<spider.name>_result.csv'."""

    #: CSV column order for exported products.
    _COLUMNS = ['title', 'brand', 'description', 'price', 'main_image_url',
                'additional_image_urls', 'sku', 'category']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and register spider lifecycle callbacks."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_result.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class YangmaodangPipeline(object):
    """Collect scraped newsmth deal items, write them to a dated CSV sorted by
    reply count, and email the resulting file."""

    def __init__(self):
        self.filename = 'output/newsmth-' + time.strftime('%Y%m%d') + '.csv'
        self.file = open(self.filename, 'wb')
        # Items are buffered here and only exported (sorted) at close time.
        self.items = []

    def open_spider(self, spider):
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Sort the buffered articles by reply count, most-replied first.
        ranked = sorted(self.items,
                        key=lambda it: int(operator.itemgetter('reply_num')(it)),
                        reverse=True)
        for entry in ranked:
            self.exporter.export_item(entry)
        self.exporter.finish_exporting()
        self.file.close()
        send_email(self.filename)

    def process_item(self, item, spider):
        # Buffer only; export happens in close_spider after sorting.
        self.items.append(item)
        return item
class ChainxyPipeline(object):
    """Export sports-odds items to a date-stamped '<spider.name>_YYYYMMDD.csv'."""

    #: Output column order for odds rows.
    _COLUMNS = ['Sport_name', 'Date', 'Time',
                'Team1_name', 'Team1_points', 'Team1_spread', 'Team1_win',
                'Team1_total',
                'Team2_name', 'Team2_points', 'Team2_spread', 'Team2_win',
                'Team2_total',
                'Draw', 'last_update']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ChainxyPipeline(object):
    """Export movie/series items to a date-stamped '<spider.name>_YYYYMMDD.csv'."""

    #: Output column order for media rows.
    _COLUMNS = ['title', 'desc', 'image', 'stars', 'quality', 'imdb_code',
                'keywords', 'genres', 'year', 'first_air_date', 'eps', 'type',
                'server_f1', 'server_f2', 'vidnode', 'rapidvideo',
                'streamango', 'openload1', 'openload2']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVWriterPipeline(object):
    """Write every item to 'mobiles.csv' through one CsvItemExporter.

    Fix over the original: the output file opened in ``__init__`` was never
    recorded anywhere, yet ``spider_closed`` ran ``self.files.pop(spider)``,
    which always raised ``KeyError`` and leaked the file handle. The handle
    is now kept on ``self.file`` and closed in ``spider_closed``.
    """

    filename = ""

    def __init__(self):
        self.files = {}
        # Keep a direct reference so the file can be closed on spider close.
        self.file = open("mobiles.csv", 'wb')
        self.exporter1 = CsvItemExporter(
            file=self.file,
            fields_to_export=BillionPricesIndiaItem.fields.keys())

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.exporter1.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter1.export_item(item)
        return item
class BuildingsPipeline(object):
    """Export building items to a fixed file 'buildings.csv'.

    Column order comes from the module-level ``fields_to_export`` name.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and register spider lifecycle callbacks."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open("buildings.csv", "w+b")
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        # fields_to_export is defined elsewhere in this module.
        self.exporter.fields_to_export = fields_to_export
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class OpossumPipeline(object):
    """Re-shape scraped items into ExportImageItem rows and export them to a
    hard-coded CSV path."""

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('/home/moorcock/work/mrs_opossum/items.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = ['id', 'title', 'image', 'keywords']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Derive the image field from the first downloaded image's file stem.
        image_stem = item['images'][0]['path'].split('/')[-1].split('.')[0]
        item_exp = ExportImageItem(
            id=item['id'],
            title=item['title'].strip(' \t\n'),
            image=image_stem,
            keywords=item['keywords'],
        )
        self.exporter.export_item(item_exp)
        return item_exp
class CSVPipeline(object):
    """Export university-ranking items to '<spider.name>_items.csv'."""

    #: Output column order for ranking rows.
    _COLUMNS = ['name', 'rank', 'overallScore', 'teachingScore',
                'internationalOutlook', 'industryIncome', 'research',
                'citations', 'textBelow']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and connect spider lifecycle signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ProjetodsPipeline(object):
    """Clean job-posting items (strip punctuation/HTML, normalize salary) and
    export them to 'booksdata.csv'."""

    def __init__(self):
        # Open the output file and start the CSV export immediately.
        self.file = open("booksdata.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Translation table that deletes every punctuation character.
        strip_punct = str.maketrans('', '', string.punctuation)

        item['title'] = item['title'].translate(strip_punct)
        item['local'] = item['local'].translate(strip_punct).replace(" – ", " ")
        item['company_name'] = item['company_name'].translate(strip_punct)

        # Strip HTML markup from the description, then punctuation.
        # NOTE(review): BeautifulSoup is called without an explicit parser,
        # so the parser (and thus exact output) depends on what is installed.
        soup = BeautifulSoup(item['description'])
        item['description'] = soup.get_text(" ", strip=True).translate(strip_punct)

        # Normalize the salary: "NA" when missing, decimal point otherwise.
        if item['salary'] is None:
            item['salary'] = "NA"
        else:
            item['salary'] = item['salary'].replace(",", ".")

        self.exporter.export_item(item)
        return item
class DataPipeline(object):
    """Export contact items to a date-stamped '<spider.name>_YYYYMMDD.csv'."""

    #: Output column order for contact rows.
    _COLUMNS = ['id', 'first_name', 'last_name', 'company', 'email', 'title',
                'city', 'state', 'zip_code', 'country', 'address', 'address2',
                'headquarter_phone', 'contact_phone', 'updated']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export product items to 'crawler products.csv'.

    Fix over the original: ``from_crawler`` built and wired the pipeline but
    never returned it, so Scrapy received ``None`` and the pipeline was never
    installed. The missing ``return pipeline`` is added.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline, hook signals, and return the instance."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        result_file = open('crawler products.csv', 'w+b')
        self.files[spider] = result_file
        self.exporter = CsvItemExporter(result_file)
        self.exporter.fields_to_export = [
            'name', 'image', 'link', 'model', 'upc', 'ean', 'currencycode',
            'locale', 'price', 'saleprice', 'sku', 'retailer_key', 'instore',
            'shiptostore', 'shippingphrase', 'productstockstatus',
            'categories', 'gallery', 'features', 'condition',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        result_file = self.files.pop(spider)
        result_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class GnewsPipeline(object):
    """Export Google-News items to '<spider.name>.csv'.

    Fixes over the original: a dangling, unterminated ``'''def
    process_item...`` fragment at the end of the class (a syntax hazard /
    dead commented-out code) is removed, and the positional exporter
    arguments are spelled as keywords (``include_headers_line=True``,
    ``join_multivalued='\\n'``) for clarity.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = out
        # Multi-valued fields are joined with newlines inside one CSV cell.
        self.exporter = CsvItemExporter(out, include_headers_line=True,
                                        join_multivalued='\n')
        self.exporter.fields_to_export = [
            'category', 'topstory', 'snippet', 'link', 'originallink',
            'sublinks', 'sublinktext', 'gpost', 'gpostsnip', 'extras',
            'extraslink', 'related',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export per-player round scores to '<spider.name>.csv'."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        # 'player' followed by round_1 .. round_23.
        self.exporter.fields_to_export = (
            ['player'] + ['round_%d' % n for n in range(1, 24)])
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class OregonPipeline(object):
    """Export legislative-bill items to a date-stamped '<spider.name>_YYYYMMDD.csv'."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and register spider lifecycle callbacks."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = ['bill', 'committee', 'session',
                                          'text', 'url', 'filename']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class GanjiPipeline(object):
    """Export listing items (that have both price and summary) to a
    timestamped '<spider.name>.<YYYYmmddHHMMSS>.csv'."""

    def __init__(self):
        self.target_files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def gen_filename(self, spider):
        """Return '<name>.<timestamp>.csv' for this spider."""
        stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        return '.'.join([spider.name, stamp, 'csv'])

    def spider_opened(self, spider):
        target = open(self.gen_filename(spider), 'wb')
        self.target_files[spider] = target
        self.exporter = CsvItemExporter(target)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        target = self.target_files.pop(spider)
        target.close()

    def process_item(self, item, spider):
        # Drop incomplete listings: both price and summary are required.
        if not (item.get('price') and item.get('summary')):
            raise DropItem('not price or summary')
        self.exporter.export_item(item)
        return item
class FtcompanydataPipeline(object):
    """Export all items to '<spider.name>_.csv' with default columns."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class DoubantvPipeline(object):
    """Export Douban TV items to 'douban_tv_hanju.csv', with debug prints at
    each pipeline stage.

    Fix over the original: the class wrongly inherited from
    ``CsvItemExporter`` — it is a pipeline, not an exporter, never called the
    base ``__init__``, and only *uses* an exporter internally — so it now
    derives from ``object``.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        print('==========pipeline==========from_crawler==========')
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = 'douban_tv_hanju.csv'
        savefile = open(filename, 'wb+')
        self.files[spider] = savefile
        print('==========pipeline==========spider_opened==========')
        self.exporter = CsvItemExporter(savefile)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        print('==========pipeline==========spider_closed==========')
        self.exporter.finish_exporting()
        savefile = self.files.pop(spider)
        savefile.close()

    def process_item(self, item, spider):
        print('==========pipeline==========process_item==========')
        print(type(item))
        self.exporter.export_item(item)
        return item
class WikipediaEventCSVPipeline(object):
    """Export Wikipedia current-event items to '<spider.name>_items.csv'."""

    #: Output column order for event rows.
    _COLUMNS = ['date', 'day_of_week', 'category', 'sub_category',
                'news_header', 'source_names', 'source_list']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ChainxyPipeline(object):
    """Export store-location items to a date-stamped '<spider.name>_YYYYMMDD.csv'."""

    #: Output column order for store rows.
    _COLUMNS = ['store_name', 'store_number', 'address', 'address2', 'city',
                'state', 'zip_code', 'country', 'phone_number', 'latitude',
                'longitude', 'store_hours', 'store_type', 'other_fields',
                'coming_soon']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Append items to a date-range-named CSV using ';' as delimiter, full
    quoting, and no header line."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Filename encodes the spider's start and end dates; 'a+b' appends.
        out = open("{}_{}_{}__{}_{}_{}.csv".format(
            spider.pdda, spider.pddm, spider.pddj,
            spider.pdfa, spider.pdfm, spider.pdfj), 'a+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, include_headers_line=False,
                                        delimiter=';', quoting=csv.QUOTE_ALL)
        self.exporter.fields_to_export = ["name", "address", "zipcode",
                                          "city", "number", "date"]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class YangmaodangPipeline(object):
    """Buffer scraped newsmth deal items, export them to a dated CSV sorted
    by reply count (descending), then email the file."""

    def __init__(self):
        self.filename = 'output/newsmth-' + time.strftime('%Y%m%d') + '.csv'
        self.file = open(self.filename, 'wb')
        # All items are held here until the spider closes.
        self.items = []

    def open_spider(self, spider):
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Rank articles by their reply count before exporting.
        by_replies = sorted(
            self.items,
            key=lambda row: int(operator.itemgetter('reply_num')(row)),
            reverse=True)
        for row in by_replies:
            self.exporter.export_item(row)
        self.exporter.finish_exporting()
        self.file.close()
        send_email(self.filename)

    def process_item(self, item, spider):
        # Defer exporting until close_spider so the output can be sorted.
        self.items.append(item)
        return item
class CsvExportPipeline(object):
    """Export product items to a timestamped '<spider.name>_<epoch>.csv'
    using the excel CSV dialect."""

    #: Output column order for product rows.
    _COLUMNS = ['product_id', 'price', 'price_usd', 'currency',
                'when_created', 'source', 'title', 'heading', 'url',
                'in_stock', 'image']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        out = open('%s_%s.csv' % (spider.name, int(time.time())), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, fields_to_export=self._COLUMNS,
                                        dialect='excel')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        if item is None:
            raise DropItem("None")
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export catalog items to '<spider.name>.csv'."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = ['brand', 'name', 'division',
                                          'category', 'price', 'image_link']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CsdnPipeline(object):
    """Export 'csdn.user' items to CSV and persist 'csdn.login' credentials
    to Oracle.

    Fixes over the original: the log call was syntactically broken
    (``"username:"******"..."`` — apparently a redaction artifact) and is
    reconstructed as plain string concatenation; the very first user item
    only opened the file and was silently never exported; and the CSV file
    was never closed — a ``close_spider`` hook now closes it.
    """

    users = {}

    def __init__(self):
        self.file = None
        self.exporter = None

    def set_file(self, filename):
        # Lazily open the CSV output and its exporter on the first user item.
        self.file = open(filename, 'wb')
        self.exporter = CsvItemExporter(self.file)

    def process_item(self, item, spider):
        if spider.name == "csdn.user":
            if self.file is None:
                self.set_file("export_users.csv")
            # Export every item, including the one that triggered set_file.
            self.exporter.export_item(item)
        if spider.name == "csdn.login":
            if item['username']:
                sql = 'update t_csdn_users set real_password=:password,real_email=:email where username=:username'
                username = item['username']
                password = item['password']
                email = item['email']
                param = {'username': username, 'password': password,
                         'email': email}
                spider.oracle_db.execute_sql(sql, param, False)
                # SECURITY NOTE(review): this logs a plaintext password —
                # confirm whether that is intentional before keeping it.
                log.msg("username:" + username + "\tpassword:" + password,
                        level=log.INFO)
        return item

    def close_spider(self, spider):
        """Close the CSV output if it was ever opened."""
        if self.file is not None:
            self.file.close()
class TutorialPipeline(object):
    """Export movie items to '<spider.name>_items.csv'.

    Fix over the original: the local holding the column list was named
    ``list``, shadowing the builtin; renamed to ``columns``.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        columns = ['id', 'title', 'time', 'director', 'year', 'star', 'cost']
        self.exporter.fields_to_export = columns
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class GameListingPipeline(object):
    """Export listing items to '<spider.name>_products.csv' and mirror them
    into a database collection.

    Fix over the original: ``process_item`` fell off the end without
    returning the item, so any pipeline after this one received ``None``;
    it now returns the item.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        out.close()

    def process_item(self, item, spider):
        keys = ['name', 'address', 'zipCode', 'jobCostMin', 'jobCostMax',
                'contactName', 'contactPhone', 'website', 'licenseNumber',
                'averageRating', 'profileUrl', 'followers', 'following',
                'badgeCount', 'projectCount', 'reviewCount', 'commentCount']
        dictionary = item_to_dictionary(item, keys)
        # Mirror the row into the 'updatedListings' collection.
        client.insert('updatedListings', dictionary, callback=insert_callback)
        self.exporter.export_item(item)
        return item
class CrawlerDataPipeline(object):
    """Export product items to a fixed file 'data.csv'."""

    #: Output column order for product rows.
    _COLUMNS = ['product_asin', 'product_name', 'product_is_have_patten',
                'product_description', 'image_link', 'original_image',
                'color', 'patten', 'price', 'imported_code']

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('data.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self._COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export strain records to '<spider.name>_items.csv' with a fixed schema."""

    # Column order: scrape metadata, strain identity, then the scored
    # flavor/effect/medical/ailment/negative/popularity groups.
    EXPORT_FIELDS = [
        'scrape_date', 'scrape_time',
        'strain_name', 'strain_type', 'website', 'strain_highlights',
        'num_ratings', 'avg_rating', 'num_of_review',
        'flavor_one', 'flavor_two', 'flavor_three',
        'effect_one', 'effect_one_score', 'effect_two', 'effect_two_score',
        'effect_three', 'effect_three_score', 'effect_four', 'effect_four_score',
        'effect_five', 'effect_five_score',
        'medical_one', 'medical_one_score', 'medical_two', 'medical_two_score',
        'medical_three', 'medical_three_score', 'medical_four', 'medical_four_score',
        'medical_five', 'medical_five_score',
        'ailment_one', 'ailment_two', 'ailment_three', 'ailment_four', 'ailment_five',
        'negative_one', 'negative_one_score', 'negative_two', 'negative_two_score',
        'negative_three', 'negative_three_score', 'negative_four', 'negative_four_score',
        'negative_five', 'negative_five_score',
        'most_popular_one', 'most_popular_two', 'most_popular_three',
        'most_popular_four', 'most_popular_five', 'most_popular_six',
        'most_popular_seven', 'most_popular_eight', 'most_popular_nine',
        'most_popular_ten',
    ]

    def __init__(self):
        # One open output file per running spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start the exporter."""
        output = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.fields_to_export = self.EXPORT_FIELDS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class MultiCSVItemPipeline(object):
    """Export every item to two CSVs, one per item schema (rating / summary)."""

    def __init__(self):
        self.files = {}
        # FIX: keep references to the output files so they can be closed.
        # The original opened them inline in the exporter calls (never
        # closeable) and its spider_closed popped a never-populated
        # self.files dict, raising KeyError at shutdown.
        self.rating_file = open("profRating.csv", 'wb')
        self.summary_file = open("profSummary.csv", 'wb')
        self.exporter1 = CsvItemExporter(fields_to_export=ProfRatingItem.fields.keys(), file=self.rating_file)
        self.exporter2 = CsvItemExporter(fields_to_export=ProfSummaryItem.fields.keys(), file=self.summary_file)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and subscribe to spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Begin exporting on both outputs."""
        self.exporter1.start_exporting()
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        """Flush both exporters and close both output files."""
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        self.rating_file.close()
        self.summary_file.close()

    def process_item(self, item, spider):
        """Write the item to both CSV outputs."""
        self.exporter1.export_item(item)
        self.exporter2.export_item(item)
        return item
class AirtspidersPipeline(object):
    """Export company financial metrics to '<spider.name>_items.csv'."""

    def __init__(self):
        # FIX: the original read `self.files = {SIXCOMPDATA.csv}`, a
        # NameError at instantiation; it should start as an empty
        # spider -> file mapping like every other pipeline here.
        self.files = {}

    # FIX: the original lacked @classmethod, so Scrapy's
    # Class.from_crawler(crawler) call would pass the crawler as `cls`.
    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and subscribe to spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start the exporter."""
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'Company Name', 'Current Price', 'Previous Close',
            "Day's Range", 'Historical Volatility', 'Market Cap',
            'Shares Outstanding', 'EPS', 'P/E Ratio',
            'Beta (Volatility)', 'Percent Held by Institutions'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class ChainxyPipeline(object):
    """Export listing records to a date-stamped '<spider>_<YYYYMMDD>.csv'."""

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the date-stamped CSV file and start the exporter."""
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        output = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.fields_to_export = [
            "name", "number", "item_type", "location", "building",
            "bedroom", "bathroom", "size", "title_deep_number",
            "description", "date", "link", "photo",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class ExportCSV(object):
    """Exporting to export/csv/spider-name.csv file."""

    def __init__(self):
        # spider -> open output file handle; exporter created lazily on open.
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open exports/csv/<spider>.csv and start the exporter."""
        file_to_save = open('exports/csv/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file_to_save
        self.exporter = CsvItemExporter(file_to_save)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class PwdhoundsPipeline(object):
    """Export link-tree records to '<spider.name>_items.csv'."""

    # Column order for the CSV output.
    EXPORT_FIELDS = ['id', 'name', 'link', 'index', 'parent_id']

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start the exporter."""
        output = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.fields_to_export = self.EXPORT_FIELDS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export real-estate listings to '<spider.name>_items.csv' (no header row)."""

    # Column order for the CSV output.
    EXPORT_FIELDS = [
        "url", "status", "date", "mls", "address", "price",
        "beds", "baths", "homesize", "lotsize", "description", "images",
    ]

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start a headerless exporter."""
        output = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        # Suppress the header line: downstream consumers expect raw rows.
        self.exporter.include_headers_line = False
        self.exporter.fields_to_export = self.EXPORT_FIELDS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """Export items to the file named by the spider's `output_file` attribute."""

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Resolve the spider's output path, open it, and start exporting."""
        self.filepath = os.path.abspath(spider.output_file)
        output = open(self.filepath, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export, close the file, and log its location."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()
        log.msg('CSV output file location: "%s"' % self.filepath)

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class YoutubespiderPipeline(object):
    """Export YouTube video items to data-master.csv."""

    def __init__(self):
        """
        Define the CSVItemExporter for the YouTubeDataModel.
        Item Exportation, file encoding and the sequence of fields defined.
        """
        # FIX: keep a reference to the output file so spider_closed can
        # close it; the original passed an anonymous handle to the
        # exporter, leaking it and risking unflushed rows at shutdown.
        self.csv_file = open('data-master.csv', 'wb')
        self.csv_exporter = CsvItemExporter(self.csv_file)
        self.csv_exporter.encoding = 'utf-8'
        self.csv_exporter.fields_to_export = [
            'url', 'title', 'views', 'likes', 'dislikes',
            'channel_name', 'publish_date', 'channel_subscriber_count'
        ]
        self.csv_exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        # NOTE(review): nothing here connects this handler to the
        # spider_closed signal — confirm it is hooked up elsewhere.
        self.csv_exporter.finish_exporting()
        self.csv_file.close()

    def process_item(self, item, spider):
        """
        Exports item through Item Exporter
        :param item: containing the data
        :param spider: spider that extracted and saved inside item
        :return: the item itself
        """
        self.csv_exporter.export_item(item)
        return item
class MroPipeline(object):
    """Opt-in CSV export: active only for spiders that set `_custom_csv`."""

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Return a wired-up pipeline, or None when the spider opted out."""
        # Spiders enable this pipeline by setting `_custom_csv = True`.
        if not getattr(crawler.spider, '_custom_csv', False):
            return None
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the spider-chosen (or default) output file and start exporting."""
        filename = getattr(spider, 'output_filename',
                           'result_{}.format'.replace('format', 'csv').format() or
                           'result_{}.csv'.format(spider.name))
        filename = getattr(spider, 'output_filename',
                           'result_{}.csv'.format(spider.name))
        output = open(filename, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        # None means "export every field found on the items".
        self.exporter.fields_to_export = getattr(spider, 'output_fields', None)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export book records to '<spider.name>_items.csv'.

    Columns are taken from each item as-is; a fixed field order (e.g.
    title/author/country/.../People_nums) was deliberately left disabled.
    """

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start the exporter."""
        output = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        # self.exporter.fields_to_export = ['title','author','country','note','publish_date','press','Score','Star','People_nums']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class ScarpylinkPipeline(object):
    """Export scraped links to CSV, echoing each as an HTML anchor tag."""

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start the exporter."""
        output = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Echo the item as an anchor tag, export it, and pass it along."""
        anchor = "<a href='%s' title='%s'>%s</a>" % (
            item['url'], item['title'], item['title'])
        print(anchor)
        self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """Export description/phone pairs to a timestamped per-spider CSV.

    Uses the legacy dispatcher-based signal wiring rather than
    from_crawler. A duplicate-filter on `description` existed once and
    was deliberately disabled.
    """

    def __init__(self):
        # Legacy wiring: connect lifecycle handlers via the dispatcher.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # spider -> open output file handle
        self.files = {}

    def spider_opened(self, spider):
        """Open '<spider>_<epoch>.csv' and start exporting two columns."""
        output = open("%s_%s.csv" % (spider.name, int(time.time())), "w+b")
        self.files[spider] = output
        self.exporter = CsvItemExporter(output, fields_to_export=["description", "phone"])
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Drop None items; export everything else and pass it along."""
        if item is None:
            raise DropItem("None")
        self.exporter.export_item(item)
        return item
class AsiaOddsPipeline(object):
    """Export Asian-odds items to CSV; match spiders get a date-stamped file."""

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and subscribe to spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file (name depends on spider type) and start exporting."""
        # FIX: open in binary mode ('w+b') like every other pipeline in
        # this file — CsvItemExporter writes bytes, so a text-mode 'w'
        # handle fails on Python 3.
        if isinstance(spider, MatchSpider):
            file = open('%s_asia_%s.csv' % (spider.name, spider.match_date), 'w+b')
        else:
            file = open('%s_output.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export only AsiaOddsItem instances; pass every item downstream."""
        if isinstance(item, AsiaOddsItem):
            self.exporter.export_item(item)
            #raise DropItem("AsiaOdds item handled.")
            return item
        else:
            return item

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
class CVSExport(object):
    """Export every scraped item to a fixed 'postUGR_items.csv' file."""

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the fixed output file and start the exporter."""
        output = open('postUGR_items.csv', 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class MakkanPipeline(object):
    """Export property-listing records to '<spider.name>_items.csv'."""

    # Column order for the CSV output (listing identity, pricing,
    # geo/location, lister details, then miscellaneous attributes).
    EXPORT_FIELDS = [
        'data_id', 'Building_name', 'config_type', 'Selling_price',
        'Monthly_Rent', 'lat', 'longt', 'platform', 'city', 'listing_date',
        'txn_type', 'property_type', 'locality', 'sqft', 'Status',
        'listing_by', 'name_lister', 'Details', 'address', 'price_on_req',
        'sublocality', 'age', 'google_place_id', 'immediate_possession',
        'mobile_lister', 'areacode', 'management_by_landlord',
        'carpet_area', 'updated_date',
    ]

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider CSV file and start the exporter."""
        output = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.fields_to_export = self.EXPORT_FIELDS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class LandpinPipeline(object):
    """Export land-parcel records to a date-stamped '<spider>_<YYYYMMDD>.csv'."""

    # Column order for the CSV output.
    EXPORT_FIELDS = [
        'url', 'state', 'county', 'apn', 'gps',
        'size', 'price', 'zoning', 'legal_description',
    ]

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire up lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the date-stamped CSV file and start the exporter."""
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        output = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.fields_to_export = self.EXPORT_FIELDS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the per-spider file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
class WriteToCsv(object):
    """Append each scraped item as one row to the CSV path from settings."""

    # NOTE(review): @classmethod with a first parameter named `self` is
    # unusual but kept for interface compatibility with existing callers.
    @classmethod
    def process_item(self, item, spider):
        """Append *item* to settings.csv_file_path and pass it along."""
        # FIX: the original opened the file on every item and never closed
        # it, leaking one handle per item; `with` guarantees the close.
        # 'ab+' appends so rows accumulate across items; the header line
        # is suppressed since the file is written incrementally.
        with open(settings.csv_file_path, 'ab+') as csv_file:
            exporter = CsvItemExporter(csv_file, include_headers_line=False)
            exporter.fields_to_export = settings.csv_export_fields
            exporter.export_item(item)
        return item
def test_header_export_two_items(self):
    """Exporting the same record twice writes the header line only once."""
    # Exercise both the Item object and its plain-dict equivalent.
    for candidate in (self.i, dict(self.i)):
        buf = BytesIO()
        exporter = CsvItemExporter(buf)
        exporter.start_exporting()
        exporter.export_item(candidate)
        exporter.export_item(candidate)
        exporter.finish_exporting()
        self.assertCsvEqual(buf.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')