class CsvExportPipeline(object):
    """Export society items to '<spider>_societies.csv' with a fixed column order."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_societies.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            'name', 'president', 'email', 'url', 'facebook',
            'membership', 'about', 'date_established',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):
    """Despite the name, writes items as CSV into '<module dir>/output/<spider>_data.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        import os
        # output lives next to this module, under ./output/
        outputDir = os.path.dirname(__file__) + '/output/'
        out = open(outputDir + '%s_data.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class FacupPipeline(object):
    """Write result items and missing-result items to two separate CSV files."""

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    # create files and instantiate exporter class
    # then run start_exporting() method, this is required for item exporter class
    def spider_opened(self, spider):
        self.results_csv = open('results_3.csv', 'wb')
        self.missing_csv = open('results_miss_2.csv', 'wb')
        self.results_exporter = CsvItemExporter(self.results_csv)
        self.missing_exporter = CsvItemExporter(self.missing_csv)
        self.results_exporter.start_exporting()
        self.missing_exporter.start_exporting()

    def process_item(self, item, spider):
        # BUG FIX: the original re-created both exporters here (discarding the
        # ones prepared in spider_opened) and never exported the item, so both
        # CSV files always ended up empty.  Export the item instead.
        # NOTE(review): the routing rule between results/missing is not visible
        # from this block; items go to the results file — confirm with spider.
        self.results_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.results_exporter.finish_exporting()
        self.missing_exporter.finish_exporting()
        self.results_csv.close()
        self.missing_csv.close()
class CSVWriterPipeline(object):
    """Export items to the CSV path configured via the OUTPUT_FILE setting."""

    def __init__(self, filename):
        self.filename = filename  # destination path from crawler settings

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls(crawler.settings.get('OUTPUT_FILE'))
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open(self.filename, 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.encoding = 'utf-8'
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def assertExportResult(self, item, expected, **kwargs):
    """Export *item* through a fresh CsvItemExporter and compare the raw CSV bytes."""
    buffer = BytesIO()
    exporter = CsvItemExporter(buffer, **kwargs)
    exporter.start_exporting()
    exporter.export_item(item)
    exporter.finish_exporting()
    self.assertCsvEqual(buffer.getvalue(), expected)
class DumpToFile(object):
    """
    Dump harvested data into a flat file; no other logic lives here
    (it's "Dump" :-)
    """

    def __init__(self):
        self.files = {}    # spider name -> open file handle
        self.counter = 0   # number of items exported

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # TODO: verify if still needed for registration of spider_closed/opened event?
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        f = open(spider.get_dump_filepath(), 'w')
        self.files[spider.name] = f
        # by default csv module uses Windows-style line terminators (\r\n)
        self.exporter = CsvItemExporter(f, include_headers_line=True,
                                        delimiter='|', lineterminator='\n')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider.name).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # for counter, could set att in spider at closing
        self.counter += 1
        return item
class CsvExportPipeline(object):
    """
    app.pipelines.exporter_csv.CsvExportPipeline
    """

    def __init__(self):
        self.files = {}      # spider -> open file handle
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVExportPipeline(object):
    """Export translation pairs to the file named by spider.nameOfFile."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = ['originalString', 'translatedString']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """Export job-vacancy items to a fixed 'vagas.csv' file."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('vagas.csv', 'wb')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class WebcrawlerPipeline(object):
    """Export crawled URLs to '<spider>_urls.txt' as header-less CSV rows."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open("%s_urls.txt" % (spider.name), "w+b")
        self.files[spider] = out
        # plain value rows only; no header line
        self.exporter = CsvItemExporter(out, include_headers_line=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export publication items to '<spider>_items.csv' with a fixed column order."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            "filename", "titel", "publicatie", "dossiernummer", "organisatie",
            "publicatiedatum", "publicatietype", "file_urls",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export book-rating items to a tab-delimited '<spider>_items.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        # tab-separated despite the .csv extension
        self.exporter = CsvItemExporter(out, delimiter='\t')
        self.exporter.fields_to_export = [
            'userId', 'bookId', 'name', 'rating', 'relativeRating', 'booklistNum',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class AmazonCsvPipeline(object):
    """Write Amazon goods to CSV, de-duplicating on the product title."""

    def open_spider(self, spider):
        # file object that receives the CSV rows
        self.f = open("Amazon_goods_crawl.csv", "w")
        # CSV exporter bound to that file
        self.csv_exporter = CsvItemExporter(self.f)
        self.csv_exporter.start_exporting()
        # titles already written, used for de-duplication
        self.add_title = set()

    def process_item(self, item, spider):
        if item['title'] in self.add_title:
            # FIX: corrected the '[EEROR]' typo and switched to the
            # print() function (the original used a Python-2-only statement).
            print(u'[ERROR] 数据已保存,勿重复%s' % item['title'])
        else:
            self.add_title.add(item['title'])
            # write one item per row
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the CSV stream, then close the file
        self.csv_exporter.finish_exporting()
        self.f.close()
class BitcoinTalkCrawlerPipeline(object):
    """Export forum messages to '<spider>_items.csv' with a fixed column order."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            'timestamp', 'category_id', 'topic_id', 'topic_title',
            'message_number', 'message_author', 'message_text',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class catalogscraperPipeline(object):
    """Export catalog items (title column only) to '<spider>_items.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open("%s_items.csv" % spider.name, "w+b")
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        # other candidate columns, currently disabled:
        # 'subject', 'description', 'creator', 'source', 'published',
        # 'rights', 'citation', 'url'
        self.exporter.fields_to_export = ["title"]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def test_header_export_two_items(self):
    """The header row must appear once even when two items are exported."""
    # exercise both the Item object and its plain-dict equivalent
    for item in [self.i, dict(self.i)]:
        output = BytesIO()
        exporter = CsvItemExporter(output)
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.export_item(item)
        exporter.finish_exporting()
        self.assertCsvEqual(output.getvalue(),
                            'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
class FashionnovaPipeline(object):
    """Pipeline that writes every scraped item to fashionnova.csv."""

    def __init__(self):
        self.filename = 'fashionnova.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class TsvPipeline(object):
    """Export items to a timestamped .tsv file, optionally uploading to Azure.

    The file is created with a trailing '*' marker in its name; on close the
    marker is stripped and the closing timestamp appended, giving
    '<spider>-<opened>-<closed>.tsv'.
    """

    def __init__(self):
        self.files = dict()  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        opened_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
        out = open(spider.name + '-' + opened_at + '.tsv*', 'wb')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, include_headers_line=True,
                                        join_multivalued=';', encoding="utf-8",
                                        delimiter='\t')
        # per-spider column layout
        if spider.name == 'user':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'joindate', 'activedate']
        elif spider.name == 'subject':
            self.exporter.fields_to_export = ['subjectid', 'order', 'subjectname', 'subjecttype', 'rank', 'date', 'votenum', 'favnum', 'staff', 'relations']
        elif spider.name == 'record':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'iid', 'typ', 'state', 'adddate', 'rate', 'tags', 'comment']
        elif spider.name == 'index':
            self.exporter.fields_to_export = ['indexid', 'creator', 'favourite', 'date', 'items']
        elif spider.name == 'friends':
            self.exporter.fields_to_export = ['user', 'friend']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out = self.files.pop(spider)
        filename = out.name
        # drop the '.tsv*' suffix (5 chars) and append the closing timestamp
        newname = filename[:-5] + '-' + datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + '.tsv'
        out.close()
        os.rename(filename, newname)
        if UPLOAD_TO_AZURE_STORAGE:
            block_blob_service = BlockBlobService(account_name=AZURE_ACCOUNT_NAME,
                                                  account_key=AZURE_ACCOUNT_KEY)
            block_blob_service.create_blob_from_path(
                AZURE_CONTAINER, newname, newname,
                content_settings=ContentSettings(content_type='text/tab-separated-values'))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MacystopPipeline(object):
    """Pipeline that writes every scraped item to topallproduct_all.csv."""

    def __init__(self):
        self.filename = 'topallproduct_all.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class WriteItemPipeline(object):
    """Pipeline that writes every scraped item to Glassdoor.csv."""

    def __init__(self):
        self.filename = 'Glassdoor.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, "wb")
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class PyCsvPipeline:
    """Export school admission items to a fixed absolute CSV path."""

    def open_spider(self, spider):
        self.file = open("/home/bladestone/lbb.csv", "wb")
        self.exporter = CsvItemExporter(
            self.file,
            fields_to_export=["schoolName", "currentBatch", "totalNumberInPlan"])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class RegistryScraperPipeline(object):
    """Pipeline that writes every scraped item to the registry output CSV."""

    def __init__(self):
        self.filename = 'registry_scraper/output/employment_site.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class ReutersPipeline(object):
    """Pipeline that writes every scraped item to reuters_news.csv."""

    def __init__(self):
        self.filename = 'reuters_news.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class WriteItemPipeline(object):
    """Pipeline that writes every scraped item to howlongtobeat_playtimes.csv."""

    def __init__(self):
        self.filename = 'howlongtobeat_playtimes.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class CsvPipeline(object):
    """Export items to kaist_pulse.csv, then convert the CSV to an .xlsx copy."""

    def __init__(self):
        # exporter is created eagerly, at pipeline construction time
        self.file = open('kaist_pulse.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # mirror the finished CSV into an Excel workbook
        df = pd.read_csv('kaist_pulse.csv')
        excel = pd.ExcelWriter('kaist_pulse.xlsx')
        df.to_excel(excel, index=False)
        excel.save()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class PlayerPipeline(object):
    """Export PlayerItem instances to player_info.csv; other item types pass through."""

    def open_spider(self, spider):
        # file that receives the CSV rows
        self.filename = open("player_info.csv", "wb")
        # CSV exporter bound to that file object
        self.csv_exporter = CsvItemExporter(self.filename)
        # begin the export session
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # only player items belong in this file
        if isinstance(item, PlayerItem):
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # end the export session and release the file
        self.csv_exporter.finish_exporting()
        self.filename.close()
class WriteItemPipeline(object):
    """Pipeline that logs and writes every scraped item to coliee.csv."""

    def __init__(self):
        self.filename = 'coliee.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        print('Process: {}'.format(item))
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class PdfCsvPipeline(CsvItemExporter):
    """Record which PDFs were collected in pdflist.csv.

    NOTE(review): subclassing CsvItemExporter looks accidental — the class
    composes its own exporter and never calls super().__init__(); confirm
    before changing the base class.
    """

    def __init__(self):
        self.fname = os.path.join(SAVE_PATH, "pdflist.csv")
        self.file = open(self.fname, "wb")
        self.exporter = CsvItemExporter(file=self.file,
                                        fields_to_export=["files", "file_urls"])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export SEO-audit rows to CSV, then rewrite the file without blank lines.

    The project name is read interactively when the spider opens and is kept
    in a module-level global so spider_closed can rebuild the same filename.
    """

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # FIX: the original wrapped this prompt in a pointless
        # ``while True: ...; break`` loop that could only ever run once.
        global name_website
        name_website = input("Projekt Namen eingeben: ")
        file = open('%s_%s.csv' % (spider.name, name_website), 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            "url", "status_code", "cache_control", "title", "title_length",
            "title_count", "description", "description_length",
            "description_count", "canonical", "canonical_self", "h1",
            "h1_count", "wordcount", "internal_links", "external_links",
            "amp_html", "amp_valid", "redirect_location", "referrer",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        # given I am using Windows i need to elimate the blank lines in the csv file
        print("Starting csv blank line cleaning")
        with open('%s_%s.csv' % (spider.name, name_website), 'r') as f:
            reader = csv.reader(f)
            original_list = list(reader)
            cleaned_list = list(filter(None, original_list))
        with open('%s_%s_cleaned.csv' % (spider.name, name_website), 'w',
                  newline='') as output_file:
            wr = csv.writer(output_file, dialect='excel')
            for data in cleaned_list:
                wr.writerow(data)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ExpertsExtractPipeline(object):
    """Export faculty-expert profiles to 'CUNY Graduate Center.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open file handle
        self.file_name = 'CUNY Graduate Center.csv'
        # fixed column order: identity, contact, bio, then expertise slots 1-25
        self.export_fields = [
            "name", "title_1", "title_2", "title_3",
            "department_1", "department_2", "department_3",
            "phone", "email", "website", "biography", "headshot",
            "faculty_page",
            "areas_of_expertise_1", "areas_of_expertise_2",
            "areas_of_expertise_3", "areas_of_expertise_4",
            "areas_of_expertise_5", "areas_of_expertise_6",
            "areas_of_expertise_7", "areas_of_expertise_8",
            "areas_of_expertise_9", "areas_of_expertise_10",
            "areas_of_expertise_11", "areas_of_expertise_12",
            "areas_of_expertise_13", "areas_of_expertise_14",
            "areas_of_expertise_15", "areas_of_expertise_16",
            "areas_of_expertise_17", "areas_of_expertise_18",
            "areas_of_expertise_19", "areas_of_expertise_20",
            "areas_of_expertise_21", "areas_of_expertise_22",
            "areas_of_expertise_23", "areas_of_expertise_24",
            "areas_of_expertise_25",
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open(self.file_name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MySQLPipeline(object):
    """Dump items to output2.csv, then bulk-load that CSV into MySQL tables."""

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output2.csv', 'w+b')
        # quote everything and use \n endings so LOAD DATA parses cleanly
        self.exporter = CsvItemExporter(self.file, quoting=csv.QUOTE_ALL,
                                        lineterminator="\n")
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # BUG FIX: finish_exporting() must run BEFORE the file is closed and
        # before MySQL reads it.  The original called it last — after close()
        # (writing to a closed file raises) and after the LOAD, which could
        # therefore miss buffered rows.
        self.exporter.finish_exporting()
        self.file.close()
        conn = pymysql.connect(host=MYSQL_HOST, db='scrapedb', user=MYSQL_USER,
                               passwd=MYSQL_PWORD, charset='utf8',
                               use_unicode=True, local_infile=True)
        cursor = conn.cursor()
        for tableName in [
            'walmart_latest_crawl', 'walmart_products_unique', 'walmart_products'
        ]:
            cursor.execute(
                """LOAD DATA LOCAL INFILE 'output2.csv' INTO TABLE %s FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\n' IGNORE 1 LINES (category,product_url,description,rating,img_url,brand,upc,seller,num_ratings, department,quantity,external_id,price,name);""" % tableName)
        conn.commit()
        conn.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ExtractPipeline(object):
    """Export ammunition listings to a timestamped basspro results CSV."""

    def __init__(self):
        self.files = {}  # spider -> open file handle
        self.file_name = f'basspro_results_{time.time()}.csv'
        # fixed column order for the output file
        self.export_fields = [
            'ItemCode', 'Name', 'Reviews', 'Rating', 'Caliber',
            'BulletWeight', 'BulletType', 'Units', 'Price', 'IsOnSale',
            'RegularPrice',
            # 'Availability',
            'Link',
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open(self.file_name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CompletenessPipeline(object):
    """Divert items with missing required fields to an 'incomplete' CSV and drop them."""

    # every one of these must be present and non-empty
    REQUIRED_FIELDS = ('name', 'brand', 'description', 'url',
                       'original_price', 'price', 'image_urls')

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('/data/incomplete_%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        # an item is complete only if every required field is neither None nor ''
        complete = all(item[field] is not None and item[field] != ''
                       for field in self.REQUIRED_FIELDS)
        if not complete:
            self.exporter.export_item(item)
            raise DropItem("Missing one or more element in %s" % item['url'])
        return item
class CsvExportPipeline(object):
    """Pipeline that writes every scraped item to output.csv."""

    def __init__(self):
        self.file = None      # set in open_spider
        self.exporter = None  # set in open_spider

    def open_spider(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class WebmdPipeline(object):
    # 418 lsw blocked class WriteItemPipeline(object):
    """Pipeline that writes every scraped review to webmd_reviews.csv."""

    def __init__(self):
        self.filename = 'webmd_reviews.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class WriteItemPipeline(object):
    """Pipeline that writes every scraped headline to nasdaq_headlines.csv."""

    def __init__(self):
        # date-stamped variant, currently disabled:
        # t = datetime.datetime.now()
        # self.filename = 'nasdaq_headlines_' + t.strftime('%Y%m%d') + '.csv'
        self.filename = 'nasdaq_headlines.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class TheBeatlesChordsPipeline(object):
    """Export song name/chords pairs to the_beatles_chords.csv."""

    def __init__(self):
        self.filename = 'the_beatles_chords.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.fields_to_export = ['name', 'chords']
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class CsvPipeline(object):
    """
    Write items to a CSV file with a stable (insertion-order) field layout.
    """

    def __init__(self):
        self.file = open('./result/crawl_result.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='gbk')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        item = OrderedDict(item)                      # freeze the field order
        item = json.dumps(item, ensure_ascii=False)   # serialize to JSON text
        # BUG FIX: the original round-tripped the JSON string through eval(),
        # which is unsafe on untrusted data and breaks on JSON literals such
        # as true/false/null; json.loads() is the correct inverse.
        self.exporter.export_item(json.loads(item))   # write the row
        return item
class FormScraperPipeline:
    """Export form items to '<domain>.<suffix>.csv' derived from the spider URL."""

    def open_spider(self, spider):
        ext = tldextract.extract(spider.url)
        # basically just removing the scheme so it doesnt mess with file paths
        file_name = '.'.join((ext.domain, ext.suffix, 'csv'))
        print(file_name)
        self.file = open(file_name, 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def process_item(self, item, spider):
    """Append the item to phone.txt (pipe-separated) and to phones.csv."""
    print("=================== process_item in pipeline =======================")
    prd_name = item['prd_name']
    link = item['link']
    comments = item['comments']
    output = f'|{prd_name}|\t|{link}|\t|{comments}|\n\n'
    # human-readable text log
    with open('./phone.txt', 'a+', encoding='utf-8') as article:
        article.write(output)
    # one-shot exporter per item; headers suppressed so appends stay clean
    with open("./phones.csv", "a+b") as f:
        exporter = CsvItemExporter(f, include_headers_line=False)
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
    return item
class WriteItemPipeline(object):
    """Pipeline that writes every scraped item to ebay11.csv.

    Reusable in other projects — only the filename changes.
    """

    def __init__(self):
        self.filename = 'ebay11.csv'

    def open_spider(self, spider):
        # binary mode avoids the Windows extra-newline problem
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class RentPipeline(object):
    """Append rental listings to rent_info.csv across runs."""

    def __init__(self):
        self.filename = 'rent_info.csv'

    def open_spider(self, spider):
        # open in append mode so it adds instead of overwrites
        self.csvfile = open(self.filename, 'ab')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class GooglenewsPipeline(object): def __init__(self): write_date = SearchInformation.str_from_date.replace(".", "") self.file = open("from" + write_date + "_" + "GoogleNews.csv", "wb") # ,newline="" self.exporter = CsvItemExporter(self.file, encoding='utf-8', delimiter="-") self.exporter.start_exporting() def close_spider(self, spider): self.exporter.finish_exporting() self.file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item '''
class CsvExporterPipeline(object):
    """Export items to '<spider>.csv' without a header row."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    def open_spider(self, spider):
        out = open('{}.csv'.format(spider.name), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, include_headers_line=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class WriteItemPipeline(object):
    """Pipeline that writes every scraped item to weatherThirtyYearsKLAX1986.csv."""

    def __init__(self):
        self.filename = 'weatherThirtyYearsKLAX1986.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class Tc58CSVPipeline(object):
    """Export items to gav.csv via CsvItemExporter."""

    def open_spider(self, spider):
        # open the csv file object with write access
        self.csv = open("gav.csv", "w")
        # build a CSV exporter bound to that file object
        self.csvexporter = CsvItemExporter(self.csv)
        # begin the export session so items can be written
        self.csvexporter.start_exporting()

    def process_item(self, item, spider):
        # write the item's data into the csv file
        self.csvexporter.export_item(item)
        return item

    def close_spider(self, spider):
        # end the export session and close the file
        self.csvexporter.finish_exporting()
        self.csv.close()
class CentaPartPipeline(object):
    """Export items to Centa_Part.csv, then rewrite it without the index column."""

    def __init__(self):
        self.filename = 'Centa_Part.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
        # round-trip through pandas to normalize the file
        f = read_csv(self.filename)
        f.to_csv(self.filename, index=False)
class WriteItemPipeline(object):
    """Pipeline that writes every scraped chart entry to acharts_singles.csv."""

    def __init__(self):
        # target : 1(single), 2(albums)
        self.filename = 'acharts_singles.csv'
        # self.filename = 'acharts_albums.csv'

    def open_spider(self, spider):
        # binary mode: CsvItemExporter handles its own encoding
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
class WeiboPipeline(object):
    """Export Weibo fan profiles to fans_data.csv with a fixed column order."""

    def __init__(self):
        print('begin')
        self.file = open("./fans_data.csv", "wb")
        self.exporter = CsvItemExporter(
            self.file,
            fields_to_export=['fid', 'screen_name', 'profile_image_url',
                              'profile_url', 'followers_count', 'follow_count',
                              'desc1'])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        print('done')
        self.exporter.finish_exporting()
        self.file.close()
class AmsemailbotPipeline(object):
    """Validate scraped email items and export complete ones to crawled_emails.csv.

    Items missing title, author, or email are dropped.
    """

    def __init__(self):
        # Start from a clean file on every run.
        if os.path.exists('crawled_emails.csv'):
            os.remove('crawled_emails.csv')
        self.file = open("crawled_emails.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Guard clauses replace the original nested-if pyramid; same drops.
        if not item.get('title'):
            raise DropItem('Missing Title')
        if not item.get('author'):
            raise DropItem('Missing author')
        if not item.get('email'):
            raise DropItem('Missing email')
        # BUG FIX: the original reopened crawled_emails.csv in 'w+b' here for
        # every valid item, truncating the file the exporter was writing to
        # and leaking the previous handle. The exporter already owns the file.
        self.exporter.export_item(item)
        return item
class AqiCsvPipeline(object):
    """Export AQI items to aqi.csv via Scrapy's CsvItemExporter."""

    def open_spider(self, spider):
        # File object backing the CSV export.
        # BUG FIX: opened in binary mode — CsvItemExporter serializes to
        # bytes, so a text-mode ('w') file fails on Python 3. This also
        # matches every other pipeline in this file ('wb' / 'w+b').
        self.f = open('aqi.csv', 'wb')
        # Create the CSV exporter over the file.
        self.csv_exporter = CsvItemExporter(self.f)
        # Begin the export session (required by item exporters).
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # Write one item per call; pass the item through unchanged.
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # End the export session, then release the file.
        self.csv_exporter.finish_exporting()
        self.f.close()
class CsvExportPipeline(object):
    """Export each spider's items to '<spider name>_jobs.csv'."""

    def __init__(self):
        # One open file per spider, keyed by the spider instance.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        # FIX: connect signals through crawler.signals instead of the
        # deprecated pydispatch `dispatcher.connect` in __init__ — this is
        # the supported Scrapy API and matches every other pipeline in
        # this file.
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_jobs.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CrawlerPipeline(object):
    """Append crawled page metrics to a per-spider CSV file under $HOME."""

    # NOTE(review): os.getenv("HOME") can be None (e.g. on Windows or a bare
    # environment), which would make path joining fail — confirm the
    # deployment always sets HOME.
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        # Open files keyed by spider_id so several spiders can run at once.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # FIX: build the path with os.path.join instead of manual "/"
        # concatenation.
        path = os.path.join(CrawlerPipeline.EXPORT_PATH,
                            spider.spider_id + "_export.csv")
        # "ab" creates the file when missing and appends when it exists, so
        # the original os.path.isfile() branch was redundant (and racy).
        export_file = open(path, "ab")
        self.files[spider.spider_id] = export_file
        self.exporter = CsvItemExporter(export_file)
        self.exporter.fields_to_export = [
            "item_id",
            "url",
            "num_links",
            "num_images",
            "num_scripts",
            "num_styles",
            "headers",
            "text",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        export_file = self.files.pop(spider.spider_id)
        export_file.close()

    def process_item(self, item, spider):
        # This is a common path among ALL crawlers
        self.exporter.export_item(item)
        return item
class CSVExportPipeline(object):
    """Export items to a per-spider CSV (named by spider.nameOfFile) with a
    spider-specific column order."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        # Each spider exports a different item layout.
        # BUG FIX: compare names with '==', not 'is' — identity comparison
        # against a string literal depends on interpreter interning and can
        # silently match neither branch, leaving the default column order.
        if spider.name == 'fleetintel_list':
            self.exporter.fields_to_export = [
                'Company', 'Model', 'MSN', 'YoM', 'Reg', 'Comments']
        elif spider.name == 'Available_assets':
            self.exporter.fields_to_export = [
                'Category', 'Company', 'Contact_webPage', 'Contact_email',
                'Contact_phone', 'Model', 'YoM', 'MSN', 'TFHs_TFCs',
                'Engines', 'F_B_E', 'OL_A_S', 'LU', 'AD', 'ESN', 'L_E_S']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class BigMLPipeline(BigMLAPIMixIn):
    """Buffer exported items in a temporary CSV and upload it to BigML on close."""

    AUTH_ERRMSG = (
        "{errtype:s} BigML credentials. Please supply BIGML_USERNAME"
        " and BIGML_API_KEY as either Scrapy settings or environment"
        " variables."
    )

    def __init__(self, username=None, api_key=None, source_name=None,
                 dev_mode=None):
        self.source_name = source_name
        # Credential resolution/validation lives in the mixin.
        self.get_bigml_api(username, api_key, dev_mode=dev_mode)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        pipeline = cls(
            username=settings["BIGML_USERNAME"],
            api_key=settings["BIGML_API_KEY"],
            source_name=settings.get("BIGML_SOURCE_NAME", "Scrapy"),
            dev_mode=settings.getbool("BIGML_DEVMODE", False),
        )
        pipeline.crawler = crawler
        pipeline.settings = settings
        return pipeline

    def open_spider(self, spider):
        # Stage all rows in an anonymous temp file; uploaded wholesale later.
        self.tempfile = TemporaryFile(prefix="bigml-feed-")
        self.exporter = CsvItemExporter(self.tempfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        # Rewind so the upload reads the whole buffer from the start.
        self.tempfile.seek(0)
        self.export_to_bigml(self.tempfile, self.source_name)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class HarvestmanPipeline(object):
    """Pipeline for spiders in the harvestman_spider project.

    Exports google_serp_spider items to a tab-separated file; items from any
    other spider pass through untouched.
    """

    def __init__(self):
        """__init__, innit."""
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Output path template is filled with the domain and today's date.
        csv_file = settings.CSV_FILE_OUTPUT_DIR.format(
            spider.base_url.split('/')[2],
            datetime.date.today().strftime('%Y-%m-%d'))
        if spider.name == 'google_serp_spider':
            # BUG FIX: open in binary mode — CsvItemExporter writes bytes,
            # so a text-mode file breaks on Python 3 (the other pipelines
            # in this file already use 'wb' / 'w+b').
            file = open(csv_file, 'wb')
            self.files[spider] = file
            # note this outputs as a tab seperated csv, rather than comma.
            self.exporter = CsvItemExporter(file, delimiter='\t')
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export listings: 'realestate' spider items go to current_listing.csv,
    all other spiders' items go to past_listing.csv."""

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # BUG FIX: the original used `spider.name in 'realestate'`, which is
        # a substring test — a spider named 'real', 'estate', or even ''
        # would match. Equality is the intended check.
        if spider.name == 'realestate':
            self.file = open('current_listing.csv', 'w+b')
        else:
            self.file = open('past_listing.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """Route each recognized spider's items to a CSV file chosen by its
    matching processor; unrecognized spiders are ignored."""

    spiders_to_processors = None

    def __init__(self):
        self.files = {}
        self.exporter = None
        # Map spider class name -> processor class providing the output path.
        self.spiders_to_processors = {
            teams.TeamsSpider.__name__: TeamProcessor,
            team_season.TeamSeasonSpider.__name__: TeamSeasonProcessor,
            players.PlayersSpider.__name__: PlayerProcessor,
            player_season.PlayerSeasonSpider.__name__: PlayerSeasonProcessor,
        }

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file for a recognized spider; otherwise disable export."""
        processor_cls = self.spiders_to_processors.get(type(spider).__name__)
        if processor_cls is None:
            self.exporter = None
            return
        output_file = open(processor_cls().get_storage_filepath(spider), "w+b")
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file when the spider is done."""
        if self.exporter:
            self.exporter.finish_exporting()
            self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write each yielded item when this spider has an active exporter."""
        if self.exporter:
            self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """Export Librivox book items to a master CSV plus per-title text files,
    with an optional CSV-to-Excel conversion helper."""

    # Column order for the master CSV and the names of the per-title files.
    fields_to_export = [
        'Title', 'Author', 'AuthorLifetime', 'TotalLength', 'Language',
        'Genre', 'Readers', 'NumberOfReaders', 'WikipediaLink',
        'AuthorWikipediaLink', 'CatalogedOnDate', 'DescriptionText',
        'LibrivoxUrlOfTitle', 'LinksToAll128kMp3Files', 'HasCoverArt',
        'HasCdInsertArt'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        FILES_STORE = settings.FILES_STORE
        self.file = open(FILES_STORE + 'Librivox-Book-List.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self.fields_to_export
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        FILES_STORE = settings.FILES_STORE
        # Sanitize the title into a directory name legal on Windows.
        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
        title_dir = item['Title']
        for each_char in invalid_chars:
            title_dir = title_dir.replace(each_char, '-')
        if not os.path.exists(FILES_STORE + title_dir):
            os.makedirs(FILES_STORE + title_dir)
        # write txt files: one file per exported field, under the title dir.
        for each_file in self.fields_to_export:
            txt_file = FILES_STORE + title_dir + '/' + each_file + '.txt'
            with open(txt_file, 'w') as outfile:
                outfile.write(item[each_file])
        return item

    def convert_csv_to_excel(self, csv_file, excel_file):
        """Copy csv_file cell-by-cell into a new xlsx workbook at excel_file."""
        workbook = Workbook(excel_file)
        worksheet = workbook.add_worksheet()
        # BUG FIX: csv.reader requires a text-mode file with newline='' on
        # Python 3; the original opened in 'rb', which raises on the first row.
        with open(csv_file, 'r', newline='') as f:
            reader = csv.reader(f)
            for r, row in enumerate(reader):
                for c, col in enumerate(row):
                    worksheet.write(r, c, col)
        workbook.close()