示例#1
0
 def spider_opened(self, spider):
     """Open a timestamped CSV file and start the item exporter on it.

     The target directory comes from the ``EXPORT_DIR`` setting, falling
     back to the current directory. The file name is the current time
     shifted six hours ahead of UTC, followed by ``.csv``.

     Args:
         spider: The spider that was opened; not used by this method.

     Raises:
         Whatever ``open`` raises when the file cannot be created, or
         whatever the exporter raises while being set up (the file is
         closed again in that case).
     """
     export_dir = settings.get('EXPORT_DIR', '.')
     # gmtime() of "now + 6h" gives the UTC+6 wall-clock time that the
     # literal "GMT+6" suffix in the file name advertises.
     t = time.strftime('%Y-%m-%d %H-%M-%S GMT+6', time.gmtime(time.time() + 6 * 3600))
     path = os.path.join(export_dir, '%s.csv' % t)
     csv_file = open(path, 'w+b')
     try:
         exporter = CSVRealstateItemExporter(csv_file)
         exporter.start_exporting()
     except Exception:
         # Do not leak the handle when the exporter cannot be set up.
         csv_file.close()
         raise
     # Publish the state only once everything succeeded.
     self.file = csv_file
     self.exporter = exporter
示例#2
0
class RealstateMonthlyPipeline(object):
    """Pipeline that keeps one timestamped CSV export file open per crawl.

    ``from_crawler`` wires ``spider_opened`` / ``spider_closed`` to the
    crawler's signals of the same name. Opening creates the CSV file and a
    ``CSVRealstateItemExporter`` on top of it; closing finishes the export
    and closes the file.
    """

    # Home directory from the environment. The methods below do not read it
    # (they consult the ``EXPORT_PATH`` *setting* instead); it is kept
    # because it is a public class attribute.
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        # Both stay ``None`` until ``spider_opened`` succeeds, so that
        # ``spider_closed`` can tell whether there is anything to clean up.
        self.files = None
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the crawler's signals.

        Args:
            crawler: Object exposing ``signals.connect(receiver, signal)``.

        Returns:
            The newly created pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        NOTE(review): the item is not handed to ``self.exporter`` here, so
        nothing in this class writes items to the CSV file - confirm whether
        ``export_item`` is meant to be called.
        """
        return item

    def spider_opened(self, spider):
        """Open a timestamped CSV file and start the exporter on it.

        The directory comes from the ``EXPORT_PATH`` setting (default: the
        current directory). The file name is the current time shifted six
        hours ahead of UTC, followed by ``.csv``.

        Args:
            spider: The spider that was opened; not used by this method.
        """
        export_dir = settings.get("EXPORT_PATH", ".")
        # gmtime() of "now + 6h" gives the UTC+6 wall-clock time that the
        # literal "GMT+6" suffix in the file name advertises.
        t = time.strftime("%Y-%m-%d %H-%M-%S GMT+6", time.gmtime(time.time() + 6 * 3600))
        path = os.path.join(export_dir, "%s.csv" % t)
        handle = open(path, "w+b")
        try:
            exporter = CSVRealstateItemExporter(handle)
            exporter.start_exporting()
        except Exception:
            # Do not leak the handle when the exporter cannot be set up.
            handle.close()
            raise
        self.files = handle
        self.exporter = exporter

    def spider_closed(self, spider):
        """Finish the export and close the CSV file.

        Does nothing if ``spider_opened`` never completed. The file is
        closed even when ``finish_exporting`` raises.

        Args:
            spider: The spider that was closed; not used by this method.
        """
        handle, exporter = self.files, self.exporter
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if handle is not None:
                handle.close()
示例#3
0
class RealstateMonthlyPipeline(object):
    """Pipeline that writes to one timestamped CSV export file per crawl.

    The instance subscribes to the crawler's ``spider_opened`` and
    ``spider_closed`` signals (see ``from_crawler``). On open it creates the
    CSV file and wraps it in a ``CSVRealstateItemExporter``; on close it
    finishes the export and releases the file.
    """

    # Value of the HOME environment variable. Not used by the active
    # methods, which read the ``EXPORT_PATH`` *setting*; retained as part of
    # the class's public attributes.
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        # ``None`` means "nothing opened yet"; set by ``spider_opened``.
        self.files = None
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build a pipeline and hook it up to ``crawler.signals``.

        Args:
            crawler: Object exposing ``signals.connect(receiver, signal)``.

        Returns:
            The connected pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        """Pass ``item`` through untouched.

        NOTE(review): no ``export_item`` call is made here, so this class
        never writes items to the CSV file itself - confirm that is
        intended.
        """
        return item

    def spider_opened(self, spider):
        """Create the CSV file for this crawl and start exporting.

        The output directory is the ``EXPORT_PATH`` setting, defaulting to
        the current directory; the file is named after the current UTC+6
        time.

        Args:
            spider: The spider that was opened; not used by this method.
        """
        export_dir = settings.get('EXPORT_PATH', '.')
        # Shifting the epoch by six hours before gmtime() yields the UTC+6
        # wall-clock time matching the "GMT+6" suffix.
        t = time.strftime('%Y-%m-%d %H-%M-%S GMT+6', time.gmtime(time.time() + 6 * 3600))
        path = os.path.join(export_dir, '%s.csv' % t)
        handle = open(path, 'w+b')
        try:
            exporter = CSVRealstateItemExporter(handle)
            exporter.start_exporting()
        except Exception:
            # Exporter setup failed: close the file instead of leaking it.
            handle.close()
            raise
        self.files = handle
        self.exporter = exporter

    def spider_closed(self, spider):
        """Finish exporting and close the CSV file.

        A no-op when nothing was opened. The file is closed even if
        ``finish_exporting`` raises.

        Args:
            spider: The spider that was closed; not used by this method.
        """
        handle, exporter = self.files, self.exporter
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if handle is not None:
                handle.close()
示例#4
0
class RealstateMonthlyPipeline(object):
    """Pipeline that keeps one timestamped CSV export file open per crawl.

    ``from_crawler`` connects ``spider_opened`` / ``spider_closed`` to the
    crawler's signals. Opening creates the CSV file plus a
    ``CSVRealstateItemExporter`` on it; closing finishes the export and
    closes the file.
    """

    # Class-level defaults so ``spider_closed`` is safe even when
    # ``spider_opened`` never ran or failed part-way through.
    file = None
    exporter = None

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        NOTE(review): the item is not passed to ``self.exporter`` here, so
        this class itself never writes items to the CSV - confirm whether
        ``export_item`` should be called.
        """
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the crawler's signals.

        Args:
            crawler: Object exposing ``signals.connect(receiver, signal)``.

        Returns:
            The newly created pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open a timestamped CSV file and start the exporter on it.

        The directory comes from the ``EXPORT_DIR`` setting (default: the
        current directory). The file name is the current time shifted six
        hours ahead of UTC, followed by ``.csv``.

        Args:
            spider: The spider that was opened; not used by this method.
        """
        export_dir = settings.get('EXPORT_DIR', '.')
        # gmtime() of "now + 6h" gives the UTC+6 wall-clock time that the
        # literal "GMT+6" suffix in the file name advertises.
        t = time.strftime('%Y-%m-%d %H-%M-%S GMT+6',
                          time.gmtime(time.time() + 6 * 3600))
        path = os.path.join(export_dir, '%s.csv' % t)
        csv_file = open(path, 'w+b')
        try:
            exporter = CSVRealstateItemExporter(csv_file)
            exporter.start_exporting()
        except Exception:
            # Do not leak the handle when the exporter cannot be set up.
            csv_file.close()
            raise
        self.file = csv_file
        self.exporter = exporter

    def spider_closed(self, spider):
        """Finish the export and close the CSV file.

        Does nothing if ``spider_opened`` never completed. The file is
        closed even when ``finish_exporting`` raises.

        Args:
            spider: The spider that was closed; not used by this method.
        """
        csv_file, exporter = self.file, self.exporter
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if csv_file is not None:
                csv_file.close()
示例#5
0
 def spider_opened(self, spider):
     """Open a timestamped CSV file and start the item exporter on it.

     The target directory comes from the ``EXPORT_PATH`` setting, falling
     back to the current directory. The file name is the current time
     shifted six hours ahead of UTC, followed by ``.csv``.

     Args:
         spider: The spider that was opened; not used by this method.

     Raises:
         Whatever ``open`` raises when the file cannot be created, or
         whatever the exporter raises while being set up (the file is
         closed again in that case).
     """
     export_dir = settings.get("EXPORT_PATH", ".")
     # gmtime() of "now + 6h" gives the UTC+6 wall-clock time that the
     # literal "GMT+6" suffix in the file name advertises.
     t = time.strftime("%Y-%m-%d %H-%M-%S GMT+6", time.gmtime(time.time() + 6 * 3600))
     path = os.path.join(export_dir, "%s.csv" % t)
     handle = open(path, "w+b")
     try:
         exporter = CSVRealstateItemExporter(handle)
         exporter.start_exporting()
     except Exception:
         # Do not leak the handle when the exporter cannot be set up.
         handle.close()
         raise
     # Publish the state only once everything succeeded.
     self.files = handle
     self.exporter = exporter
class RealstateMonthlyPipeline(object):
    """Pipeline that writes to one timestamped CSV export file per crawl.

    The instance subscribes to the crawler's ``spider_opened`` and
    ``spider_closed`` signals (see ``from_crawler``). On open it creates the
    CSV file and wraps it in a ``CSVRealstateItemExporter``; on close it
    finishes the export and releases the file.
    """

    # Defaults used until ``spider_opened`` succeeds; they let
    # ``spider_closed`` detect that there is nothing to clean up.
    file = None
    exporter = None

    def process_item(self, item, spider):
        """Pass ``item`` through untouched.

        NOTE(review): no ``export_item`` call is made here, so this class
        never writes items to the CSV file itself - confirm that is
        intended.
        """
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Build a pipeline and hook it up to ``crawler.signals``.

        Args:
            crawler: Object exposing ``signals.connect(receiver, signal)``.

        Returns:
            The connected pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Create the CSV file for this crawl and start exporting.

        The output directory is the ``EXPORT_DIR`` setting, defaulting to
        the current directory; the file is named after the current UTC+6
        time.

        Args:
            spider: The spider that was opened; not used by this method.
        """
        export_dir = settings.get('EXPORT_DIR', '.')
        # Shifting the epoch by six hours before gmtime() yields the UTC+6
        # wall-clock time matching the "GMT+6" suffix.
        t = time.strftime('%Y-%m-%d %H-%M-%S GMT+6', time.gmtime(time.time() + 6 * 3600))
        path = os.path.join(export_dir, '%s.csv' % t)
        csv_file = open(path, 'w+b')
        try:
            exporter = CSVRealstateItemExporter(csv_file)
            exporter.start_exporting()
        except Exception:
            # Exporter setup failed: close the file instead of leaking it.
            csv_file.close()
            raise
        self.file = csv_file
        self.exporter = exporter

    def spider_closed(self, spider):
        """Finish exporting and close the CSV file.

        A no-op when nothing was opened. The file is closed even if
        ``finish_exporting`` raises.

        Args:
            spider: The spider that was closed; not used by this method.
        """
        csv_file, exporter = self.file, self.exporter
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if csv_file is not None:
                csv_file.close()
示例#7
0
class RealstateMonthlyPipeline(object):
    """Pipeline that keeps one timestamped CSV export file open per crawl.

    ``from_crawler`` connects ``spider_opened`` / ``spider_closed`` to the
    crawler's signals. Opening creates the CSV file plus a
    ``CSVRealstateItemExporter`` on it; closing finishes the export and
    closes the file.
    """

    # Class-level defaults so ``spider_closed`` is safe even when
    # ``spider_opened`` never ran or failed part-way through.
    files = None
    exporter = None

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        NOTE(review): the item is not passed to ``self.exporter`` here, so
        this class itself never writes items to the CSV - confirm whether
        ``export_item`` should be called.
        """
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the crawler's signals.

        Args:
            crawler: Object exposing ``signals.connect(receiver, signal)``.

        Returns:
            The newly created pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open a timestamped CSV file and start the exporter on it.

        The directory comes from the ``EXPORT_PATH`` setting (default: the
        current directory). The file name is the current time shifted six
        hours ahead of UTC, followed by ``.csv``. The open handle is stored
        in ``self.files``, which is what ``spider_closed`` closes.

        NOTE(review): this class previously carried a second, shadowed
        definition of this method that read the ``EXPORT_DIR`` setting
        instead - confirm which key the project actually configures.

        Args:
            spider: The spider that was opened; not used by this method.
        """
        export_dir = settings.get('EXPORT_PATH', '.')
        # gmtime() of "now + 6h" gives the UTC+6 wall-clock time that the
        # literal "GMT+6" suffix in the file name advertises.
        t = time.strftime('%Y-%m-%d %H-%M-%S GMT+6', time.gmtime(time.time() + 6 * 3600))
        path = os.path.join(export_dir, '%s.csv' % t)
        handle = open(path, 'w+b')
        try:
            exporter = CSVRealstateItemExporter(handle)
            # Started exactly once per opened file.
            exporter.start_exporting()
        except Exception:
            # Do not leak the handle when the exporter cannot be set up.
            handle.close()
            raise
        self.files = handle
        self.exporter = exporter

    def spider_closed(self, spider):
        """Finish the export and close the CSV file held in ``self.files``.

        Does nothing if ``spider_opened`` never completed. The file is
        closed even when ``finish_exporting`` raises.

        Args:
            spider: The spider that was closed; not used by this method.
        """
        handle, exporter = self.files, self.exporter
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if handle is not None:
                handle.close()
class RealstateMonthlyPipeline(object):
    """Pipeline that writes to one timestamped CSV export file per crawl.

    The instance subscribes to the crawler's ``spider_opened`` and
    ``spider_closed`` signals (see ``from_crawler``). On open it creates the
    CSV file and wraps it in a ``CSVRealstateItemExporter``; on close it
    finishes the export and releases the file.
    """

    # Value of the HOME environment variable. Not used by the active
    # methods, which read the ``EXPORT_PATH`` *setting*; retained as part of
    # the class's public attributes.
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        # ``None`` means "nothing opened yet"; set by ``spider_opened``.
        self.files = None
        self.exporter = None

    def process_item(self, item, spider):
        """Pass ``item`` through untouched.

        NOTE(review): no ``export_item`` call is made here, so this class
        never writes items to the CSV file itself - confirm that is
        intended.
        """
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Build a pipeline and hook it up to ``crawler.signals``.

        Args:
            crawler: Object exposing ``signals.connect(receiver, signal)``.

        Returns:
            The connected pipeline instance.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Create the CSV file for this crawl and start exporting.

        The output directory is the ``EXPORT_PATH`` setting, defaulting to
        the current directory; the file is named after the current UTC+6
        time.

        Args:
            spider: The spider that was opened; not used by this method.
        """
        export_dir = settings.get('EXPORT_PATH', '.')
        # Shifting the epoch by six hours before gmtime() yields the UTC+6
        # wall-clock time matching the "GMT+6" suffix.
        t = time.strftime('%Y-%m-%d %H-%M-%S GMT+6', time.gmtime(time.time() + 6 * 3600))
        path = os.path.join(export_dir, '%s.csv' % t)
        handle = open(path, 'w+b')
        try:
            exporter = CSVRealstateItemExporter(handle)
            # Started exactly once per opened file.
            exporter.start_exporting()
        except Exception:
            # Exporter setup failed: close the file instead of leaking it.
            handle.close()
            raise
        self.files = handle
        self.exporter = exporter

    def spider_closed(self, spider):
        """Finish exporting and close the CSV file.

        A no-op when nothing was opened. The file is closed even if
        ``finish_exporting`` raises.

        Args:
            spider: The spider that was closed; not used by this method.
        """
        handle, exporter = self.files, self.exporter
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            if handle is not None:
                handle.close()