Example #1
    def run(self):

        values = configdata.get(DetailSpiderConst.DetailStatusSettings, {})
        values[const.DETAIL_LIST] = self.cis
        values[const.CONFIG_DATA] = self.configdata
        
        if ScrapyConst.Console in values:
            if values[ScrapyConst.Console] == u'1':  # log to console
                values[ScrapyConst.LOG_FILE] = None
            else:
                log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
                if not os.path.exists(log_dir):
                    os.makedirs(log_dir)
                if ScrapyConst.LOG_FILE in values:
                    log_file = values[ScrapyConst.LOG_FILE]
                    values[ScrapyConst.LOG_FILE] = os.path.join(log_dir, log_file)
                    
        settings_path = u'crawler.shc.fe.settings'
        # a non-empty fromlist makes __import__ return the submodule itself
        module_import = __import__(settings_path, {}, {}, [''])
        settings = CrawlerSettings(module_import, values=values)
        execute(argv=["scrapy", "crawl", "CarStatusSpider"], settings=settings)
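Examples 1 through 5 all repeat the same console/log-file switch. A minimal sketch of that logic pulled out into a standalone helper; the name resolve_log_file and its argument list are illustrative assumptions, not part of the original code:

import os

def resolve_log_file(values, console_key, file_key, dir_key):
    # Mirror of the console/LOG_FILE switch used by the runners in these examples.
    if console_key not in values:
        return
    if values[console_key] == u'1':
        # '1' means log to the console, so no log file at all
        values[file_key] = None
    elif file_key in values:
        # otherwise anchor the log file under the log directory (default: cwd)
        log_dir = values.get(dir_key, os.getcwd())
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        values[file_key] = os.path.join(log_dir, values[file_key])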
Example #2
def fetch51anonymousfreeproxy():
    
    values = configdata.get(FetchProxySpiderConst.FetchFOAnonymousProxySettings, {})
    values[ScrapyConst.DOWNLOAD_TIMEOUT] = int(values.get(ScrapyConst.DOWNLOAD_TIMEOUT, 0))
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.path.join(log_dir, log_file)
                
    # no settings module: Scrapy defaults plus the values overrides
    settings = CrawlerSettings(None, values=values)
    execute(argv=["scrapy", "crawl", "FOAnonymousSpider"], settings=settings)
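Passing None as the first argument to CrawlerSettings leaves only Scrapy's built-in defaults, so everything project-specific has to arrive through values. A hypothetical entry point, assuming the module is meant to be run directly:

if __name__ == '__main__':
    fetch51anonymousfreeproxy()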
Example #3
    def run(self):
        feconfig = self.configdata[const.FE_CONFIG]
        try:
            # per-city override; fall back to the defaults on any failure
            city_config = eval(feconfig[self.city_name])
        except Exception:
            city_config = {}
        
        start_page = city_config.get(const.START_PAGE,
                                     feconfig[const.DEFAULT_START_PAGE])
        end_page = city_config.get(const.END_PAGE,
                                   feconfig[const.DEFAULT_END_PAGE])

        values = configdata.get(ListSpiderConst.ListSettings, {})
        
        values.update({
            const.CONFIG_DATA: self.configdata,
            const.START_PAGE: int(start_page),
            const.END_PAGE: int(end_page),
        })
        
        if ScrapyConst.Console in values:
            if values[ScrapyConst.Console] == u'1':  # log to console
                values[ScrapyConst.LOG_FILE] = None
            else:
                log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
                if ScrapyConst.LOG_FILE in values:
                    log_file = values[ScrapyConst.LOG_FILE]
                    values[ScrapyConst.LOG_FILE] = os.path.join(log_dir, log_file)
                    
        settings_path = u'crawler.shc.fe.settings'
        module_import = __import__(settings_path, {}, {}, [''])
        settings = CrawlerSettings(module_import, values=values)
        execute(argv=["scrapy", "crawl", "SHCSpider"], settings=settings)
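Note that eval on the per-city config string will execute arbitrary Python from the config file. If those entries are plain literals (a dict of page numbers here), ast.literal_eval is a safer stand-in; a sketch, where parse_city_config is a hypothetical helper name:

import ast

def parse_city_config(feconfig, city_name):
    # literal_eval accepts only Python literals (no names, no calls),
    # so a malformed or malicious entry raises instead of executing.
    try:
        return ast.literal_eval(feconfig[city_name])
    except (KeyError, ValueError, SyntaxError):
        return {}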
Example #4
    def run(self):
        values = configdata.get(const.vpsettings, {})
        values[AppConst.proxies] = self.proxies
        values[const.DOWNLOAD_TIMEOUT] = int(values.get(const.DOWNLOAD_TIMEOUT, 5))
        if const.Console in values:
            if values[const.Console] == u'1':  # log to console
                values[const.LOG_FILE] = None
            else:
                log_dir = values.get(const.LOG_DIR, os.getcwd())
                if const.LOG_FILE in values:
                    # timestamp prefix keeps each run's log file distinct
                    logfile_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
                    log_file = '%s_%s' % (logfile_prefix, values[const.LOG_FILE])
                    values[const.LOG_FILE] = os.path.join(log_dir, log_file)

        # valid_urls is expected at module level in the original source
        values[const.RETRY_TIMES] = len(valid_urls)
        settings_path = u'vp.settings'
        module_import = __import__(settings_path, {}, {}, [''])
        settings = CrawlerSettings(module_import, values=values)
        execute(argv=["scrapy", "crawl", "SOSOSpider"], settings=settings)
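The strftime prefix encodes date, time, and microseconds, so concurrent or repeated runs each get a distinct log file. For reference (the file name 'soso.log' is a placeholder):

import datetime

prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
# prefix looks like '20240311_143052_123456'
log_file = '%s_%s' % (prefix, 'soso.log')  # -> '20240311_143052_123456_soso.log'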
Example #5
    def run(self):
        if self.proxies:
            values = configdata.get(const.vpsettings, {})
            values[AppConst.proxies] = self.proxies
            values[const.DOWNLOAD_TIMEOUT] = int(
                values.get(const.DOWNLOAD_TIMEOUT, 5))
            if const.Console in values:
                if values[const.Console] == u'1':  # log to console
                    values[const.LOG_FILE] = None
                else:
                    log_dir = values.get(const.LOG_DIR, os.getcwd())
                    if const.LOG_FILE in values:
                        logfile_prefix = datetime.datetime.now().strftime(
                            "%Y%m%d_%H%M%S_%f")
                        log_file = '%s_%s' % (logfile_prefix,
                                              values[const.LOG_FILE])
                        values[const.LOG_FILE] = os.path.join(log_dir, log_file)

            settings = CrawlerSettings(None, values=values)
            execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)
Example #6
def run():
    appconfig = configdata.get(AppConst.app_config, {})
    frequence = appconfig.get(AppConst.app_config_frequence, 1800)
    frequence = int(frequence)
    volume_per_time = appconfig.get(AppConst.volumepertime, 1000)
    volume_per_time = int(volume_per_time)
    ps = []
    while True:
        proxy_ids = []
        proxies = get_proxies(d=datetime.date.today())
        print u'got %s proxies' % len(proxies)
        for proxy in proxies:
            proxy_ids.append(proxy)
            if len(proxy_ids) == volume_per_time:
                p = ValidProcess(proxy_ids)
                ps.append(p)
                print u'%s %s start %s' % (datetime.datetime.now(), p.name,
                                           len(proxy_ids))
                p.start()
                proxy_ids = []
        else:
            # loop-else: no break above, so this always runs; flush the remainder
            if proxy_ids:
                p = ValidProcess(proxy_ids)
                ps.append(p)
                print u'%s %s start %s' % (datetime.datetime.now(), p.name,
                                           len(proxy_ids))
                p.start()
                proxy_ids = []

        print u'%s validating proxies .. sleep %s seconds' % (
            datetime.datetime.now(), frequence)
        time.sleep(frequence)
        while ps:
            p = ps.pop()
            try:
                p.terminate()
                print(u'%s terminate one process %s' %
                      (datetime.datetime.now(), p.name))
            except Exception:
                pass
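The else clause on the for loop above is Python's loop-else: it runs after the loop finishes without hitting a break, which is what flushes the final, partially filled batch (and since this loop never breaks, it always runs). The same batching idiom in isolation, as a generator sketch with the assumed name batches:

def batches(items, size):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    else:
        # loop completed without break: flush whatever is left over
        if batch:
            yield batch

# list(batches(range(5), 2)) -> [[0, 1], [2, 3], [4]]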
Example #7
def fetch51freeproxy():
    values = configdata.get(const.vpsettings, {})
    settings = CrawlerSettings(values=values)
    execute(argv=["scrapy", "crawl", "FOSpider"], settings=settings)
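CrawlerSettings and execute(..., settings=...) belong to the pre-1.0 Scrapy API and were removed in later releases. A hedged sketch of the closest modern equivalent (Scrapy >= 1.0), assuming FOSpider is discoverable through the SPIDER_MODULES setting:

from scrapy.crawler import CrawlerProcess

values = configdata.get(const.vpsettings, {})
process = CrawlerProcess(settings=values)  # a plain dict of overrides is accepted
process.crawl('FOSpider')                  # name resolved by the spider loader
process.start()                            # blocks until the crawl finishes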