def proxy_ip_pool(self):
    """
    Xunlian error code 10000 means requests are too frequent:
    request a new proxy IP at most once every 5 seconds.
    """
    if "DRAGONFLY" == self.proxy_agent:
        return CommonClass.get_proxies(proxy_dict={})
    now = time.time()
    need_new_proxy = False
    if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
        need_new_proxy = True
    elif "expire" not in self.proxy_ip_dict.keys():
        need_new_proxy = True
    elif now + 3 > self.proxy_ip_dict["expire"]:
        need_new_proxy = True
    if need_new_proxy:
        proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
            headers={},
            params_for_proxy_ip={},
            setup_xunlian_dict={},
            need_setup_xunlian=False,
            logger=self.logger)
        if 1 > len(proxies_dict):
            return self.proxy_ip_dict  # still return the old ip dict or {}
        # set the proxy ip lifetime
        proxies_dict["expire"] = now + random.randint(
            self.min_proxy_ip_life_time, self.max_proxy_ip_life_time)
        self.proxy_ip_dict = proxies_dict
    return self.proxy_ip_dict
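# proxy_ip_pool caches one proxy IP and refreshes it shortly before its
# "expire" timestamp. A minimal self-contained sketch of that TTL-cache
# pattern, assuming hypothetical lifetime bounds and a placeholder
# fetch_proxy() in place of ProxyAgent.get_xunlian_proxy_dict():
import random
import time


class ProxyPoolSketch:
    def __init__(self, min_life=30, max_life=60):
        self.min_life = min_life  # assumed lifetime bounds, in seconds
        self.max_life = max_life
        self.cached = {}

    def fetch_proxy(self):
        # placeholder for the real proxy-agent call
        return {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}

    def get(self):
        now = time.time()
        # refresh when empty or within 3 seconds of expiry, mirroring the
        # "now + 3 > expire" check above
        if not self.cached or now + 3 > self.cached.get("expire", 0):
            fresh = self.fetch_proxy()
            if fresh:
                fresh["expire"] = now + random.randint(self.min_life, self.max_life)
                self.cached = fresh
        return self.cached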
def process_request(self, request, spider):
    if self.proxy_meta is None or not isinstance(self.proxy_meta, dict) or 1 > len(self.proxy_meta):
        self.proxy_meta = CommonClass.get_proxies(proxy_dict={})
    if request.url.startswith("http://"):
        request.meta['proxy'] = self.proxy_meta['http']
    elif request.url.startswith("https://"):
        request.meta['proxy'] = self.proxy_meta['https']
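# process_request is a Scrapy downloader-middleware hook and only runs once
# the middleware is registered in settings.py. A sketch of that registration;
# the module path and class name are assumptions, not taken from this project:
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ProxyMiddleware": 543,  # hypothetical import path
}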
def start_requests(self):
    self.init_self_attributes()
    self.make_dirs()
    if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
        url = 'http://quotes.toscrape.com/page/1/'
        yield scrapy.Request(url=url, callback=self.read_and_parse)
    elif "PRODUCTION_RUN" == self.run_purpose:
        urls = [
            # "http://www.cnemc.cn/sssj/",  # China National Environmental Monitoring Centre, real-time data page
            self.base_url,
        ]
        meta_dict = {}
        if self.use_proxy:
            proxies_dict = self.proxy_ip_pool()
            if 1 > len(proxies_dict):
                sys.exit(3)
            meta_dict["proxy"] = proxies_dict["http"]
        formdata_dict = {}  # no form fields need to be POSTed to the target site
        for url in urls:
            # yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
            self.last_request_time = time.time()
            yield scrapy.FormRequest(url=url, formdata=formdata_dict, callback=self.parse_json, meta=meta_dict, dont_filter=True)
    elif "CHECK_PROXY_IP" == self.run_purpose:
        now = int(time.time())
        token = f"Guangzhou{now}"
        m = hashlib.md5()
        m.update(token.encode(encoding='utf-8'))
        urls = [
            f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
        ]
        if "DRAGONFLY" == self.proxy_agent:
            proxies_dict = CommonClass.get_proxies(proxy_dict={})
        else:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={}, params_for_proxy_ip={}, setup_xunlian_dict={},
                need_setup_xunlian=False, logger=self.logger)
        if 0 < len(proxies_dict):
            meta_dict = {"proxy": proxies_dict["http"]}
            for url in urls:
                yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict)
        else:
            self.logger.error(f"Error! No proxy ip returned. {proxies_dict}")
    else:
        urls = [
            "http://quotes.toscrape.com/page/1/",
            "http://quotes.toscrape.com/page/2/",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)
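# The CHECK_PROXY_IP branch builds its probe URL from an md5 of "Guangzhou"
# plus the current unix timestamp. A standalone sketch of that token
# computation, using the same echo endpoint already hard-coded above:
import hashlib
import time


def build_check_url():
    token = f"Guangzhou{int(time.time())}"
    digest = hashlib.md5(token.encode("utf-8")).hexdigest()
    return f"https://www.coursehelper.site/index/index/getHeaders?token={digest}"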
def init_self_attributes(self):
    self.run_purpose = self.settings.get(name='RUN_PURPOSE', default=None)

    # set all paths
    self.root_path = self.settings.get('PROJECT_PATH')
    self.crawled_folder_name = self.settings.get(name='CRAWLED_DIR', default='crawled')
    self.detail_html_folder_name = self.settings.get(name='SAVED_DETAIL_HTML', default='detail_html')
    self.list_html_folder_name = self.settings.get(name='SAVED_LIST_HTML', default='list_html')
    self.svg_text_css_folder_name = self.settings.get(name='SVG_TEXT_CSS', default='svgtextcss')
    if self.run_purpose in ["PARSE_FIDDLER", "PARSE_DETAILED_HOTEL", ]:
        self.detail_html_folder_name = f"{self.detail_html_folder_name}_fiddler"
        self.list_html_folder_name = f"{self.list_html_folder_name}_fiddler"
        self.svg_text_css_folder_name = f"{self.svg_text_css_folder_name}_fiddler"

    # whether this run is for debugging
    self.debug = self.settings.get(name='PROJECT_DEBUG', default=False)
    self.move_fiddler_file = self.settings.get(name='MOVE_FIDDLER_HTML_FILE', default=True)

    # get proxy header
    temp = CommonClass.get_proxies(proxy_dict={})
    self.proxy_meta = temp['http']

    self.database_city_district_table = self.settings.get(name='DATABASE_CITY_DISTRICT_TABLE', default={})
    self.database_level2name_table = self.settings.get(name='DATABASE_LEVEL2NAME_TABLE', default={})
    self.database_merchant_star_level_table = self.settings.get(name='DATABASE_MERCHANT_STAR_LEVEL_TABLE', default={})
    self.database_anticrawl20190505_table = self.settings.get(name='DATABASE_ANTICRAWL20190505_TABLE', default={})
    self.database_common_channel_list_table = self.settings.get(name='DATABASE_COMMON_CHANNEL_LIST_TABLE', default=[])
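# start_requests calls self.make_dirs(), which is not shown in this section.
# A minimal sketch of what such a helper could look like, assuming it joins
# the folder names read in init_self_attributes under root_path (an
# assumption, not the original implementation):
import os


def make_dirs(self):
    for folder_name in (self.crawled_folder_name,
                        self.detail_html_folder_name,
                        self.list_html_folder_name,
                        self.svg_text_css_folder_name):
        # exist_ok avoids an error when the directory already exists
        os.makedirs(os.path.join(self.root_path, folder_name), exist_ok=True)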
def __init__(self, root_path="", css_file="", css_string="", send_requests=False,
             referer=None, save_requested_svg=True, csv_file=None, settings=None,
             folder="", logger=None):
    # read fiddler; guard every settings access so a None settings object
    # does not crash the constructor
    self.settings = settings
    temp = self.settings.get(name="RUN_PURPOSE", default=None) if self.settings is not None else None
    self.read_fiddler = False
    if "PARSE_FIDDLER" == temp:
        self.read_fiddler = True
    self.root_path = os.getcwd() if root_path is None or 1 > len(root_path) else root_path
    self.folder = "list_html" if folder is None or 1 > len(folder) else folder
    self.spider_name = self.settings.get("SPIDER_NAME") if self.settings is not None else ""
    self.svg_css_folder_name = self.settings.get("SVG_TEXT_CSS") if self.settings is not None else ""
    if self.read_fiddler:
        self.svg_css_folder_name = f"{self.svg_css_folder_name}_fiddler"
    self.css_file = "" if css_file is None or 1 > len(css_file) else css_file
    if 0 < len(self.css_file):
        self.css_file_path = os.path.join(self.root_path, self.spider_name,
                                          self.svg_css_folder_name, self.css_file)
    self.css_string = "" if css_string is None or 1 > len(css_string) else css_string
    self.send_requests = False if send_requests is None else send_requests
    self.referer = None if referer is None or 1 > len(referer) else referer
    self.save_requested_svg = True if save_requested_svg is None else save_requested_svg
    self.csv_file = "" if csv_file is None or 1 > len(csv_file) else csv_file
    self.logger = logger
    if self.logger is None:
        print("please pass the logger!")
        sys.exit(2)
    self.use_proxy = bool(self.settings.get("HTTPPROXY_ENABLED")) if self.settings is not None else False
    proxy_dict = CommonClass.get_proxies(proxy_dict={})
    self.proxies = proxy_dict['http']
    self.svg_files = {}
    self.svg_urls = {}
    self.svg_file_dict = {}
    self.svg_file_contents = {}
    self.payload = {}
    self.class_mapping = {}
    self.class_mapping_updated = False
    self.key_length = 0
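# The constructor above repeats the pattern "fall back to a default when the
# argument is None or empty". A small hypothetical helper (not in the
# original code) that expresses the same rule once:
def value_or_default(value, default):
    """Return value unless it is None or has zero length."""
    if value is None:
        return default
    try:
        if len(value) < 1:
            return default
    except TypeError:  # value has no len(); keep it as-is
        pass
    return value

# usage sketch:
# self.folder = value_or_default(folder, "list_html")
# self.css_file = value_or_default(css_file, "")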
def start_requests(self):
    self.init_self_attributes()
    self.make_dirs()
    self.read_crawled_urls()
    if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
        url = 'http://quotes.toscrape.com/page/1/'
        yield scrapy.Request(url=url, callback=self.read_and_parse)
    elif "PRODUCTION_RUN" == self.run_purpose:
        urls = [
            # Guangzhou
            "https://land.3fang.com/market/440100__1______1_1_1.html",  # residential land: 26 pages
            "https://land.3fang.com/market/440100__2______1_1_1.html",  # commercial/office land: 17 pages
            "https://land.3fang.com/market/440100__3_2__0_100000__1_1_1.html",  # industrial land, sold, under 100k sqm: 32 pages
            "https://land.3fang.com/market/440100__3_2__100000_500000__1_1_1.html",  # industrial land, sold, 100k-500k sqm: 4 pages
            "https://land.3fang.com/market/440100__3_2__500000_100000000__1_1_1.html",  # industrial land, sold, over 500k sqm: 1 page
            "https://land.3fang.com/market/440100__3_1_____1_1_1.html",  # industrial land, unsold: 1 page
            "https://land.3fang.com/market/440100__3_3_____1_1_1.html",  # industrial land, auction failed: 7 pages
            "https://land.3fang.com/market/440100__4______1_1_1.html",  # other land: 4 pages
            # Foshan
            "https://land.3fang.com/market/440600__1_1_____1_1_1.html",  # residential land, unsold: 8 pages
            "https://land.3fang.com/market/440600__1_2__0_5000__1_1_1.html",  # residential land, sold, under 5k sqm: 33 pages
            "https://land.3fang.com/market/440600__1_2__5000_100000__1_1_1.html",  # residential land, sold, 5k-100k sqm: 29 pages
            "https://land.3fang.com/market/440600__1_2__100000_100000000__1_1_1.html",  # residential land, sold, over 100k sqm: 6 pages
            "https://land.3fang.com/market/440600__1_3_____1_1_1.html",  # residential land, auction failed: 3 pages
            "https://land.3fang.com/market/440600__2______1_1_1.html",  # commercial land: 19 pages
            "https://land.3fang.com/market/440600__3_1_____1_1_1.html",  # industrial land, unsold: 6 pages
            "https://land.3fang.com/market/440600__3_2__0_40000__1_1_1.html",  # industrial land, sold, under 40k sqm: 32 pages
            "https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html",  # industrial land, sold, over 40k sqm: 12 pages
            "https://land.3fang.com/market/440600__3_3_____1_1_1.html",  # industrial land, auction failed: 1 page
            "https://land.3fang.com/market/440600__4______1_1_1.html",  # other land: 3 pages
        ]
        meta_dict = {
            "page_type": "index",
            "total_pages": 0,
            "index_level": 0,
        }
        if self.use_proxy:
            proxies_dict = self.proxy_ip_pool()
            if 1 > len(proxies_dict):
                sys.exit(3)
            meta_dict["proxy"] = proxies_dict["http"]
        cookie_dict = dict([pair.split("=", 1) for pair in self.cookie_string.split("; ")])
        self.cookie_dict = cookie_dict
        for url in urls:
            url_object = parse.urlparse(url)
            path_list = url_object.path.split("/")
            for one in path_list:
                if -1 == one.find(".html"):
                    continue
                city_name = ""
                city_code_list = one.split("_")
                city_code = int(city_code_list[0]) if 0 < len(city_code_list) else 0
                if 0 < city_code and str(city_code) in self.city_name_dict.keys():
                    city_name = self.city_name_dict[str(city_code)]
                if 1 > len(city_name):
                    error_msg = f"{city_code} is NOT in self.city_name_dict.keys() ({self.city_name_dict.keys()})"
                    self.logger.error(f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
                    sys.exit(4)
                break
            meta_dict["city"] = city_name
            # cookie_dict = self.change_cookies( cookie_dict )
            yield scrapy.Request(url=url, cookies=cookie_dict, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
    elif "READ_CSV_AND_REDO" == self.run_purpose:
        english_city_name = {
            "佛山": "foshan",
            "广州": "guangzhou",
        }
        filename = "tudi_201808.csv"
        csv_file_path = os.path.join(self.crawled_dir, filename)
        url_list = []
        city_list = []
        try:
            with open(csv_file_path, newline="", encoding="utf-8") as csvfile:
                file_reader = csv.reader(csvfile)  # , delimiter=' ', quotechar='|'
                for row in file_reader:
                    if -1 < row[8].find("https:"):
                        url_list.append(row[8])
                        city_list.append(row[13])
        except Exception as ex:
            error_msg = f"cannot read csv file, Exception = {ex}"
            self.logger.error(f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        meta_dict = {
            "page_type": "detailed",
            "total_pages": 1,
        }
        self.cookie_dict = dict([pair.split("=", 1) for pair in self.cookie_string.split("; ")])
        if self.use_proxy:
            proxies_dict = self.proxy_ip_pool()
            meta_dict["proxy"] = proxies_dict["http"]
        for index, url in enumerate(url_list):
            chinese_city_name = city_list[index]
            meta_dict["city"] = english_city_name[chinese_city_name]
            yield scrapy.Request(url=url, cookies=self.cookie_dict, callback=self.parse_detailed_page, meta=meta_dict, dont_filter=True)
            break  # debug: only request the first URL
    elif "CHECK_PROXY_IP" == self.run_purpose:
        now = int(time.time())
        token = f"Guangzhou{now}"
        m = hashlib.md5()
        m.update(token.encode(encoding='utf-8'))
        urls = [
            f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
        ]
        if "DRAGONFLY" == self.proxy_agent:
            proxies_dict = CommonClass.get_proxies(proxy_dict={})
        else:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={}, params_for_proxy_ip={}, setup_xunlian_dict={},
                need_setup_xunlian=False, logger=self.logger)
        if 0 < len(proxies_dict):
            meta_dict = {"proxy": proxies_dict["http"]}
            for url in urls:
                yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict)
        else:
            self.logger.error(f"Error! No proxy ip returned. {proxies_dict}")
    else:
        urls = [
            "http://quotes.toscrape.com/page/1/",
            "http://quotes.toscrape.com/page/2/",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)
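# Both branches above turn a raw "k1=v1; k2=v2" cookie header into a dict via
# split("; ") and split("=", 1). A standalone sketch of that parsing:
def parse_cookie_string(cookie_string):
    """Split a Cookie header into a {name: value} dict.

    split("=", 1) keeps '=' characters inside cookie values intact.
    """
    return dict(pair.split("=", 1) for pair in cookie_string.split("; "))

# usage sketch:
# parse_cookie_string("sid=abc123; token=x=y")
# -> {"sid": "abc123", "token": "x=y"}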
def start_requests(self):
    self.init_self_attributes()
    self.make_dirs()
    self.read_crawled_urls()
    if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
        url = 'http://quotes.toscrape.com/page/1/'
        yield scrapy.Request(url=url, callback=self.read_and_parse)
    elif "PRODUCTION_RUN" == self.run_purpose:
        if "city" == self.city_name_for_districts:
            city_list = self.city_list
        else:
            city_list = self.district_list
        number_day_of_this_year = datetime.datetime.now().timetuple().tm_yday  # type == int
        separate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
        if separate_into_days > len(city_list):
            separate_into_days = len(city_list)
        batch_count = math.ceil(len(city_list) / separate_into_days)
        today_batch = number_day_of_this_year % separate_into_days
        start_index = today_batch * batch_count - 1
        end_index = (today_batch + 1) * batch_count
        urls = []
        for index, city in enumerate(city_list):
            if (start_index < index) and (index < end_index):
                url = f"https://{city}.esf.fang.com/" if "city" == self.city_name_for_districts else f"https://{self.city_name_for_districts}.esf.fang.com/house-{city}/"
                urls.append(url)
        meta_dict = {
            "page_type": "index",
            "total_pages": 0,
            "index_level": 0,
        }
        if "city" != self.city_name_for_districts:
            meta_dict["index_level"] = 1
        if self.use_proxy:
            proxies_dict = self.proxy_ip_pool()
            if 1 > len(proxies_dict):
                sys.exit(3)
            meta_dict["proxy"] = proxies_dict["http"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
    elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
        urls = []
        city_list = self.settings.get("CITY_LIST", default=[])
        for city in city_list:
            urls.append(f"https://{city}.esf.fang.com/")
        if 0 < len(urls):
            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            yield scrapy.Request(url=urls[0], callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
    elif "CHECK_PROXY_IP" == self.run_purpose:
        now = int(time.time())
        token = f"Guangzhou{now}"
        m = hashlib.md5()
        m.update(token.encode(encoding='utf-8'))
        urls = [
            f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
        ]
        if "DRAGONFLY" == self.proxy_agent:
            proxies_dict = CommonClass.get_proxies(proxy_dict={})
        else:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={}, params_for_proxy_ip={}, setup_xunlian_dict={},
                need_setup_xunlian=False, logger=self.logger)
        if 0 < len(proxies_dict):
            meta_dict = {"proxy": proxies_dict["http"]}
            for url in urls:
                yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict)
        else:
            self.logger.error(f"Error! No proxy ip returned. {proxies_dict}")
    else:
        urls = [
            "http://quotes.toscrape.com/page/1/",
            "http://quotes.toscrape.com/page/2/",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)
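# PRODUCTION_RUN above shards city_list across several days: day-of-year mod
# batch count selects today's slice. A standalone sketch of that
# partitioning (the example list contents are made up for illustration):
import datetime
import math


def todays_batch(items, batches=3):
    batches = max(1, min(batches, len(items)))
    batch_size = math.ceil(len(items) / batches)
    day_of_year = datetime.datetime.now().timetuple().tm_yday
    batch_index = day_of_year % batches
    return items[batch_index * batch_size:(batch_index + 1) * batch_size]

# usage sketch: with 7 cities and 3 batches, each day crawls at most 3 of them
# todays_batch(["gz", "fs", "sz", "dg", "zh", "zs", "hz"], batches=3)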
def start_requests(self):
    self.init_self_attributes()
    self.make_dirs()
    self.read_crawled_urls()
    if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
        url = 'http://quotes.toscrape.com/page/1/'
        yield scrapy.Request(url=url, callback=self.read_and_parse)
    elif "PRODUCTION_RUN" == self.run_purpose:
        urls = [
            # only Guangzhou has the Yangguang Jiayuan (阳光家缘) system
            "http://zfcj.gz.gov.cn/data/Laho/ProjectSearch.aspx",
        ]
        meta_dict = {
            "page_type": "index",
            "page": 1,
            "total_pages": 468,
        }
        if self.use_proxy:
            proxies_dict = self.proxy_ip_pool()
            if 1 > len(proxies_dict):
                sys.exit(3)
            meta_dict["proxy"] = proxies_dict["http"]
        for url in urls:
            # yield scrapy.Request(url=url, cookies=cookie_dict, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
            yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
    elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
        urls = []
        city_list = self.settings.get("CITY_LIST", default=[])
        for city in city_list:
            urls.append(f"https://{city}.esf.fang.com/")
        if 0 < len(urls):
            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            yield scrapy.Request(url=urls[0], callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
    elif "CHECK_PROXY_IP" == self.run_purpose:
        now = int(time.time())
        token = f"Guangzhou{now}"
        m = hashlib.md5()
        m.update(token.encode(encoding='utf-8'))
        urls = [
            f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
        ]
        if "DRAGONFLY" == self.proxy_agent:
            proxies_dict = CommonClass.get_proxies(proxy_dict={})
        else:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={}, params_for_proxy_ip={}, setup_xunlian_dict={},
                need_setup_xunlian=False, logger=self.logger)
        if 0 < len(proxies_dict):
            meta_dict = {"proxy": proxies_dict["http"]}
            for url in urls:
                yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict)
        else:
            self.logger.error(f"Error! No proxy ip returned. {proxies_dict}")
    else:
        urls = [
            "http://quotes.toscrape.com/page/1/",
            "http://quotes.toscrape.com/page/2/",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)
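# The PRODUCTION_RUN branch above seeds request.meta with "page" and
# "total_pages" so the parse callback can drive pagination. A minimal sketch
# of how such a callback could advance the counter; parse_list_page's real
# body is not shown in this section, and the form fields below are
# hypothetical placeholders, not the site's actual parameters:
import scrapy


def parse_list_page_sketch(self, response):
    page = response.meta.get("page", 1)
    total_pages = response.meta.get("total_pages", 1)
    # ... extract items from response here ...
    if page < total_pages:
        next_meta = dict(response.meta, page=page + 1)
        yield scrapy.FormRequest(
            url=response.url,
            formdata={"page": str(page + 1)},  # hypothetical form field
            callback=self.parse_list_page,
            meta=next_meta,
            dont_filter=True,
        )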