Example #1
    def proxy_ip_pool(self):
        """
        Xunlian (迅联) proxy error code 10000: fetching too fast, wait at least 5 seconds between extractions.
        """
        if "DRAGONFLY" == self.proxy_agent:
            return CommonClass.get_proxies(proxy_dict={})
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time,
                self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict
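
The method above caches one proxy IP and only fetches a replacement when the cached entry is missing or within about 3 seconds of its randomized expiry. Below is a minimal standalone sketch of the same cache-and-expire pattern; fetch_new_proxy() is a hypothetical stand-in for ProxyAgent.get_xunlian_proxy_dict(), and the lifetime bounds are made up.

import random
import time

_cached_proxy = {}

def fetch_new_proxy():
    # hypothetical stand-in for ProxyAgent.get_xunlian_proxy_dict(); returns {} on failure
    return {"http": "http://1.2.3.4:8888", "https": "http://1.2.3.4:8888"}

def get_proxy(min_life=60, max_life=180):
    global _cached_proxy
    now = time.time()
    # reuse the cached proxy while it still has more than ~3 seconds of life left
    if _cached_proxy and _cached_proxy.get("expire", 0) > now + 3:
        return _cached_proxy
    fresh = fetch_new_proxy()
    if not fresh:
        return _cached_proxy  # keep the stale entry (or {}) rather than returning nothing
    fresh["expire"] = now + random.randint(min_life, max_life)
    _cached_proxy = fresh
    return _cached_proxy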
Example #2
File: cnemc1.py  Project: zouyaoji/Crawls
	def start_requests(self):
		self.init_self_attributes()
		self.make_dirs()

		if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug
			url = 'http://quotes.toscrape.com/page/1/'
			yield scrapy.Request( url = url, callback = self.read_and_parse )
		elif "PRODUCTION_RUN" == self.run_purpose:
			urls = [
				# "http://www.cnemc.cn/sssj/", # 中国环境监测总局,实时数据页面
				self.base_url,
			]
			meta_dict = {}
			if self.use_proxy:
				proxies_dict = self.proxy_ip_pool()
				if 1 > len( proxies_dict):
					sys.exit(3)
				meta_dict["proxy"] = proxies_dict["http"]
			
			formdata_dict = {} # no form fields need to be POSTed to the target site
			for url in urls:
				# yield scrapy.RequestForm( url = url, callback = self.parse_json, meta = meta_dict, dont_filter = True )
				# yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
				self.last_request_time = time.time()
				yield scrapy.FormRequest( url = url, formdata = formdata_dict, callback = self.parse_json, meta = meta_dict, dont_filter = True )
		elif "CHECK_PROXY_IP" == self.run_purpose:
			now = int(time.time())
			token = f"Guangzhou{str(now)}"
			m = hashlib.md5()  
			m.update( token.encode(encoding = 'utf-8') )
			urls = [
				f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
			]
			
			if "DRAGONFLY" == self.proxy_agent:
				proxies_dict = CommonClass.get_proxies( proxy_dict = {} )
			else:
				proxies_dict = ProxyAgent.get_xunlian_proxy_dict(headers = {}, params_for_proxy_ip={}, setup_xunlian_dict = {}, need_setup_xunlian = False, logger=self.logger )
			if 0 < len( proxies_dict):
				meta_dict = {
					"proxy": proxies_dict["http"]
				}
				for url in urls:
					yield scrapy.Request( url=url, callback=self.do_nothing_for_debug, meta = meta_dict )
			else:
				self.logger.error( f"Error! No proxy ip returns. {proxies_dict}" )
		else:
			urls = [
				"http://quotes.toscrape.com/page/1/",
				"http://quotes.toscrape.com/page/2/",
			]
			for url in urls:
				yield scrapy.Request( url=url, callback=self.do_nothing_for_debug )
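
The CHECK_PROXY_IP branch builds its probe URL by hashing "Guangzhou" plus the current unix timestamp. A minimal sketch of just that token construction; the check host www.coursehelper.site is copied from the example above, and whether it still responds is not verified here.

import hashlib
import time

now = int(time.time())
token = hashlib.md5(f"Guangzhou{now}".encode("utf-8")).hexdigest()
check_url = f"https://www.coursehelper.site/index/index/getHeaders?token={token}"
print(check_url)  # request this through the proxy and inspect the echoed headers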
Example #3
    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            urls = [
                # Guangzhou
                "https://land.3fang.com/market/440100__1______1_1_1.html",  # residential land: 26 pages
                "https://land.3fang.com/market/440100__2______1_1_1.html",  # commercial/office land: 17 pages
                "https://land.3fang.com/market/440100__3_2__0_100000__1_1_1.html",  # industrial land, sold, under 100,000 sqm: 32 pages
                "https://land.3fang.com/market/440100__3_2__100000_500000__1_1_1.html",  # industrial land, sold, 100,000-500,000 sqm: 4 pages
                "https://land.3fang.com/market/440100__3_2__500000_100000000__1_1_1.html",  # industrial land, sold, over 500,000 sqm: 1 page
                "https://land.3fang.com/market/440100__3_1_____1_1_1.html",  # industrial land, not yet sold: 1 page
                "https://land.3fang.com/market/440100__3_3_____1_1_1.html",  # industrial land, auction failed: 7 pages
                "https://land.3fang.com/market/440100__4______1_1_1.html",  # other land: 4 pages

                # Foshan
                "https://land.3fang.com/market/440600__1_1_____1_1_1.html",  # residential land, not yet sold: 8 pages
                "https://land.3fang.com/market/440600__1_2__0_5000__1_1_1.html",  # residential land, sold, under 5,000 sqm: 33 pages
                "https://land.3fang.com/market/440600__1_2__5000_100000__1_1_1.html",  # residential land, sold, 5,000-100,000 sqm: 29 pages
                "https://land.3fang.com/market/440600__1_2__100000_100000000__1_1_1.html",  # residential land, sold, over 100,000 sqm: 6 pages
                "https://land.3fang.com/market/440600__1_3_____1_1_1.html",  # residential land, auction failed: 3 pages
                "https://land.3fang.com/market/440600__2______1_1_1.html",  # commercial land: 19 pages
                "https://land.3fang.com/market/440600__3_1_____1_1_1.html",  # industrial land, not yet sold: 6 pages
                "https://land.3fang.com/market/440600__3_2__0_40000__1_1_1.html",  # industrial land, sold, under 40,000 sqm: 32 pages
                "https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html",  # industrial land, sold, over 40,000 sqm: 12 pages
                "https://land.3fang.com/market/440600__3_3_____1_1_1.html",  # industrial land, auction failed: 1 page
                "https://land.3fang.com/market/440600__4______1_1_1.html",  # other land: 3 pages
            ]

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            cookie_dict = dict([
                pair.split("=", 1) for pair in self.cookie_string.split("; ")
            ])
            self.cookie_dict = cookie_dict
            for url in urls:
                url_object = parse.urlparse(url)
                path_list = url_object.path.split("/")
                for one in path_list:
                    if -1 == one.find(".html"):
                        continue
                    city_name = ""
                    city_code_list = one.split("_")
                    city_code = int(
                        city_code_list[0]) if 0 < len(city_code_list) else 0
                    if 0 < city_code and str(
                            city_code) in self.city_name_dict.keys():
                        city_name = self.city_name_dict[str(city_code)]
                    if 1 > len(city_name):
                        error_msg = f"{city_code} is NOT in self.city_name_dict.keys() ({self.city_name_dict.keys()})"
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                        )
                        sys.exit(4)
                    break
                meta_dict["city"] = city_name
                # cookie_dict = self.change_cookies( cookie_dict )
                yield scrapy.Request(url=url,
                                     cookies=cookie_dict,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
                # yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
        elif "READ_CSV_AND_REDO" == self.run_purpose:
            english_city_name = {
                "佛山": "foshan",
                "广州": "guangzhou",
            }
            filename = "tudi_201808.csv"
            csv_file_path = os.path.join(self.crawled_dir, filename)
            url_list = []
            city_list = []
            try:
                with open(csv_file_path, newline="",
                          encoding="utf-8") as csvfile:
                    file_reader = csv.reader(
                        csvfile)  # , delimiter=' ', quotechar='|'
                    for row in file_reader:
                        if -1 < row[8].find("https:"):
                            url_list.append(row[8])
                            city_list.append(row[13])
            except Exception as ex:
                error_msg = f"cannot read csv file, Exception = {ex}"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )

            meta_dict = {
                "page_type": "detailed",
                "total_pages": 1,
            }
            self.cookie_dict = dict([
                pair.split("=", 1) for pair in self.cookie_string.split("; ")
            ])
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                meta_dict["proxy"] = proxies_dict["http"]

            for index, url in enumerate(url_list):
                chinese_city_name = city_list[index]
                meta_dict["city"] = english_city_name[chinese_city_name]
                yield scrapy.Request(url=url,
                                     cookies=self.cookie_dict,
                                     callback=self.parse_detailed_page,
                                     meta=meta_dict,
                                     dont_filter=True)
                break
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)
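
The PRODUCTION_RUN branch above derives the city from the administrative code at the start of each list-page filename (440100 is Guangzhou, 440600 is Foshan). A standalone sketch of that lookup follows; city_name_dict here holds illustrative values, while the real mapping lives on the spider as self.city_name_dict.

from urllib import parse

city_name_dict = {"440100": "guangzhou", "440600": "foshan"}  # illustrative values

def city_from_url(url):
    for part in parse.urlparse(url).path.split("/"):
        if ".html" not in part:
            continue
        city_code = part.split("_")[0]  # e.g. "440100" from "440100__1______1_1_1.html"
        return city_name_dict.get(city_code, "")
    return ""

print(city_from_url("https://land.3fang.com/market/440100__1______1_1_1.html"))  # guangzhou

The cookie handling is a plain key=value split on the configured cookie string: dict(pair.split("=", 1) for pair in cookie_string.split("; ")).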
Example #4
    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            if "city" == self.city_name_for_districts:
                city_list = self.city_list
            else:
                city_list = self.district_list
            number_day_of_this_year = datetime.datetime.now().timetuple(
            ).tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(city_list):
                seperate_into_days = len(city_list)
            batch_count = math.ceil(len(city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
            urls = []
            for index, city in enumerate(city_list):
                if (start_index < index) and (index < end_index):
                    url = f"https://{city}.esf.fang.com/" if "city" == self.city_name_for_districts else f"https://{self.city_name_for_districts}.esf.fang.com/house-{city}/"
                    urls.append(url)

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if "city" != self.city_name_for_districts:
                meta_dict["index_level"] = 1

            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0],
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)
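
The PRODUCTION_RUN branch above spreads the city list over CRAWL_BATCHES days using the day of the year. A worked standalone sketch of the slicing arithmetic, with illustrative values (the real list and batch count come from the spider's settings):

import datetime
import math

city_list = ["gz", "fs", "dg", "zh", "zs", "st", "zq"]  # 7 illustrative entries
crawl_batches = 3
day_of_year = datetime.datetime.now().timetuple().tm_yday

batch_count = math.ceil(len(city_list) / crawl_batches)  # 3 cities per batch
today_batch = day_of_year % crawl_batches                # 0, 1 or 2
start_index = today_batch * batch_count - 1
end_index = (today_batch + 1) * batch_count

# start_index < index < end_index keeps indices today_batch*batch_count .. end_index-1,
# so each city is visited roughly once every crawl_batches days
todays_cities = [c for i, c in enumerate(city_list) if start_index < i < end_index]
print(today_batch, todays_cities)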
Example #5
    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            urls = [
                # Only Guangzhou has the "阳光家缘" (Yangguang Jiayuan) housing project disclosure system
                "http://zfcj.gz.gov.cn/data/Laho/ProjectSearch.aspx",
            ]

            meta_dict = {
                "page_type": "index",
                "page": 1,
                "total_pages": 468,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            for url in urls:
                # yield scrapy.Request( url = url, cookies=cookie_dict, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0],
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)
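
In every branch above the proxy is attached per request through meta["proxy"], which Scrapy's built-in HttpProxyMiddleware reads when downloading. A minimal self-contained sketch of that mechanism; the spider name, proxy address, and target URL are placeholders, not the project's real values.

import scrapy

class ProxyMetaSketchSpider(scrapy.Spider):
    name = "proxy_meta_sketch"

    def start_requests(self):
        meta_dict = {"proxy": "http://127.0.0.1:8888"}  # e.g. proxies_dict["http"] from the pool
        yield scrapy.Request(url="http://quotes.toscrape.com/page/1/",
                             callback=self.parse,
                             meta=meta_dict,
                             dont_filter=True)

    def parse(self, response):
        self.logger.info(f"fetched {response.url} via {response.meta.get('proxy')}")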