def get_urls(self):
    """Collect per-book detail URLs from the ranking start page.

    Fetches ``self.start_url``, iterates the ranking table rows
    (dropping the header row), and enqueues each book's detail link
    onto ``chuangShiNovelUrlQueue`` for downstream workers.
    """
    page = get_element(targetUrl=self.start_url,
                       workLogger=novellogger,
                       headers={"User-Agent": user_agent()})
    # the first <tr> is the table header, so skip it
    for row in page.xpath('.//tbody[@id="rankList"]//tr')[1:]:
        hrefs = row.xpath(".//a[@target='_blank']/@href")
        if hrefs:
            chuangShiNovelUrlQueue.put(hrefs[0])
def _check_and_save():
    """Consume scraped records from ``qiDianNovelDataQueue`` and persist new ones.

    For each record, a fingerprint of (name, author, link) is added to the
    Redis set ``novel_data_finger``. Duplicates are logged and dropped;
    unseen records are enriched from the book's detail page (word count,
    intro, canonical link) and inserted into MongoDB.
    """
    while True:
        data_dict = qiDianNovelDataQueue.get()
        data_finger = data_fingerprint(data_dict["name"],
                                       data_dict["author"],
                                       data_dict["link"])
        # sadd returns 1 when the fingerprint is new, 0 when already seen
        added = novel_redis_client.sadd("novel_data_finger", data_finger)
        if added == 0:
            novellogger.info(
                "data repeat which finger with :{}".format(data_finger))
        if added == 1:
            request_headers = {"User-Agent": user_agent(),
                               "Connection": "close"}
            detail_element = get_element(targetUrl=data_dict["link"],
                                         workLogger=novellogger,
                                         headers=request_headers)
            word_count = detail_element.xpath(
                './/div[@class="book-info "]//p[3]/em[1]/text()')
            intro_parts = detail_element.xpath(
                '//div[@class="book-content-wrap cf"]//div[@class="book-intro"]//text()'
            )
            jump_link = detail_element.xpath(
                '//a[@class="red-btn J-getJumpUrl "]/@href')
            data_dict["source"] = "起点中文网"
            # NOTE(review): assumes the scraped word-count text is a bare
            # number parsable by float() — confirm against the live page
            data_dict["wordCount"] = float(
                word_count[0]) if word_count else None
            data_dict["link"] = "https:" + jump_link[0] if jump_link else None
            data_dict["intro"] = "".join(
                intro_parts).strip() if intro_parts else None
            novel_mongo_clinet.insert_one(data_dict)
            novellogger.info("save data :{}".format(data_dict))
        # acknowledge the item whether it was saved or a duplicate, so
        # queue.join() can complete
        qiDianNovelDataQueue.task_done()
def parse_list_urls(self):
    """Continuously drain ``qiDianNovelUrlQueue`` and parse each page.

    Delegates to ``parse_target_urls`` (from customTools.respDownloader),
    which deduplicates URLs via the Redis set ``novel_url_finger`` and
    pushes parsed page elements onto ``qiDianNovelElementQueue``.
    """
    while True:
        # rotate the User-Agent on every iteration to vary request identity
        request_headers = {"User-Agent": user_agent(),
                           "Connection": "close"}
        parse_target_urls(urlQueue=qiDianNovelUrlQueue,
                          elementQueue=qiDianNovelElementQueue,
                          headers=request_headers,
                          workLogger=novellogger,
                          redisClient=novel_redis_client,
                          redisKey="novel_url_finger")
def __init__(self):
    """Configure the paged QiDian recommendation-rank URL template and headers."""
    # ``page={}`` is a format placeholder filled in elsewhere per page number
    self.start_url = ("https://www.qidian.com/rank/recom"
                      "?dateType=2&chn=9&page={}")
    self.headers = {"User-Agent": user_agent()}
def __init__(self):
    """Configure the NetEase (music.163.com) hot-playlist listing URL and headers."""
    # cat=%E5%85%A8%E9%83%A8 is the URL-encoded category "全部" (all)
    self.start_url = ("http://music.163.com/discover/playlist/"
                      "?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=0")
    self.headers = {"User-Agent": user_agent()}