def get(self):
    """Crawl the listing page and e-mail every entry not seen before.

    Reads the persisted ``history_list`` from a local PickleShareDB, sends an
    e-mail for each new anchor title, records successfully-sent titles back
    into the history, and always closes the SMTP connection.

    Exits the process with status -1 when no SMTP connection is available.
    """
    url = "https://www.cdfangxie.com/Infor/type/typeid/36.html"
    # NOTE(review): verify=False disables TLS certificate checking — confirm intentional.
    html = requests.get(url, verify=False, headers=getHeader())
    bs = BeautifulSoup(html.text, "html.parser")
    tables = bs.find(class_='right_cont')
    tables_a = tables.find_all("a")
    db = PickleShareDB('data')
    history_list = db.get("history_list", [])
    smtpObj = self.get_smtp()
    if not smtpObj:
        logger.error("smtp error")
        exit(-1)
    try:
        for row in tables_a:
            try:
                title = row["title"]
                # Use a distinct name: the original clobbered the outer `url`.
                link = "https://www.cdfangxie.com" + row["href"]
            except KeyError:
                # Anchor lacks a title/href attribute — skip it, but log instead
                # of the old bare `except: pass` that hid every failure.
                logger.warning("skip anchor missing title/href: %s", row)
                continue
            if title in history_list:
                logger.info("Crawl:%s result:pass", title)
            else:
                logger.info("Crawl:%s result:send_email", title)
                if self.send_email(smtpObj, title, link):
                    logger.info("send_email success")
                    history_list.append(title)
                else:
                    logger.info("send_email error")
    finally:
        # Persist progress and close SMTP even if a send raises mid-loop;
        # the original only did this on a fully clean pass.
        db["history_list"] = history_list
        smtpObj.quit()
def test_stress(tmpdir):
    """Hammer a PickleShareDB: interleave writes, deletes and hash counters."""
    db = PickleShareDB(tmpdir)
    import time, sys
    for outer in range(100):
        # Rounds 0, 15, 30, 45, 60 only delete; everything else writes.
        wipe_round = outer % 15 == 0 and outer < 70
        for inner in range(500):
            key = str(inner)
            if wipe_round:
                if key in db:
                    del db[key]
                continue
            if inner % 33 == 0:
                time.sleep(0.02)
            db[key] = db.get(key, []) + [(outer, inner, "proc %d" % os.getpid())]
            db.hset('hash', inner, db.hget('hash', inner, 15) + 1)
        print(outer, end=' ')
        sys.stdout.flush()
        if outer % 10 == 0:
            db.uncache()
def test_stress(tmpdir):
    """Stress-exercise PickleShareDB with mixed write/delete/hash traffic."""
    db = PickleShareDB(tmpdir)
    import time, sys
    for i in range(100):
        for j in range(500):
            skey = str(j)
            # Periodic wipe passes (i in {0,15,30,45,60}) purge keys instead of writing.
            if i % 15 == 0 and i < 70:
                if skey in db:
                    del db[skey]
            else:
                if j % 33 == 0:
                    time.sleep(0.02)
                history = db.get(skey, [])
                history = history + [(i, j, "proc %d" % os.getpid())]
                db[skey] = history
                db.hset('hash', j, db.hget('hash', j, 15) + 1)
        print(i, end=' ')
        sys.stdout.flush()
        if i % 10 == 0:
            db.uncache()