def getTopic(loop, url):
    """Crawl one Tieba topic page and store the OCR result of its first image.

    This is a generator-based coroutine (it uses ``yield from``), so it must
    be driven by an event loop or by another coroutine.

    The page is fetched with ``wget`` and parsed with BeautifulSoup.  Every
    element accepted by ``checkTopic`` contributes its stripped text to the
    post list, and the ``src`` of the first ``BDE_Image`` element that has one
    is remembered.  Only when such an image exists and more than six posts
    were collected is the topic processed further: the image URL is handed to
    ``getImgWords`` and, if the JSON reply carries a non-empty ``retData``
    list, a ``WeaponChangeTopic`` row is saved with ``comefrom="tieba"``.

    Args:
        loop: Event loop forwarded to ``orm.create_pool``.
        url: Topic URL.  A run of digits following a ``/`` becomes the topic
            id; ``0`` is used when the URL does not match.

    Returns:
        None.  The only result is the row that may be saved.
    """
    topicHtml = yield from wget(url)
    # Nothing to parse for an empty or very short response.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+/(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    topic = []
    imgUrl = None
    for content in soup.find_all(checkTopic):
        # Keep only the first image that actually has a ``src`` attribute.
        if imgUrl is None:
            img = content.find(class_="BDE_Image")
            if img is not None:
                imgUrl = img.get("src")
        topic.append(content.get_text().strip())

    if imgUrl is None or len(topic) <= 6:
        return

    titleTag = soup.title
    if titleTag is None:
        # Without a <title> there is no topic title to store; skip the page
        # instead of failing on ``None.string``.
        logging.warning('topic page has no <title>, skipped: %s', url)
        return
    topicTitle = titleTag.string

    m = re.match(r'^http.+/(.+)', imgUrl)
    if m is None:
        # re.match returns None for URLs that do not look like http.../name.
        logging.warning('unexpected image url %r in topic %s', imgUrl, url)
        return
    imgName = m.group(1)

    logging.info('=============%s==============', topicTitle)
    logging.info(url)

    yield from orm.create_pool(loop=loop, host=dbHost, user=dbUser,
                               password=dbPassword, db=dbName)
    num = yield from WeaponChangeTopic.findNumber('id', 'topicId=?', topicId)
    # NOTE(review): ``num is None`` presumably means no row exists yet for
    # this topic id -- confirm against WeaponChangeTopic.findNumber.
    if num is not None:
        return

    words = yield from getImgWords(imgUrl)
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if wordList else None
    except (ValueError, KeyError, TypeError):
        # Invalid JSON, a reply without 'retData', or entries of an
        # unexpected shape are all reported the same way.
        print("baidu ocr json error: %s" % words)
        return

    if weaponTitle is None:
        # The reply contained no recognised words; nothing to store.
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(
        comefrom="tieba",
        topicId=topicId,
        title=weaponTitle,
        details=json.dumps(wordList),
        topicTitle=topicTitle,
        topicList=json.dumps(topic),
        img=imgName)
    yield from weapon.save()
def getTopic(loop, url):
    """Crawl one Tieba topic page and store the OCR result of its first image.

    This is a generator-based coroutine (it uses ``yield from``), so it must
    be driven by an event loop or by another coroutine.

    The page is fetched with ``wget`` and parsed with BeautifulSoup.  Every
    element accepted by ``checkTopic`` contributes its stripped text to the
    post list, and the ``src`` of the first ``BDE_Image`` element that has one
    is remembered.  Only when such an image exists and more than six posts
    were collected is the topic processed further: the image URL is handed to
    ``getImgWords`` and, if the JSON reply carries a non-empty ``retData``
    list, a ``WeaponChangeTopic`` row is saved with ``comefrom="tieba"``.

    Args:
        loop: Event loop forwarded to ``orm.create_pool``.
        url: Topic URL.  A run of digits following a ``/`` becomes the topic
            id; ``0`` is used when the URL does not match.

    Returns:
        None.  The only result is the row that may be saved.
    """
    topicHtml = yield from wget(url)
    # Nothing to parse for an empty or very short response.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+/(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    topic = []
    imgUrl = None
    for content in soup.find_all(checkTopic):
        # Keep only the first image that actually has a ``src`` attribute.
        if imgUrl is None:
            img = content.find(class_="BDE_Image")
            if img is not None:
                imgUrl = img.get("src")
        topic.append(content.get_text().strip())

    if imgUrl is None or len(topic) <= 6:
        return

    titleTag = soup.title
    if titleTag is None:
        # Without a <title> there is no topic title to store; skip the page
        # instead of failing on ``None.string``.
        logging.warning('topic page has no <title>, skipped: %s', url)
        return
    topicTitle = titleTag.string

    m = re.match(r'^http.+/(.+)', imgUrl)
    if m is None:
        # re.match returns None for URLs that do not look like http.../name.
        logging.warning('unexpected image url %r in topic %s', imgUrl, url)
        return
    imgName = m.group(1)

    logging.info('=============%s==============', topicTitle)
    logging.info(url)

    yield from orm.create_pool(loop=loop, host=dbHost, user=dbUser,
                               password=dbPassword, db=dbName)
    num = yield from WeaponChangeTopic.findNumber('id', 'topicId=?', topicId)
    # NOTE(review): ``num is None`` presumably means no row exists yet for
    # this topic id -- confirm against WeaponChangeTopic.findNumber.
    if num is not None:
        return

    words = yield from getImgWords(imgUrl)
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if wordList else None
    except (ValueError, KeyError, TypeError):
        # Invalid JSON, a reply without 'retData', or entries of an
        # unexpected shape are all reported the same way.
        print("baidu ocr json error: %s" % words)
        return

    if weaponTitle is None:
        # The reply contained no recognised words; nothing to store.
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(
        comefrom="tieba",
        topicId=topicId,
        title=weaponTitle,
        details=json.dumps(wordList),
        topicTitle=topicTitle,
        topicList=json.dumps(topic),
        img=imgName)
    yield from weapon.save()
def getTopic(loop, url):
    """Crawl one bbs.d.163.com topic page and store its image's OCR result.

    This is a generator-based coroutine (it uses ``yield from``), so it must
    be driven by an event loop or by another coroutine.

    The topic page is fetched with ``wget`` using an iPhone User-Agent and
    parsed with BeautifulSoup.  The stripped text of every element accepted
    by ``checkTopic`` forms the post list.  The first link inside
    ``<ul class="img_one">`` leads to an image page, whose
    ``<img class="postalbum_i">`` element carries the image URL in its
    ``orig`` attribute.  When more than six posts were collected and an
    image URL was found, the image is handed to ``getImgWords`` and, if the
    JSON reply carries a non-empty ``retData`` list, a ``WeaponChangeTopic``
    row is saved with ``comefrom="163"``.  PNG images are skipped.

    Args:
        loop: Event loop forwarded to ``orm.create_pool``.
        url: Topic URL.  The digits after ``tid=`` become the topic id;
            ``0`` is used when the URL does not match.

    Returns:
        None.  The only result is the row that may be saved.
    """
    # Pretend to be a mobile browser on both requests.
    userAgent = ('Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) '
                 'AppleWebKit/600.1.3 (KHTML, like Gecko) '
                 'Version/8.0 Mobile/12A4345d Safari/600.1.4')

    topicHtml = yield from wget(url, {'User-Agent': userAgent})
    # Nothing to parse for an empty or very short response.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+tid=(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    # Text of every post on the page.
    topic = [content.get_text().strip()
             for content in soup.find_all(checkTopic)]
    if len(topic) <= 6:
        # Topics this short are never stored, so do not spend a second
        # request on the image page.
        return

    heading = soup.find("h2")
    if heading is None:
        # Without an <h2> there is no topic title to store; skip the page
        # instead of failing on ``None.get_text()``.
        logging.warning('topic page has no <h2> title, skipped: %s', url)
        return
    topicTitle = heading.get_text().replace(' ', '').strip()
    topicTitle = topicTitle.replace('只看楼主', '')

    # Link to the page that shows the image.
    imgA = soup.find("ul", class_="img_one")
    imgLink = imgA.find("a") if imgA is not None else None
    imgPageUrl = imgLink.get("href") if imgLink is not None else None
    if imgPageUrl is None:
        return

    # The real image address is only available on the image page.
    imgHtml = yield from wget('http://bbs.d.163.com/' + imgPageUrl,
                              {'User-Agent': userAgent})
    if not imgHtml or len(imgHtml) <= 100:
        return
    soup2 = BeautifulSoup(imgHtml, 'html.parser')
    imgTag = soup2.find("img", class_="postalbum_i")
    imgUrl = imgTag.get("orig") if imgTag is not None else None
    if imgUrl is None:
        return

    m = re.match(r'^http.+/(.+)\.(.+)', imgUrl)
    if m is None:
        # re.match returns None for URLs without a "name.ext" last segment.
        logging.warning('unexpected image url %r in topic %s', imgUrl, url)
        return
    imgType = m.group(2)
    imgName = m.group(1) + "." + imgType
    # Baidu OCR does not support png (compare case-insensitively).
    if imgType.lower() == "png":
        return

    yield from orm.create_pool(loop=loop, host=dbHost, user=dbUser,
                               password=dbPassword, db=dbName)
    num = yield from WeaponChangeTopic.findNumber('id', 'topicId=?', topicId)
    # NOTE(review): ``num is None`` presumably means no row exists yet for
    # this topic id -- confirm against WeaponChangeTopic.findNumber.
    if num is not None:
        return

    words = yield from getImgWords(imgUrl)
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if wordList else None
    except (ValueError, KeyError, TypeError):
        # Invalid JSON, a reply without 'retData', or entries of an
        # unexpected shape are all reported the same way.
        print("baidu ocr json error: %s" % words)
        return

    if weaponTitle is None:
        # The reply contained no recognised words; nothing to store.
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(
        comefrom="163",
        topicId=topicId,
        title=weaponTitle,
        details=json.dumps(wordList),
        topicTitle=topicTitle,
        topicList=json.dumps(topic),
        img=imgName)
    yield from weapon.save()
def getTopic(loop, url):
    """Crawl one bbs.d.163.com topic page and store its image's OCR result.

    This is a generator-based coroutine (it uses ``yield from``), so it must
    be driven by an event loop or by another coroutine.

    The topic page is fetched with ``wget`` using an iPhone User-Agent and
    parsed with BeautifulSoup.  The stripped text of every element accepted
    by ``checkTopic`` forms the post list.  The first link inside
    ``<ul class="img_one">`` leads to an image page, whose
    ``<img class="postalbum_i">`` element carries the image URL in its
    ``orig`` attribute.  When more than six posts were collected and an
    image URL was found, the image is handed to ``getImgWords`` and, if the
    JSON reply carries a non-empty ``retData`` list, a ``WeaponChangeTopic``
    row is saved with ``comefrom="163"``.  PNG images are skipped.

    Args:
        loop: Event loop forwarded to ``orm.create_pool``.
        url: Topic URL.  The digits after ``tid=`` become the topic id;
            ``0`` is used when the URL does not match.

    Returns:
        None.  The only result is the row that may be saved.
    """
    # Pretend to be a mobile browser on both requests.
    userAgent = ('Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) '
                 'AppleWebKit/600.1.3 (KHTML, like Gecko) '
                 'Version/8.0 Mobile/12A4345d Safari/600.1.4')

    topicHtml = yield from wget(url, {'User-Agent': userAgent})
    # Nothing to parse for an empty or very short response.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+tid=(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    # Text of every post on the page.
    topic = [content.get_text().strip()
             for content in soup.find_all(checkTopic)]
    if len(topic) <= 6:
        # Topics this short are never stored, so do not spend a second
        # request on the image page.
        return

    heading = soup.find("h2")
    if heading is None:
        # Without an <h2> there is no topic title to store; skip the page
        # instead of failing on ``None.get_text()``.
        logging.warning('topic page has no <h2> title, skipped: %s', url)
        return
    topicTitle = heading.get_text().replace(' ', '').strip()
    topicTitle = topicTitle.replace('只看楼主', '')

    # Link to the page that shows the image.
    imgA = soup.find("ul", class_="img_one")
    imgLink = imgA.find("a") if imgA is not None else None
    imgPageUrl = imgLink.get("href") if imgLink is not None else None
    if imgPageUrl is None:
        return

    # The real image address is only available on the image page.
    imgHtml = yield from wget('http://bbs.d.163.com/' + imgPageUrl,
                              {'User-Agent': userAgent})
    if not imgHtml or len(imgHtml) <= 100:
        return
    soup2 = BeautifulSoup(imgHtml, 'html.parser')
    imgTag = soup2.find("img", class_="postalbum_i")
    imgUrl = imgTag.get("orig") if imgTag is not None else None
    if imgUrl is None:
        return

    m = re.match(r'^http.+/(.+)\.(.+)', imgUrl)
    if m is None:
        # re.match returns None for URLs without a "name.ext" last segment.
        logging.warning('unexpected image url %r in topic %s', imgUrl, url)
        return
    imgType = m.group(2)
    imgName = m.group(1) + "." + imgType
    # Baidu OCR does not support png (compare case-insensitively).
    if imgType.lower() == "png":
        return

    yield from orm.create_pool(loop=loop, host=dbHost, user=dbUser,
                               password=dbPassword, db=dbName)
    num = yield from WeaponChangeTopic.findNumber('id', 'topicId=?', topicId)
    # NOTE(review): ``num is None`` presumably means no row exists yet for
    # this topic id -- confirm against WeaponChangeTopic.findNumber.
    if num is not None:
        return

    words = yield from getImgWords(imgUrl)
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if wordList else None
    except (ValueError, KeyError, TypeError):
        # Invalid JSON, a reply without 'retData', or entries of an
        # unexpected shape are all reported the same way.
        print("baidu ocr json error: %s" % words)
        return

    if weaponTitle is None:
        # The reply contained no recognised words; nothing to store.
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(
        comefrom="163",
        topicId=topicId,
        title=weaponTitle,
        details=json.dumps(wordList),
        topicTitle=topicTitle,
        topicList=json.dumps(topic),
        img=imgName)
    yield from weapon.save()