# Project-local helpers (MongoHelper, HttpHelper, NLPHelper, CryptHelper,
# UrlHelper, parseBlog, createPost, sendPage) and constants (MONGO_HOST,
# MONGO_DATABASE_NAME, HTML_ROOT_PATH, DOMAIN, CATEGORY) are assumed to be
# provided by the surrounding project.
import csv
import re
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains


def Mongo2Csv():
    """Create a post for every 'pass' product and mark it 'posted'.
    (Despite the name, this step publishes posts rather than writing a CSV.)"""
    try:
        pdCollection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")
        total = 0
        while True:
            pdList = pdCollection.nextPage(10)
            if pdList is None or len(pdList) == 0:
                break
            for pd in pdList:
                if pd['state'] != 'pass':
                    continue
                newID = createPost(pd)
                if newID is not None:
                    doc = {"_id": pd['_id'], "ID": newID, "brand": pd['brand'],
                           "url": pd['url'], "state": "posted", "price": pd['price'],
                           "title": pd['title'], "brand_a": pd['brand_a'],
                           "inner_des": pd['inner_des']}
                    print(doc['ID'])
                    pdCollection.updateOne(doc)
                    print("create post ok")
                else:
                    print("create post error")
                total += 1
                print('total=' + str(total) + ', title=' + pd['title'])
        print('Create all posts ok')
    except Exception as err:
        print(err)

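# The MongoHelper class used throughout this file is not shown. Below is a
# minimal sketch of the assumed contract, built on pymongo: the constructor
# takes (host, port, database, collection[, sortField]); nextPage(n) returns
# the next batch of up to n documents (empty when exhausted); updateOne(doc)
# replaces the stored document matched by _id. Names and behaviour are
# inferred from the call sites in this file, not from the real implementation.
from pymongo import MongoClient

class MongoHelperSketch:
    def __init__(self, host, port, dbName, collName, sortField="_id"):
        self._coll = MongoClient(host, port)[dbName][collName]
        self._sortField = sortField
        self._lastValue = None  # cursor position for keyset pagination

    def nextPage(self, pageSize):
        query = {} if self._lastValue is None else {self._sortField: {"$gt": self._lastValue}}
        docs = list(self._coll.find(query).sort(self._sortField, 1).limit(pageSize))
        if docs:
            self._lastValue = docs[-1][self._sortField]
        return docs

    def updateOne(self, doc):
        # Replace the whole document so updated fields (e.g. 'state') persist.
        self._coll.replace_one({"_id": doc["_id"]}, doc, upsert=False)
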
def fetchAllBlog():
    """Download the HTML of every blog entry in state 'CLOSED' and mark it 'FETCHED'."""
    try:
        catCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'category')  # currently unused
        collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'blog')
        total = 0
        while True:
            blogList = collection.nextPage(100)
            if len(blogList) == 0:
                break
            for blog in blogList:
                if blog['state'] == 'CLOSED':
                    fileName = HttpHelper.fetchAndSave(blog['url'], "utf-8", HTML_ROOT_PATH)
                    if fileName is not None and len(fileName) > 0:
                        blog['fileName'] = fileName
                        blog['state'] = "FETCHED"
                    else:
                        blog['state'] = "CLOSED"  # stay CLOSED so a later run retries the fetch
                    collection.updateOne(blog)
                    total += 1
                    print("url=" + blog['url'])
        print("total=" + str(total))
    except Exception as err:
        print(err)
    finally:
        print("exit")

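# HttpHelper.fetchAndSave is assumed to download a URL, write the body under
# a root directory, and return the generated file name (or None on failure).
# A hypothetical sketch with requests + hashlib; the one-character bucket
# directory mirrors how test() and Resolve() later rebuild the path as
# DOMAIN + filename[0] + "/" + filename.
import hashlib
import os
import requests

def fetchAndSaveSketch(url, encoding, rootPath):
    try:
        rsp = requests.get(url, timeout=30)
        rsp.raise_for_status()
        rsp.encoding = encoding
        fileName = hashlib.md5(url.encode("utf-8")).hexdigest() + ".html"
        bucket = os.path.join(rootPath, fileName[0])  # one sub-directory per leading hex digit
        os.makedirs(bucket, exist_ok=True)
        with open(os.path.join(bucket, fileName), "w", encoding=encoding) as fp:
            fp.write(rsp.text)
        return fileName
    except requests.RequestException:
        return None
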
def test():
    """Send every fetched page to the downstream queue and mark it 'sended'."""
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, "pages")
    doclist = []
    total = 1
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for page in doclist:
        try:
            if page['state'] != 'fetched':
                continue
            prefix = page['filename'][0:1]
            filepath = DOMAIN + prefix + "/" + page['filename']
            with open(filepath, encoding="utf-8") as fp:
                html = fp.read()
            task = {
                "id": "id",
                "url": page['url'],
                'topic': 'crawler_data_p123',
                'routingKey': '256'  # dx.com 225, banggood.com 224, tomtop 195, gearbest 256
            }
            sendPage(task, html)
            page['state'] = 'sended'
            collection.updateOne(page)
            print(total)
            total += 1
        except Exception as err:
            print(err)

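# sendPage is not defined in this file. The task fields 'topic' and
# 'routingKey' suggest a message broker; the sketch below assumes RabbitMQ
# via pika and a JSON envelope, which is a guess about the real transport.
import json
import pika

def sendPageSketch(task, html, host="localhost"):
    connection = pika.BlockingConnection(pika.ConnectionParameters(host))
    try:
        channel = connection.channel()
        channel.exchange_declare(exchange=task['topic'], exchange_type='topic', durable=True)
        channel.basic_publish(exchange=task['topic'],
                              routing_key=task['routingKey'],
                              body=json.dumps({"task": task, "html": html}))
    finally:
        connection.close()
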
def test2():
    """Reset every page in ZDBGearbestCom back to state 'fetched'."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBGearbestCom", "pages")
    while True:
        slist = collection.nextPage(10)
        if len(slist) == 0:
            break
        for article in slist:
            article['state'] = "fetched"
            collection.updateOne(article)

def parseAllBlog():
    """Parse every downloaded blog file and store the extracted fields; FETCHED -> PARSED."""
    try:
        collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'blog')
        total = 0
        while True:
            blogList = collection.nextPage(100)
            if len(blogList) == 0:
                break
            for blog in blogList:
                if blog['state'] == 'FETCHED':  # only parse blogs that fetchAllBlog() has downloaded
                    filePath = HttpHelper.getFullPath(HTML_ROOT_PATH, blog['fileName'])
                    with open(filePath, 'r', encoding='utf-8') as file:
                        html = file.read()
                    (found, title, desc, ogTitle, ogDesc, twTitle, twDesc,
                     keywords, content, author, summary, summaryKeywords) = parseBlog(html)
                    if found and (title is not None or ogTitle is not None or twTitle is not None):
                        blog['doc'] = {
                            'title': title, 'ogTitle': ogTitle, 'twTitle': twTitle,
                            'desc': desc, 'ogDesc': ogDesc, 'twDesc': twDesc,
                            'keywords': keywords, 'content': content, 'author': author,
                            'summary': summary, 'summaryKeywords': summaryKeywords
                        }
                        blog['state'] = 'PARSED'
                        collection.updateOne(blog)
                        print("ok")
                    else:
                        print("error")
                        break  # stop this batch on the first unparsable page
                    total += 1
                    print("url=" + blog['url'])
        print("total=" + str(total))
    except Exception as err:
        print(err)
    finally:
        print("exit")

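# parseBlog is defined elsewhere; both parseAllBlog() and Resolve() unpack a
# 12-tuple from it. A hypothetical sketch of that contract with BeautifulSoup,
# leaving the summary fields empty (the real helper presumably fills them via
# an NLP step):
from bs4 import BeautifulSoup

def parseBlogSketch(html):
    soup = BeautifulSoup(html, "lxml")

    def meta(**attrs):
        tag = soup.find("meta", attrs=attrs)
        return tag.get("content") if tag else None

    title = soup.title.string.strip() if soup.title and soup.title.string else None
    desc = meta(name="description")
    ogTitle = meta(property="og:title")
    ogDesc = meta(property="og:description")
    twTitle = meta(name="twitter:title")
    twDesc = meta(name="twitter:description")
    keywords = meta(name="keywords")
    author = meta(name="author")
    content = soup.body.get_text(" ", strip=True) if soup.body else None
    summary, summaryKeywords = None, None  # assumed to come from an NLP step
    found = title is not None or content is not None
    return (found, title, desc, ogTitle, ogDesc, twTitle, twDesc,
            keywords, content, author, summary, summaryKeywords)
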
def Resolve():
    """Parse every fetched page into an article document; fetched -> pass."""
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, "pages")
    doclist = []
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for article in doclist:
        try:
            if article['state'] != "fetched":
                continue
            prefix = article['filename'][0:1]
            filepath = DOMAIN + prefix + "/" + article['filename']
            with open(filepath, encoding="utf-8") as fp:
                html = fp.read()
            (found, title, desc, ogTitle, ogDesc, twTitle, twDesc,
             keywords, content, author, summary, summaryKeywords) = parseBlog(html)
            md5 = CryptHelper.getMD5Hash(article['url'])
            key = UrlHelper.getHostPath(article['url'])[1]
            excerpt = desc if desc else summary  # prefer the meta description
            doc = {
                "_id": article['_id'], "filename": article['filename'],
                "url": article['url'], "state": "pass", "domain": article['domain'],
                'md5': md5, 'title': title, 'excerpt': excerpt, 'content': content,
                'author': article['domain'], 'categories': CATEGORY, 'tags': "",
                'status': 0, 'key': key
            }
            collection.updateOne(doc)
        except Exception as err:
            print(err)

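# CryptHelper.getMD5Hash and UrlHelper.getHostPath are external helpers. The
# stdlib sketches below match how Resolve() uses them: an MD5 hex digest of
# the URL, and a (host, path) pair whose element [1] serves as the article key.
import hashlib
from urllib.parse import urlparse

def getMD5HashSketch(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def getHostPathSketch(url):
    parts = urlparse(url)
    return parts.netloc, parts.path  # Resolve() takes index [1], i.e. the path
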
def importAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    """POST every 'pass' article to IMPORT_URL (one per request) and mark it 'sended'."""
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'pages')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            if len(articleList) == 0:
                break
            total += len(articleList)
            print("total=" + str(total))
            for article in articleList:
                if article['state'] != "pass":
                    continue
                doc = {
                    'id': article['md5'],
                    'title': article['title'],
                    'excerpt': article['excerpt'],
                    'content': "",  # content is not sent in the import payload
                    'author': article['domain'],
                    'domain': article['domain'],
                    'categories': article['categories'],
                    'tags': article['tags'],
                    'url': article['url'],
                    'status': article['status'],
                    'key': article['key'],
                }
                errorCode, rsp = HttpHelper.post(IMPORT_URL, [doc])  # one article per request
                if errorCode == "OK" and rsp is not None and rsp.get('isOk') == True:
                    print("import article ok")
                    article['state'] = "sended"  # only mark as sent when the import succeeded
                    articleCollection.updateOne(article)
                else:
                    print("import article error")
    except Exception as err:
        print(err)
    finally:
        print("exit")

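# HttpHelper.post is assumed to JSON-POST a payload and return an
# (errorCode, parsedResponse) pair, with errorCode "OK" on HTTP success.
# A hypothetical requests-based sketch:
import requests

def postSketch(url, payload):
    try:
        rsp = requests.post(url, json=payload, timeout=30)
        rsp.raise_for_status()
        return "OK", rsp.json()
    except requests.RequestException as err:
        return str(err), None
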
def test3():
    """Assemble contenthtml from each supplement's attrlist; completed -> built."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg", "supplement_copy", "url")
    doclist = []
    total = 0
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for i in doclist:
        if i['state'] != "completed":  # only process documents already handled by test2()
            continue
        contenthtml = ""
        for j in i['attrlist']:
            contenthtml += ('<h3 class="h3-subtitle">' + j['subtitle'] + '</h3><br/>'
                            + '<div class="div-content">' + j['innerhtml'] + '</div><br/>')
        doc = {
            "_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'],
            "url": i['url'], "host": i['host'], "state": "built",
            "title": i['title'], "content": i['content'],
            "description": i['description'], "attrlist": i['attrlist'],
            "contenthtml": contenthtml
        }
        collection.updateOne(doc)
        total += 1
        print(total)

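# For a single attrlist entry such as (hypothetical values)
#   {"subtitle": "Dosage", "innerhtml": "<ul>...</ul>"}
# the loop above emits:
#   <h3 class="h3-subtitle">Dosage</h3><br/><div class="div-content"><ul>...</ul></div><br/>
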
def amazonfetch_detail():
    """Fetch each product's Amazon detail page and extract title, brand link
    and description list; fetched -> pass."""
    doclist = []
    total = 1
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for x in doclist:
        if x['state'] != "fetched":
            continue
        try:
            status, html = HttpHelper.fetch(x['url'])
            soup = BeautifulSoup(html, "lxml")
            title = ""
            for i in soup.find_all("span", attrs={"id": "productTitle"}):
                title = i.text.strip()
            href = ""
            for i in soup.find_all("a", attrs={"id": "bylineInfo"}):  # brand link
                href = i['href']
                if href.startswith("/"):
                    href = "http://www.amazon.com" + href
            description = soup.find_all("ul", attrs={"class": "a-unordered-list a-vertical a-spacing-none"})
            for i in description:  # if several blocks match, the last one wins
                doc = {"_id": x['_id'], "brand": x['brand'], "url": x['url'],
                       "state": "pass", "price": x['price'], "title": title,
                       "brand_a": href, "inner_des": str(i)}
                collection.updateOne(doc)
        except Exception as err:
            print(err)
            continue
        print(total)
        total += 1

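# HttpHelper.fetch is assumed to return a (statusCode, html) pair, as the
# unpacking above and in test2()/updateAllArticle() shows. A hypothetical
# requests sketch; the User-Agent header is an assumption, added because
# Amazon tends to reject requests without one.
import requests

def fetchSketch(url):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; crawler)"}
    try:
        rsp = requests.get(url, headers=headers, timeout=30)
        return rsp.status_code, rsp.text
    except requests.RequestException:
        return -1, None
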
def updateAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    """Re-fetch every article URL (up to two attempts) and record whether it is still live."""
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'article')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            if len(articleList) == 0:
                break
            for article in articleList:
                total += 1
                print("total=" + str(total))
                url = article['url']
                retry = 0
                while True:
                    retry += 1
                    if retry > 2:
                        break
                    statusCode, html = HttpHelper.fetch(url)
                    if html is not None and len(html) > 0:
                        article['status'] = 0  # TODO: also verify the title still matches
                        print("update article ok, retry=" + str(retry) + ", url=" + url)
                        break
                    else:
                        article['status'] = -1
                        print("update article error, retry=" + str(retry) + ", url=" + url)
                        time.sleep(1)  # brief pause before the second attempt
                article['updateTime'] = datetime.now()
                articleCollection.updateOne(article)
    except Exception as err:
        print(err)
    finally:
        print("exit")

def createAllPost():
    """Create a post for every 'built' supplement document and mark it 'posted'."""
    try:
        pdCollection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg", "supplement_copy")
        total = 0
        while True:
            pdList = pdCollection.nextPage(10)
            if pdList is None or len(pdList) == 0:
                break
            for pd in pdList:
                if pd['state'] != 'built':
                    continue
                newID = createPost(pd)
                if newID is not None:
                    doc = {"_id": pd['_id'], "ID": newID, "cat": pd['cat'],
                           "fileName": pd['fileName'], "url": pd['url'],
                           "host": pd['host'], "state": "posted",
                           "title": pd['title'], "content": pd['content'],
                           "description": pd['description'],
                           "attrlist": pd['attrlist'],
                           "contenthtml": pd['contenthtml']}
                    print(doc['ID'])
                    pdCollection.updateOne(doc)
                    print("create post ok")
                else:
                    print("create post error")
                total += 1
                print('total=' + str(total) + ', title=' + pd['title'])
        print('Create all posts ok')
    except Exception as err:
        print(err)

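# createPost is not shown. Both callers pass a Mongo document and expect the
# new post's ID back (None on failure). Purely as an illustration, here is a
# sketch targeting the standard WordPress REST API; the site URL, credentials
# and field mapping are all assumptions, not the project's actual publisher.
import requests

def createPostSketch(pd, site="https://example.com", auth=("user", "app-password")):
    payload = {
        "title": pd.get("title", ""),
        "content": pd.get("contenthtml") or pd.get("inner_des", ""),
        "excerpt": pd.get("description", ""),
        "status": "publish",
    }
    rsp = requests.post(site + "/wp-json/wp/v2/posts", json=payload, auth=auth, timeout=30)
    if rsp.status_code == 201:  # WordPress returns 201 Created with the new post
        return rsp.json()["id"]
    return None
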
def test_chromedriver():
    """Open each 'pass' product page in Chrome, expand the image gallery,
    scrape price/images/description, append a row to product.csv and update
    Mongo; pass -> posted."""
    try:
        total = 1
        doclist = []
        collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")
        while True:
            slist = collection.nextPage(100)
            if slist is None or len(slist) == 0:
                break
            for i in slist:
                doclist.append(i)
        for page in doclist:
            print(total)
            total += 1
            if page['state'] != 'pass':
                continue
            driver = webdriver.Chrome('C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
            driver.get(page['url'])
            print("wait for u")
            # Click every 40x40 gallery thumbnail (recognised by its pixel
            # position) so all image URLs get loaded into the page source.
            for i in driver.find_elements_by_tag_name("img"):
                if i.location['x'] == 19 or i.location['x'] == 71:
                    if i.size == {'height': 40, 'width': 40}:
                        ActionChains(driver).move_to_element(i).click(i).perform()
            html = driver.page_source.encode('utf-8')
            driver.quit()  # end the browser session and the chromedriver process
            soup = BeautifulSoup(html, "lxml")
            with open("./product.csv", "a+", newline='', encoding="utf-8") as c:
                writer = csv.writer(c, dialect='excel')
                img = ""
                for i in soup.find_all("div", attrs={"class": "imgTagWrapper"}):
                    for j in i.find_all("img"):
                        img = img + j['src'] + ","
                img = img[0:-1].strip()  # drop the trailing comma
                price = ""
                for i in soup.find_all("span", attrs={"id": "priceblock_ourprice"}):
                    price = i.text.strip()
                des = ""
                # Other containers seen on Amazon detail pages:
                # div.aplus-v2.desktop.celwidget, div#productDescription.
                for i in soup.find_all("div", attrs={"id": "productDescription"}):
                    des = str(i)
                des = des.strip()
                des_html = "<div class=\"productdescription\">" + des + "</div>"
                sdes = ("<div class=\"short-des\"><a href=\"" + page['brand_a'] + "\">"
                        "<font size=1 color=blue>" + page['brand'] + "</font></a>"
                        "<br>About the product<br>" + page['inner_des'] + "</div>")
                # The column layout appears to follow a WooCommerce
                # product-import CSV (type, published, visibility, price, ...).
                writer.writerow(['', 'simple', '', page['title'], '1', '0', 'visible', sdes,
                                 des_html, '', '', 'taxable', '', '1', '', '0', '0', '', '',
                                 '', '', '1', '', '', price, 'blood glucose meter', '', '',
                                 img, '', '', '', '', '', '', '', '', '0'])
                print("csv ok")
                doc = {"_id": page['_id'], "brand": page['brand'], "url": page['url'],
                       "state": "posted", "price": price, "title": page['title'],
                       "brand_a": page['brand_a'], "inner_des": page['inner_des'],
                       "product_des": des}
                collection.updateOne(doc)
                print("mongo ok")
    except Exception as err:
        print(err)

def test2():
    """Parse every FETCHED supplement URL into (subtitle, innerhtml, text)
    sections plus a summary description; FETCHED -> completed."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg", "supplement_copy", "url")
    nlp = NLPHelper()
    # First drain the collection into doclist so the updates below do not
    # disturb pagination.
    doclist = []
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    total = 0
    for i in doclist:
        if i['state'] != "FETCHED":  # only process documents still in state FETCHED
            continue
        attrlist = []  # value for the new 'attrlist' key
        if i['cat'] == 1:
            # Source site 1: each <section> holds an <h2> sub-heading
            # followed by a div.section-body with the matching content.
            status, html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(html, "lxml")
            content = ""
            for j in soup.find_all("section"):
                title = ""
                for x in j.find_all("h2"):  # h2 tags are the sub-headings
                    title = x.text
                for y in j.find_all("div", attrs={"class": "section-body"}):
                    attrlist.append({"subtitle": title, "innerhtml": str(y), "text": y.text})
                    content += y.text  # the page content is the sum of every section body
            description = nlp.getSummary(content, wordCount=20)  # build the description
            doc2 = {"_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'],
                    "url": i['url'], "host": i['host'], "state": "completed",
                    "title": i['title'], "content": content,
                    "description": description, "attrlist": attrlist}
            collection.updateOne(doc2)
            total += 1
            print(total)  # running total of completed documents
        elif i['cat'] == 2:
            # Source site 2: same idea, but the <h2> sub-headings and <ul>
            # bodies are siblings inside div.field-name-body, so pair them
            # by index.
            status, html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(html, "lxml")
            content = ""
            for j in soup.find_all("div", attrs={"class": re.compile('field field-name-body*')}):
                titlearr = [x.text for x in j.find_all("h2")]  # sub-headings, paired 1:1 with the ul blocks
                for index, y in enumerate(j.find_all("ul")):
                    if index > len(titlearr) - 1:  # guard against running past the heading list
                        break
                    attrlist.append({"subtitle": str(titlearr[index]),
                                     "innerhtml": str(y), "text": y.text})
                    content += y.text
            description = nlp.getSummary(content, wordCount=20)
            doc2 = {"_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'],
                    "url": i['url'], "host": i['host'], "state": "completed",
                    "title": i['title'], "content": content,
                    "description": description, "attrlist": attrlist}
            collection.updateOne(doc2)
            total += 1
            print(total)

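# NLPHelper.getSummary is external. It appears to produce a short extractive
# summary capped at roughly wordCount words. A hypothetical frequency-based
# sketch (not the project's actual summariser):
import re
from collections import Counter

class NLPHelperSketch:
    def getSummary(self, text, wordCount=20):
        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        freq = Counter(re.findall(r'[a-zA-Z]+', text.lower()))
        # Rank sentences by the summed frequency of their words.
        ranked = sorted(sentences,
                        key=lambda s: -sum(freq[w] for w in re.findall(r'[a-zA-Z]+', s.lower())))
        picked, budget = [], wordCount
        for s in ranked:
            n = len(s.split())
            if n > budget and picked:
                break
            picked.append(s)
            budget -= n
        return " ".join(picked)
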