def get_search(url, keyword, ac):
    """Crawl one Baidu result page and store the landing page of every hit.

    The result page at *url* is loaded in a fresh browser. Each
    ``div.c-container`` entry is then visited in turn and its title, final
    URL, abstract and raw page source are handed to ``test_DB.baidu``.
    Entries that fail are recorded through ``test_DB.suspect`` and skipped.

    Args:
        url: Address of the result page to load.
        keyword: Search term; looked up with ``DBkeywords.findkeywordID`` and
            element ``[0]`` of the result is used as the keyword id.
        ac: Passed through unchanged to ``test_DB.baidu`` (its meaning is
            not visible from this function).

    Returns:
        None.
    """
    browser = get_browser(binary=firefox_binary)
    try:
        browser.get(url)
        time.sleep(15)  # fixed pause before the page source is read
        print("get_search url done")
        soup = BeautifulSoup(browser.page_source, "lxml")
        for script in soup('script'):
            script.extract()
        ID = DBkeywords.findkeywordID(keyword)
        resultofitems = soup.find_all("div", {"class": "c-container"})
        print("get_search baidu :" + url)

        for item in resultofitems:
            # Placeholders are set before the try block so the handlers
            # below always have values to report.
            Tittle = 'cannot load '
            abstract = 'cannot load '
            currenturl = 'cannot load url'
            try:
                time.sleep(15)
                further_url = item.a["href"]
                Tittle = item.a.get_text()
                print(Tittle)
                abstract = item.find_all("div")[0].get_text()
                print(abstract)

                browser.set_page_load_timeout(180)
                browser.get(further_url)
                try:
                    # presence_of_element_located expects a (by, value)
                    # locator, not a WebElement. "xpath" is the value of
                    # selenium's By.XPATH, spelled out so no extra import
                    # is needed.
                    WebDriverWait(browser, 20).until(
                        EC.presence_of_element_located(
                            ("xpath", "/html/body")))
                    print("Page is ready!")
                except TimeoutException:
                    print("Loading took too much time!")
                time.sleep(30)

                currenturl = browser.current_url
                print(currenturl)
                body = browser.page_source
                test_DB.baidu(ID[0], Tittle, currenturl, abstract, body,
                              currentdate.getdate(), ac)
            except TimeoutException as e2:
                # Despite the message the item is not retried; it is
                # recorded as suspect and the loop moves on.
                print("Timeout, retrying...")
                test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                                currentdate.getdate(), str(e2), Tittle,
                                abstract)
                time.sleep(30)
            except Exception as e:
                print("caught exception :site:" + currenturl +
                      "keyword: " + keyword)
                test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                                currentdate.getdate(), str(e), Tittle,
                                abstract)
                print(str(e))
    finally:
        # Always release the browser; it was previously left running.
        browser.quit()
def baidufurthersearch(url):
    """Load *url* in a fresh browser and extract its script-free text.

    The page is fetched, ``<script>`` elements are removed and the remaining
    text is passed through ``cleanup``. Failures are reported through
    ``test_DB.suspect``.

    Args:
        url: Address of the page to load.

    Returns:
        None. The cleaned text is computed but not returned or stored.
    """
    print("getextraweb:" + url)
    currenturl = url
    browser = None
    try:
        browser = get_browser(binary=firefox_binary)
        print(url)
        browser.get(url)
        page_source = browser.page_source
        currenturl = browser.current_url
        soup = BeautifulSoup(page_source, "lxml")
        for script in soup('script'):
            script.extract()
        body = cleanup(soup.get_text())
    # NOTE(review): ID, Tittle, abstract and keyword are not defined in this
    # function. Unless they exist as module-level names the handlers below
    # raise NameError -- confirm against the rest of the module.
    except AttributeError:
        print(str(AttributeError))
        test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                        currentdate.getdate(), 'AttributeError', Tittle,
                        abstract)
    except Exception as e:
        print("caught exception :site:" + currenturl + "keyword: " + keyword)
        test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                        currentdate.getdate(), str(e), Tittle, abstract)
        print(str(e))
    finally:
        # quit() used to run only on the success path, so any failure above
        # left the browser running.
        if browser is not None:
            try:
                browser.quit()
            except Exception as quit_error:
                print(str(quit_error))
def get_search(url, keyword, ac):
    """Crawl one result page and store the landing page of every hit.

    Each ``div.web-result`` inside ``div.l-web-results`` is parsed for its
    title, URL and description. Hits for which ``DBkeywords.checktrustful``
    is truthy are skipped. Every other hit is opened in the browser and
    saved with ``test_DB.search``. Failing hits are recorded through
    ``test_DB.suspect``.

    Args:
        url: Address of the result page to load.
        keyword: Search term; looked up with ``DBkeywords.findkeywordID`` and
            element ``[0]`` of the result is used as the keyword id.
        ac: Passed through unchanged to ``test_DB.search`` (its meaning is
            not visible from this function).

    Returns:
        None.
    """
    browser = get_browser(binary=firefox_binary)
    try:
        browser.get(url)
        counserat = 0  # number of hits stored so far
        soup = BeautifulSoup(browser.page_source, "lxml")
        for script in soup('script'):
            script.extract()
        ID = DBkeywords.findkeywordID(keyword)

        container = soup.find("div", {"class": "l-web-results"})
        if container is None:
            # A page without the results container is treated as having no
            # hits; it used to raise AttributeError outside any handler.
            print("search results container not found: " + url)
            return
        resultofitems = container.find_all("div", {"class": "web-result"})

        for item in resultofitems:
            # Reset per item so the handlers never see unbound names or
            # values left over from the previous hit.
            Tittle = 'cannot load'
            abstract = 'cannot load'
            further_url = 'cannot load url'
            try:
                print(str(counserat))
                Tittle = item.find(
                    "h3", {"class": "web-result-title"}).get_text()
                further_url = item.find(
                    "p", {"class": "web-result-url"}).get_text()
                abstract = item.find(
                    "p", {"class": "web-result-description"}).get_text()
                if DBkeywords.checktrustful(further_url):
                    print("trust url:" + str(further_url))
                    continue
                print("working on:" + further_url)
                browser.get(getextraweb.checkhttp(further_url))
                body = browser.page_source
                test_DB.search(ID[0], Tittle, further_url, abstract, body,
                               currentdate.getdate(), ac)
                counserat = counserat + 1
            except AttributeError:
                # Raised when one of the expected tags is missing.
                print(str('AttributeError'))
                test_DB.suspect('search', -1, ID[0], further_url, 3,
                                currentdate.getdate(), 'AttributeError',
                                Tittle, abstract)
            except Exception as e:
                print("caught exception :site:" + further_url +
                      "keyword: " + keyword)
                test_DB.suspect('search', -1, ID[0], further_url, 3,
                                currentdate.getdate(), str(e), Tittle,
                                abstract)
                print(str(e))
    finally:
        # Always release the browser; it was previously left running.
        browser.quit()
def get_search(url, keyword, ac):
    """Crawl one AOL result page and store the landing page of every hit.

    Each ``<li>`` inside ``ul[content=ALGO]`` is parsed for its link text,
    href and ``p[property="f:desc"]`` description. Hits for which
    ``DBkeywords.checktrustful`` is truthy are skipped. Every other hit is
    opened in the browser and saved with ``test_DB.aol``. Failing hits are
    recorded through ``test_DB.suspect``.

    Args:
        url: Address of the result page to load.
        keyword: Search term; looked up with ``DBkeywords.findkeywordID`` and
            element ``[0]`` of the result is used as the keyword id.
        ac: Passed through unchanged to ``test_DB.aol`` (its meaning is not
            visible from this function).

    Returns:
        None.
    """
    browser = get_browser(binary=firefox_binary)
    try:
        browser.get(url)
        # The earlier code built WebDriverWait(browser, 30) objects without
        # calling .until(), which does not wait; no wait happens here either.
        soup = BeautifulSoup(browser.page_source, "lxml")
        for script in soup('script'):
            script.extract()
        ID = DBkeywords.findkeywordID(keyword)

        result_list = soup.find("ul", {"content": "ALGO"})
        if result_list is None:
            # A page without the result list is treated as having no hits;
            # it used to raise AttributeError outside any handler.
            print("aol result list not found: " + url)
            return
        resultofitems = result_list.find_all("li")

        for item in resultofitems:
            # Reset per item so a failure never reports the previous hit's
            # title or abstract.
            Tittle = 'cannot load'
            abstract = 'cannot load'
            further_url = 'cannot load url'
            try:
                print("start")
                Tittle = item.a.get_text()
                further_url = item.a["href"]
                print(further_url)
                abstract = item.find("p", {"property": "f:desc"}).get_text()
                if DBkeywords.checktrustful(further_url):
                    print("trust url:" + str(further_url))
                    continue
                print("working on:" + further_url)
                browser.get(getextraweb.checkhttp(further_url))
                body = browser.page_source
                test_DB.aol(ID[0], Tittle, further_url, abstract, body,
                            currentdate.getdate(), ac)
            except AttributeError:
                # Raised when the <a> or description tag is missing.
                print(str(AttributeError))
                test_DB.suspect('aol', -1, ID[0], further_url, 3,
                                currentdate.getdate(), 'AttributeError',
                                Tittle, abstract)
            except Exception as e:
                print("caught exception :site:" + further_url +
                      "keyword: " + keyword)
                test_DB.suspect('aol', -1, ID[0], further_url, 3,
                                currentdate.getdate(), str(e), Tittle,
                                abstract)
                print(str(e))
    finally:
        # Always release the browser; it was previously left running.
        browser.quit()
def get_search(url, keyword, ac):
    """Crawl one Ask result page and store the landing page of every hit.

    Each ``div.PartialSearchResults-item`` is parsed for its title, URL and
    abstract. Hits for which ``DBkeywords.checktrustful`` is truthy are
    skipped. Every other hit is opened in the browser and saved with
    ``test_DB.ask``. Failing hits are recorded through ``test_DB.suspect``.

    Args:
        url: Address of the result page to load.
        keyword: Search term; looked up with ``DBkeywords.findkeywordID`` and
            element ``[0]`` of the result is used as the keyword id.
        ac: Passed through unchanged to ``test_DB.ask`` (its meaning is not
            visible from this function).

    Returns:
        None.
    """
    browser = get_browser(binary=firefox_binary)
    try:
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, "lxml")
        for script in soup('script'):
            script.extract()
        ID = DBkeywords.findkeywordID(keyword)
        resultofitems = soup.find_all(
            "div", {"class": "PartialSearchResults-item"})

        for item in resultofitems:
            # Placeholders reported to the suspect table when parsing fails.
            further_url = 'cannot load url'
            Tittle = 'cannot load url'
            abstract = 'cannot load url'
            try:
                time.sleep(10)  # fixed pause between hits
                Tittle = item.find(
                    "div",
                    {"class": "PartialSearchResults-item-title"}).get_text()
                print(Tittle)
                further_url = item.find(
                    "p",
                    {"class": "PartialSearchResults-item-url"}).get_text()
                print(further_url)
                abstract = item.find(
                    "p",
                    {"class": "PartialSearchResults-item-abstract"}
                ).get_text()
                print(abstract)
                if DBkeywords.checktrustful(further_url):
                    print("trust url:" + str(further_url))
                    continue
                print("working on:" + further_url)
                browser.get(getextraweb.checkhttp(further_url))
                time.sleep(30)  # fixed pause before the page source is read
                body = browser.page_source
                test_DB.ask(ID[0], Tittle, further_url, abstract, body,
                            currentdate.getdate(), ac)
            except Exception as e:
                print("caught exception :site:" + further_url +
                      "keyword: " + keyword)
                test_DB.suspect('ask', -1, ID[0], further_url, 3,
                                currentdate.getdate(), str(e), Tittle,
                                abstract)
                print(str(e))
    finally:
        # Always release the browser; it was previously left running.
        browser.quit()
def get_search(url, keyword, ac):
    """Crawl one Bing result page and store the landing page of every hit.

    Each ``li.b_algo`` entry is parsed for its link text, href and first
    paragraph. Hits for which ``DBkeywords.checktrustful`` is truthy are
    skipped. Every other hit is opened in the browser and saved with
    ``test_DB.bing``. Failing hits are recorded through ``test_DB.suspect``.

    Args:
        url: Address of the result page to load.
        keyword: Search term; looked up with ``DBkeywords.findkeywordID`` and
            element ``[0]`` of the result is used as the keyword id.
        ac: Passed through unchanged to ``test_DB.bing`` (its meaning is not
            visible from this function).

    Returns:
        None.
    """
    browser = get_browser(binary=firefox_binary)
    try:
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, "lxml")
        for script in soup('script'):
            script.extract()
        letters = soup.find_all("li", {"class": "b_algo"})
        counserat = 0  # number of hits stored so far
        ID = DBkeywords.findkeywordID(keyword)

        for element in letters:
            # Tittle and abstract previously had no placeholder, so a hit
            # without <a> or <p> made the handler fail on an unbound name
            # (or report the previous hit's values).
            Tittle = 'cannot load'
            abstract = 'cannot load'
            further_url = 'cannot load url'
            try:
                print(str(counserat))
                Tittle = element.a.get_text()
                further_url = element.a["href"]
                abstract = element.p.get_text()
                if DBkeywords.checktrustful(further_url):
                    print("trust url:" + str(further_url))
                    continue
                print("working on:" + further_url)
                browser.get(getextraweb.checkhttp(further_url))
                time.sleep(30)  # fixed pause before the page source is read
                body = browser.page_source
                test_DB.bing(ID[0], Tittle, further_url, abstract, body,
                             currentdate.getdate(), ac)
                counserat = counserat + 1
            except Exception as e:
                print(str(e))
                test_DB.suspect('bing', -1, ID[0], further_url, 3,
                                currentdate.getdate(), str(e), Tittle,
                                abstract)
    finally:
        # Always release the browser; it was previously left running.
        browser.quit()
def loadkeyword():
    """Insert every line of ``keywords_1.txt`` into the ``keyword`` table.

    Newline characters are removed from each line, and the result is
    inserted together with ``currentdate.getdate()`` through the module-level
    ``cursor``. Each insert is committed on its own; a failing insert is
    printed and rolled back, and loading continues with the next line.
    """
    print("loading keywords from local to DB")
    with open("keywords_1.txt") as handle:
        raw_lines = handle.readlines()

    insert_sql = "INSERT INTO keyword(keyword,date) VALUES(%s,%s)"
    for raw_line in raw_lines:
        term = raw_line.replace('\n', '')
        try:
            print(term)
            cursor.execute(insert_sql, (term, currentdate.getdate()))
            db.commit()
        except Exception as err:
            print(str(err))
            print("caught exception:" + term + ":" + currentdate.getdate())
            db.rollback()
def checktrustful(url):
    """Report whether *url* matches an entry of the ``trustfulwebsites`` table.

    The previous body was a copy of the keyword loader: it referenced an
    undefined ``keywords`` name, ignored *url*, never fetched the selected
    rows and never returned a value, so callers testing the result could
    never see a trusted URL.

    Args:
        url: URL text to test.

    Returns:
        True when a stored ``website`` value occurs inside *url*, otherwise
        False. False is also returned when *url* is empty or the query fails.
    """
    if not url:
        return False
    try:
        cursor.execute("SELECT website FROM trustfulwebsites")
        rows = cursor.fetchall()
        # Kept from the earlier code; presumably ends the read transaction
        # -- confirm.
        db.commit()
    except Exception as e:
        print(str(e))
        print("caught exception:%s:%s" % (url, currentdate.getdate()))
        db.rollback()
        return False

    for row in rows:
        # Rows are tuples with the default DB-API cursor; dict-style cursors
        # are handled as well.
        website = row["website"] if isinstance(row, dict) else row[0]
        # NOTE(review): substring matching is an assumption about how
        # `website` values relate to result URLs -- confirm against the data.
        if website and website in url:
            return True
    return False