Example 1
def get_search(url, keyword, ac):
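    """Scrape one page of Baidu results for `keyword`, then fetch each hit
    and store title, URL, abstract and page body via test_DB.baidu."""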
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    time.sleep(15)
    print "get_search url done"
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):
        s.extract()
    ID = DBkeywords.findkeywordID(keyword)
    resultofitems = soup.find_all("div", {"class": "c-container"})
    print "get_search baidu :" + url
    for item in resultofitems:
        try:
            time.sleep(15)
            Tittle = 'cannot load '
            abstract = 'cannot load '
            currenturl = 'cannot load url'
            further_url = item.a["href"]
            Tittle = item.a.get_text()
            print Tittle
            abstract = item.find_all("div")[0].get_text()
            print abstract
            # thread.start_new_thread(getextraweb.baidufurthersearch,(ID[0],Tittle,abstract,further_url,ac))
            browser.set_page_load_timeout(180)
            browser.get(further_url)
            try:
                # EC.presence_of_element_located expects a (By, selector)
                # locator tuple, not a WebElement
                # (requires: from selenium.webdriver.common.by import By)
                WebDriverWait(browser, 20).until(
                    EC.presence_of_element_located((By.XPATH, "//body")))
                print "Page is ready!"
            except TimeoutException:
                print "Loading took too much time!"
            time.sleep(30)
            currenturl = browser.current_url
            print currenturl
            body = browser.page_source
            test_DB.baidu(ID[0], Tittle, currenturl, abstract, body,
                          currentdate.getdate(), ac)
        except TimeoutException, e2:
            print "Timeout, retrying..."
            test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                            currentdate.getdate(), str(e2), Tittle, abstract)
            time.sleep(30)
            continue
        except Exception, e:
            print "caught exception :site:" + currenturl + "keyword: " + keyword
            test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                            currentdate.getdate(), str(e), Tittle, abstract)
            print str(e)
            continue
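Example 1's explicit wait originally passed a WebElement to EC.presence_of_element_located, which expects a (By, selector) locator tuple. A minimal standalone sketch of the corrected pattern, assuming a plain Firefox driver and a placeholder URL:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

browser = webdriver.Firefox()
browser.get("http://example.com")  # placeholder URL
try:
    # the condition takes a locator tuple and polls until a match appears
    WebDriverWait(browser, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "body")))
    print "Page is ready!"
except TimeoutException:
    print "Loading took too much time!"
finally:
    browser.quit()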
Example 2
def baidufurthersearch(keywordID, Tittle, abstract, url, ac):
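    """Fetch one Baidu hit in a worker thread; the parameters mirror the
    commented-out thread.start_new_thread call in Example 1."""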
    print "getextraweb:" + url
    currenturl = url
    try:
        browser = get_browser(binary=firefox_binary)
        print url
        # browser.get(checkhttp(url))
        browser.get(url)
        page_source = browser.page_source
        currenturl = browser.current_url
        soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):
        s.extract()
        body = cleanup(soup.get_text())
        browser.quit()
    except AttributeError:
        print "AttributeError"
        test_DB.suspect('baidu', -1, keywordID, currenturl, 3,
                        currentdate.getdate(), 'AttributeError', Tittle,
                        abstract)
    except Exception, e:
        print "caught exception: site: " + currenturl
        test_DB.suspect('baidu', -1, keywordID, currenturl, 3,
                        currentdate.getdate(), str(e), Tittle, abstract)
        print str(e)
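With the widened signature, Example 1's commented-out thread call can drive this function directly; a sketch using Python 2's thread module (all argument names come from Example 1's loop):

import thread
# fire-and-forget fetch of a single result row
thread.start_new_thread(getextraweb.baidufurthersearch,
                        (ID[0], Tittle, abstract, further_url, ac))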
Example 3
def get_search(url, keyword, ac):
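    """Scrape a results page that marks hits with the `web-result` CSS
    classes, then fetch each non-whitelisted hit and store it via
    test_DB.search."""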
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    counter = 0
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):
        s.extract()
    ID = DBkeywords.findkeywordID(keyword)
    resultofitems = soup.find("div", {
        "class": "l-web-results"
    }).find_all("div", {"class": "web-result"})
    for item in resultofitems:
        try:
            print str(counter)
            # defaults so the except handlers never see an unbound name
            Tittle = 'cannot load'
            abstract = 'cannot load'
            further_url = 'cannot load url'
            Tittle = item.find("h3", {"class": "web-result-title"}).get_text()
            further_url = item.find("p", {
                "class": "web-result-url"
            }).get_text()
            abstract = item.find("p", {
                "class": "web-result-description"
            }).get_text()
            if (DBkeywords.checktrustful(further_url)):
                print "trust url:" + str(further_url)
                continue
            print "working on:" + further_url
            # browser.set_page_load_timeout(180)
            browser.get(getextraweb.checkhttp(further_url))
            body = browser.page_source
            test_DB.search(ID[0], Tittle, further_url, abstract, body,
                           currentdate.getdate(), ac)
            counter += 1
        except AttributeError:
            print "AttributeError"
            test_DB.suspect('search', -1, ID[0], further_url, 3,
                            currentdate.getdate(), 'AttributeError', Tittle,
                            abstract)
            continue
        except Exception, e:
            print "caught exception: site: " + further_url + " keyword: " + keyword
            test_DB.suspect('search', -1, ID[0], further_url, 3,
                            currentdate.getdate(), str(e), Tittle, abstract)
            print str(e)
            continue
Example 4
def get_search(url,keyword,ac):
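	"""Scrape one page of AOL results, then fetch each non-whitelisted hit
	and store it via test_DB.aol."""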
	browser = get_browser(binary=firefox_binary)
	browser.get(url)
	# block until the results page's <body> is present
	# (requires: from selenium.webdriver.common.by import By)
	WebDriverWait(browser, 30).until(
		EC.presence_of_element_located((By.TAG_NAME, "body")))
	page_source =browser.page_source
	soup = BeautifulSoup(page_source,"lxml")
	for s in soup('script'):
		s.extract()
	ID =DBkeywords.findkeywordID(keyword)
	resultofitems = soup.find("ul",{"content":"ALGO"}).find_all("li")
	Tittle ='cannot load'
	abstract ='cannot load'
	for item in resultofitems:
		try:
			print "start"
			further_url ='cannot load url'
			Tittle = item.a.get_text()
			further_url = item.a["href"]
			print further_url
			abstract = item.find("p",{"property":"f:desc"}).get_text()
			if(DBkeywords.checktrustful(further_url)):
				print "trust url:"+str(further_url)
				continue
			print "working on:"+further_url
			# browser.set_page_load_timeout(180)
			browser.get(getextraweb.checkhttp(further_url))
			currenturl = browser.current_url
			body = browser.page_source
			test_DB.aol(ID[0],Tittle,further_url,abstract,body,currentdate.getdate(),ac)
		except AttributeError:
			print "AttributeError"
			test_DB.suspect('aol',-1,ID[0],further_url,3,currentdate.getdate(),'AttributeError',Tittle,abstract)
			continue
		except Exception, e:
			print "caught exception: site: " + further_url + " keyword: " + keyword
			test_DB.suspect('aol',-1,ID[0],further_url,3,currentdate.getdate(),str(e),Tittle,abstract)
			print str(e)
			continue
Example 5
def get_search(url,keyword,ac):
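	"""Scrape one page of Ask results (PartialSearchResults markup), then
	fetch each non-whitelisted hit and store it via test_DB.ask."""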
	browser = get_browser(binary=firefox_binary)
	browser.get(url)
	page_source =browser.page_source
	soup = BeautifulSoup(page_source,"lxml")
	for s in soup('script'):
		s.extract()
	ID =DBkeywords.findkeywordID(keyword)
	resultofitems = soup.find_all("div",{"class":"PartialSearchResults-item"})
	for item in resultofitems:
		try:
			time.sleep(10)
			further_url = 'cannot load url'
			Tittle = 'cannot load'
			abstract = 'cannot load'
			Tittle = item.find("div",{"class":"PartialSearchResults-item-title"}).get_text()
			print Tittle
			further_url = item.find("p",{"class":"PartialSearchResults-item-url"}).get_text()
			print further_url
			abstract = item.find("p",{"class":"PartialSearchResults-item-abstract"}).get_text()
			print abstract
			if(DBkeywords.checktrustful(further_url)):
				print "trust url:"+str(further_url)
				continue
			print "working on:"+further_url
			# browser.set_page_load_timeout(180)
			browser.get(getextraweb.checkhttp(further_url))
			time.sleep(30)
			time.sleep(30)
			body = browser.page_source
			test_DB.ask(ID[0],Tittle,further_url,abstract,body,currentdate.getdate(),ac)
		except Exception, e:
			print "caught exception: site: " + further_url + " keyword: " + keyword
			test_DB.suspect('ask',-1,ID[0],further_url,3,currentdate.getdate(),str(e),Tittle,abstract)
			print str(e)
			continue
Example 6
def get_search(url, keyword, ac):
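    """Scrape one page of Bing results (`b_algo` items), then fetch each
    non-whitelisted hit and store it via test_DB.bing."""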
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):
        s.extract()
    letters = soup.find_all("li", {"class": "b_algo"})
    counter = 0
    ID = DBkeywords.findkeywordID(keyword)
    for element in letters:
        try:
            print str(counter)
            # defaults so the except handler never sees an unbound name
            further_url = 'cannot load url'
            Tittle = 'cannot load'
            abstract = 'cannot load'
            Tittle = element.a.get_text()
            further_url = element.a["href"]
            abstract = element.p.get_text()
            if (DBkeywords.checktrustful(further_url)):
                print "trust url:" + str(further_url)
                continue
            print "working on:" + further_url
            # browser.set_page_load_timeout(120)
            browser.get(getextraweb.checkhttp(further_url))
            time.sleep(30)
            body = browser.page_source
            test_DB.bing(ID[0], Tittle, further_url, abstract, body,
                         currentdate.getdate(), ac)
            counter += 1
        except Exception, e:
            print str(e)
            # browser.quit()
            test_DB.suspect('bing', -1, ID[0], further_url, 3,
                            currentdate.getdate(), str(e), Tittle, abstract)
            continue
Example 7
def loadkeyword():
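    """Read keywords from keywords_1.txt and insert one row per keyword
    into the `keyword` table."""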
    print "loading keywords from local to DB"
    with open("keywords_1.txt") as f:
        keywordsinput = f.readlines()
    # db and cursor are module-level globals (see the sketch below)
    for keywords in keywordsinput:
        try:
            print keywords.replace('\n', '')
            cursor.execute("INSERT INTO keyword(keyword,date) VALUES(%s,%s)",
                           (keywords.replace('\n', ''), currentdate.getdate()))
            db.commit()
        except Exception, e:
            print str(e)
            print "caught exception:" + keywords.replace(
                '\n', '') + ":" + currentdate.getdate()
            db.rollback()
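Examples 7 and 8 rely on module-level `db` and `cursor` objects that this page never shows; a minimal sketch of the assumed setup, using MySQLdb with placeholder credentials and database name:

import MySQLdb

# placeholder connection values; the real ones are not shown on this page
db = MySQLdb.connect(host="localhost", user="user", passwd="password",
                     db="search")
cursor = db.cursor()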
Example 8
def checktrustful(url):
    # returns True when `url` matches a whitelisted site so callers can
    # skip it; matching by substring is an assumption about the schema
    try:
        cursor.execute("SELECT website FROM trustfulwebsites")
        websites = [row[0] for row in cursor.fetchall()]
        return any(site in url for site in websites)
    except Exception, e:
        print str(e)
        print "caught exception: checktrustful: " + url
        db.rollback()
        return False
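Callers such as Examples 3 to 6 branch on the boolean result to skip whitelisted sites; a small usage sketch with a hypothetical URL:

# hypothetical URL for illustration
if checktrustful("http://example.com/page"):
    print "trusted, skipping"
else:
    print "unknown site, scraping"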