def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        titles = []  # article titles already seen in this run
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        # find the article titles
        link_list = driver.find_element_by_class_name(
            "fallsFlow").find_elements_by_css_selector(
                'li[class="item masonry-brick"]')
        for elem in link_list:
            article = elem.find_element_by_tag_name("h3")
            title = article.text  # article title
            if title in titles:
                continue
            LOGGER.debug("article title %s" % (title, ))
            url = article.find_element_by_tag_name("a").get_attribute("href")
            LOGGER.info("url:%s" % (url, ))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_tag_name("h5").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                # the remaining urls were already published
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        titles = []  # article titles already seen in this run
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=8000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        link_list = driver.find_element_by_class_name(
            "page1").find_elements_by_class_name("row")
        LOGGER.debug("found %d articles" % (len(link_list), ))
        for elem in link_list:
            anchor = elem.find_element_by_class_name(
                "list-tt").find_element_by_tag_name("a")
            title = anchor.text  # article title
            if title in titles:
                continue
            LOGGER.debug("article title %s" % (title, ))
            url = anchor.get_attribute("href")
            LOGGER.info("url:%s" % (url, ))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_class_name("f4").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                # the remaining urls were already published
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        titles = []  # article titles already seen in this run
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=8000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        link_list = driver.find_element_by_class_name(
            "list").find_elements_by_class_name("item")
        LOGGER.debug("found %d articles" % (len(link_list), ))
        for elem in link_list:
            anchor = elem.find_element_by_tag_name(
                "h1").find_element_by_tag_name("a")
            title = anchor.text  # article title
            if title in titles:
                continue
            LOGGER.debug("article title %s" % (title, ))
            url = anchor.get_attribute("href")
            LOGGER.info("url:%s" % (url, ))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_tag_name("p").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                # the remaining urls were already published
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
def run(self):
    """Run every extractor listed in the config dict."""
    try:
        LOGGER.debug("start the extractor")
        for elem in extractor_source_url_config:
            extractor = elem["extractor"](elem)
            extractor.extract_links()
    except Exception:
        LOGGER.error(traceback.format_exc())
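# For reference, a minimal sketch of what each entry in
# extractor_source_url_config is assumed to look like, inferred from how
# run() and the extractors consume it. Only the "extractor" key is confirmed
# by the code above; "url", "tag", and "sub_tag" are assumptions based on the
# self.url / self.tag / self.sub_tag attributes the extractors read, and
# FallsFlowExtractor is a hypothetical class name.
extractor_source_url_config = [
    {
        "extractor": FallsFlowExtractor,  # class implementing extract_links()
        "url": "http://example.com/news/",
        "tag": "news",
        "sub_tag": "tech",
    },
]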
def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        titles = []  # article titles already seen in this run
        i = 0  # page count
        stop_flag = True
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        while i < 10 and stop_flag:
            # find the article titles
            contents = driver.find_elements_by_class_name("list_f14d")
            for content in contents:
                link_list = content.find_elements_by_tag_name("li")
                for elem in link_list:
                    hrefs = elem.find_elements_by_tag_name("a")
                    title = hrefs[0].text  # article title
                    if title in titles:
                        continue
                    LOGGER.debug("article title %s" % (title, ))
                    url = hrefs[0].get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))
                    if self.isPublished(url) is False:
                        abstract = ""  # this site exposes no abstract in the list
                        # publish the url msg to mq
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        # the remaining urls were already published
                        republishedCount += 1
                        if republishedCount >= republishdThre:
                            stop_flag = False
                            break
                    titles.append(title)
            # load the next page
            next_page = driver.find_element_by_class_name(
                "pages").find_elements_by_tag_name("a")[-1]
            next_page.click()
            driver.implicitly_wait(5)
            i += 1
    except Exception:
        LOGGER.error(traceback.format_exc())
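# Note on the pagination wait used above and in the extractors below:
# implicitly_wait() only configures Selenium's element-lookup timeout; it is
# not a sleep, so it does not by itself give the next page time to render
# after next_page.click(). A hedged sketch of an explicit wait that could be
# used instead, re-using the "list_f14d" container queried above (the 10s
# timeout is an assumption):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Block for up to 10 seconds until the next page's article list is present.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "list_f14d")))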
def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        titles = []  # article titles already seen in this run
        i = 0  # page count
        stop_flag = True
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        while i < 3 and stop_flag:
            # find the article titles on the i-th list section
            link_list = driver.find_elements_by_css_selector(
                'ul[class="pictxt block"]')[i].find_elements_by_tag_name("li")
            for elem in link_list:
                article = elem.find_element_by_class_name("tit")
                title = article.text  # article title
                if title in titles:
                    continue
                LOGGER.debug("article title %s" % (title, ))
                url = article.find_element_by_tag_name("a").get_attribute(
                    "href")
                LOGGER.info("url:%s" % (url, ))
                if self.isPublished(url) is False:
                    abstract = elem.find_element_by_class_name("txt").text
                    # publish the url msg to mq
                    msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                         abstract)
                    self.publishMsg(msg)
                else:
                    # the remaining urls were already published
                    republishedCount += 1
                    if republishedCount >= republishdThre:
                        stop_flag = False
                        break
                titles.append(title)
            # load the next page
            next_page = driver.find_element_by_class_name(
                "HomeMore").find_element_by_tag_name("a")
            next_page.click()
            driver.implicitly_wait(5)
            i += 1
    except Exception:
        LOGGER.error(traceback.format_exc())
def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        titles = []  # article titles already seen in this run
        i = 0  # page count
        stop_flag = True
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        while i < 10 and stop_flag:
            # find the article titles
            contents = driver.find_elements_by_class_name("list_txt")
            for content in contents:
                link_list = content.find_elements_by_tag_name("li")
                for elem in link_list:
                    hrefs = elem.find_elements_by_tag_name("a")
                    title = hrefs[1].text  # article title (second link in item)
                    if title in titles:
                        continue
                    LOGGER.debug("article title %s" % (title, ))
                    url = hrefs[1].get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))
                    if self.isPublished(url) is False:
                        abstract = ""  # this site exposes no abstract in the list
                        # publish the url msg to mq
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        # the remaining urls were already published
                        republishedCount += 1
                        if republishedCount >= republishdThre:
                            stop_flag = False
                            break
                    titles.append(title)
            # load the next page
            next_page = driver.find_elements_by_class_name("bar_pages_flip")[1]
            next_page.click()
            driver.implicitly_wait(5)
            i += 1
    except Exception:
        LOGGER.error(traceback.format_exc())
def extract_links(self):
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        titles = []  # article titles already seen in this run
        republishdThre = 5  # stop after finding 5 already-published articles
        republishedCount = 0
        # scroll to the bottom and back to the top so the page loads everything
        js = "var q=document.documentElement.scrollTop=8000"
        driver.execute_script(js)
        js = "var q=document.documentElement.scrollTop=0"
        driver.execute_script(js)
        link_list = driver.find_element_by_class_name(
            "d_list_txt").find_elements_by_tag_name("li")
        LOGGER.debug("found %d articles" % (len(link_list), ))
        for elem in link_list:
            title = elem.text  # article title
            if title in titles:
                continue
            LOGGER.debug("article title %s" % (title, ))
            url = elem.find_element_by_class_name(
                "c_tit").find_element_by_tag_name("a").get_attribute("href")
            LOGGER.info("url:%s" % (url, ))
            if self.isPublished(url) is False:
                abstract = ""  # this site exposes no abstract in the list
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                # the remaining urls were already published
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
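# Every extract_links() above repeats the same dedup / early-stop / publish
# loop and differs only in how it locates titles, urls, and abstracts. A
# hypothetical shared helper the extractors could call; the method name and
# the (title, url, abstract) tuple shape are assumptions, while the duplicate
# threshold and the formatMsg/publishMsg flow mirror the originals.
def publish_new_articles(self, articles, republishdThre=5):
    """articles: iterable of (title, url, abstract) tuples."""
    seen = set()
    republishedCount = 0
    for title, url, abstract in articles:
        if title in seen:
            continue
        seen.add(title)
        LOGGER.info("url:%s" % (url, ))
        if self.isPublished(url):
            republishedCount += 1
            if republishedCount >= republishdThre:
                break  # the rest of the page was already published
            continue
        msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
        self.publishMsg(msg)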