def getPageCount(self):
    """Return the number of result pages for ``self.query`` on hkn24.com.

    Fetches page 1 of the article-list search, reads the total article
    count from the page, and assumes 20 articles per page.

    Returns:
        int: total article count floor-divided by 20.
    """
    searchUrl = (
        "http://www.hkn24.com/news/articleList.html?page=1"
        "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A"
        "&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate="
        "&sc_serial_number=&sc_word=" + self.query + "&sc_view_code=&view_type="
    )
    soup = Soup.phantomjs(searchUrl)
    # The total hit count is rendered inside a <font color='#333333'> element;
    # strip every non-digit before parsing. Raw string r"\D" avoids the
    # invalid-escape-sequence warning that "\D" triggers on modern Python.
    pageCount = int(re.sub(r"\D", "", soup.find('font', color='#333333').get_text()))
    # Floor division on non-negative ints equals math.floor(pageCount / 20)
    # without the int -> float -> int round trip.
    return pageCount // 20
def getPageCount(self):
    """Return the number of result pages for ``self.query`` on khanews.com.

    Fetches the article-list search (start date pinned to 2000.01.01),
    reads the total article count from the page, and assumes 20 articles
    per page.

    Returns:
        int: total article count floor-divided by 20.
    """
    searchUrl = (
        "http://www.khanews.com/news/articleList.html?page="
        "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A"
        "&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=2000.01.01"
        "&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&view_type="
    )
    soup = Soup.phantomjs(searchUrl)
    # The count lives in a <font color="#333333"> inside the white table cell.
    tbodys = soup.find("td", bgcolor="#FFFFFF")
    # Raw string r"\D" avoids the invalid-escape-sequence warning of "\D".
    pageCount = int(re.sub(r"\D", "", tbodys.find("font", color="#333333").get_text()))
    # Floor division on non-negative ints equals math.floor(pageCount / 20).
    return pageCount // 20
def getPageHrefs(self, count): searchUrl = "http://www.hkn24.com/news/articleList.html?page=" + str( count ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&sc_view_code=&view_type=" soup = Soup.phantomjs(searchUrl) searchBox = soup.find_all('td', class_='ArtList_Title') pageHrefs = [ "http://www.hkn24.com/news/" + x.a.get('href') for x in searchBox ] return pageHrefs
def getPageHrefs(self, count): searchUrl = "http://www.doctorsnews.co.kr/news/articleList.html?page=" + str( count ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_add_section_code=&sc_add_sub_section_code=&sc_add_serial_code=&sc_area=A&sc_level=&sc_m_level=&sc_article_type=&sc_view_level=&sc_sdate=" + self.startDate + "&sc_edate=" + self.endDate + "&sc_serial_number=&sc_word=" + self.query + "&sc_word2=&sc_andor=OR&sc_order_by=I&view_type=" soupArticle = Soup.phantomjs(searchUrl) articleHrefList = soupArticle.find_all("a", class_="news_list_title") pageHrefs = [ "http://www.doctorsnews.co.kr/news/" + x.get('href') for x in articleHrefList ] return pageHrefs
def getPageHrefs(self, count): searchUrl = "http://www.khanews.com/news/articleList.html?page=" + str( count ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=2000.01.01&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&view_type=" soup = Soup.phantomjs(searchUrl) searchBox = soup.find("td", bgcolor="#FFFFFF").find_all("font", color="#001DD0") pageHrefs = [ "http://www.khanews.com/news/" + x.parent.get('href') for x in searchBox ] return pageHrefs
def getPageCount(self):
    """Return the number of result pages for ``self.query`` on
    doctorsnews.co.kr between ``self.startDate`` and ``self.endDate``.

    Fetches page 1 of the article-list search, reads the total article
    count from the page, and assumes 25 articles per page.

    Returns:
        int: total article count floor-divided by 25.
    """
    searchUrl = (
        "http://www.doctorsnews.co.kr/news/articleList.html?page=1"
        "&sc_section_code=&sc_sub_section_code=&sc_serial_code="
        "&sc_add_section_code=&sc_add_sub_section_code=&sc_add_serial_code="
        "&sc_area=A&sc_level=&sc_m_level=&sc_article_type=&sc_view_level="
        "&sc_sdate=" + self.startDate + "&sc_edate=" + self.endDate
        + "&sc_serial_number=&sc_word=" + self.query
        + "&sc_word2=&sc_andor=OR&sc_order_by=I&view_type="
    )
    soup = Soup.phantomjs(searchUrl)
    # The total hit count is the first cell of the 35px-high header row.
    articleCnt = soup.find("tr", height="35").td.get_text()
    # Raw string r"\D" avoids the invalid-escape-sequence warning of "\D".
    maxArticle = int(re.sub(r"\D", "", articleCnt))
    # Floor division replaces the redundant math.floor(int(maxArticle / 25))
    # double truncation; identical result for non-negative counts.
    return maxArticle // 25
def getPageHrefs(self, count): searchUrl = "http://www.doctorstimes.com/news/articleList.html?page="+str(count)+"&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word="+self.query+"&sc_word2=&sc_andor=&sc_order_by=E&view_type=" soup = Soup.phantomjs(searchUrl,'html.parser') searchBox = soup.find_all('td',class_='list-titles list-pad-5') pageHrefs = ["http://www.doctorstimes.com/news/"+x.a.get('href') for x in searchBox] return pageHrefs
def getPageCount(self):
    """Return the number of result pages for ``self.query`` on
    doctorstimes.com.

    Fetches page 1 of the article-list search, reads the total article
    count via a CSS selector into the result table header, and assumes
    20 articles per page.

    Returns:
        int: total article count floor-divided by 20.
    """
    searchUrl = (
        "http://www.doctorstimes.com/news/articleList.html?page=1"
        "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A"
        "&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate="
        "&sc_serial_number=&sc_word=" + self.query
        + "&sc_word2=&sc_andor=&sc_order_by=E&view_type="
    )
    soup = Soup.phantomjs(searchUrl)
    # The count cell is addressed by a deep positional CSS selector into
    # the article-list table; strip non-digits before parsing. Raw string
    # r"\D" avoids the invalid-escape-sequence warning of "\D".
    countCell = soup.select(
        "#article-list > tbody > tr > td > table > tbody > tr:nth-of-type(1)"
        " > td > table > tbody > tr > td:nth-of-type(1)"
    )[0]
    pageCount = int(re.sub(r"\D", "", countCell.get_text()))
    # Floor division on non-negative ints equals math.floor(pageCount / 20).
    return pageCount // 20