예제 #1
0
    def startParsing(self):
        """Parse ``self.htmlStr`` and collect pagination links and histories.

        Every ``<div class="row">`` of the document is inspected for:

        * ``<span class="pagina">`` elements. A span without any link sets
          ``self.currentPage`` to the span text. For a span with links, its
          first ``href`` is appended to ``self.nextLinks``, but only while
          ``self.currentPage`` is not ``None``.
        * ``<div class="col-xs-6">`` links. A link whose stripped text starts
          with ``#`` starts a new ``History`` (built from the text without
          the ``#``) and sets its URL to ``self.baseURL`` plus the ``href``.
          A link whose stripped text matches ``self.datePattern`` sets the
          history time. A link whose ``href`` starts with ``/bytag`` adds
          the link text as a tag.
        * ``<div class="col-xs-12">`` blocks, told apart by their exact
          inline ``style`` value: one carries the post text, the other the
          vote count inside a ``<b>`` element. A vote block also appends
          the history to ``self.list_of_histories``.

        Fragments encountered before any ``#`` identifier link has been seen
        are skipped, as are centered blocks without a ``<b>`` element, so
        rows that do not describe a post no longer raise.
        """
        soup = BeautifulSoup(self.htmlStr, "html.parser")

        # History being assembled. It is intentionally kept across rows (the
        # pieces of one post are not assumed to share a single row) and stays
        # None until the first '#' identifier link is found.
        current_history = None

        for row in soup.find_all('div', class_='row'):
            for page_span in row.find_all('span', class_="pagina"):
                page_links = page_span.find_all('a', href=True)
                if not page_links:
                    # The only pagination entry without a link is taken as
                    # the page currently being displayed.
                    self.currentPage = page_span.getText()
                elif self.currentPage is not None:
                    self.nextLinks.append(page_links[0]['href'])

            # Identifier, date and tags
            for header in row.find_all('div', class_='col-xs-6'):
                for link in header.find_all('a', href=True):
                    label = link.text.strip()
                    if label.startswith('#'):
                        current_history = History(label[1:])
                        current_history.setURL(self.baseURL + link['href'])
                    elif (current_history is not None
                          and self.datePattern.match(label)):
                        current_history.setHistoryTime(label)
                    # Checked independently of the branches above, for
                    # every link in the header.
                    if (current_history is not None
                            and link['href'].startswith("/bytag")):
                        current_history.addTag(link.text)

            # Text of the post
            for body in row.find_all('div',
                                     class_="col-xs-12",
                                     style="margin:0.5em 0;line-height:1.785em"):
                if current_history is not None:
                    current_history.setHistory(body.get_text().strip())

            # Vote count; this block also finalizes the history
            for footer in row.find_all('div',
                                       class_="col-xs-12",
                                       style='text-align:center'):
                if current_history is None or footer.b is None:
                    # Nothing to attach the votes to, or no <b> element
                    # holding them: skip this block.
                    continue
                current_history.setVotes(footer.b.get_text())
                self.list_of_histories.append(current_history)