def download_articles(self):
    """Download every Rawurl row that has not been fetched yet.

    Queries the session for Rawurl rows whose state equals
    Rawurl.STATE_NOT_DOWNLOADED. For each row the page at ``rawurl.url``
    is fetched with my_urlopen, passed through BeautifulSoup, and the
    resulting markup is written to ``html_content``. The row also gets
    today's date as ``download_date`` and its state is set to
    Rawurl.STATE_UNPARSED.

    Each row is committed as soon as it has been updated, so an exception
    raised while handling a later URL cannot discard pages that were
    already downloaded. Exceptions from the download or the parser are
    not caught here; they propagate to the caller.
    """
    pending = (
        self.session.query(Rawurl)
        .filter_by(state=Rawurl.STATE_NOT_DOWNLOADED)
        .all()
    )
    for rawurl in pending:
        # Download.
        pagehtml = my_urlopen(rawurl.url)

        # Store the parser's serialisation of the page, not the raw bytes.
        rawurl.html_content = str(BeautifulSoup(pagehtml))
        rawurl.download_date = date_get_today()
        rawurl.state = Rawurl.STATE_UNPARSED

        # Persist this row now so progress survives a failure further on.
        self.session.commit()
def get_date_from_soup(self, soup):
    """Extract the article date from a parsed page.

    Looks for a ``<div class="time">`` element first; when there is none,
    falls back to the first child of ``<p class="autor_line">``. The
    element's text is split on single spaces and the second token decides
    the result:

    * ``'dnes'``  -> the value of date_get_today()
    * ``'včera'`` -> the value of date_get_yesterday()
    * otherwise tokens 1..3 are joined and parsed as ``"%d. %m. %Y"``.

    Returns the resulting date, or None when anything in the lookup or
    the parsing raises (the method is deliberately best-effort).
    """
    try:
        tag = soup.find('div', attrs={'class': 'time'})
        if tag is None:
            # Fallback location of the date: first child of the author line.
            tag = soup.find('p', attrs={'class': 'autor_line'}).contents[0]

        tokens = tag.text.split(" ")
        marker = tokens[1]

        if marker == 'dnes':
            return date_get_today()
        if marker == 'včera':
            return date_get_yesterday()

        # Explicit date: day, month and year are three separate tokens.
        stamp = " ".join(tokens[1:4])
        return datetime.datetime.strptime(stamp, "%d. %m. %Y").date()
    except Exception:
        return None
def store_link(self, link, htmlcontent):
    """Create a Rawurl for ``link`` and add it to the session.

    The new row is built with state Rawurl.STATE_UNPARSED, the given
    ``htmlcontent`` and the value of date_get_today(). It is added to
    ``self.session``; no commit is issued here.

    Returns the newly created Rawurl object.
    """
    stored_on = date_get_today()
    record = Rawurl(link, Rawurl.STATE_UNPARSED, htmlcontent, stored_on)
    self.session.add(record)
    return record