def getarticle(self, headline, lines):
    """Build an Article from one raw clipping.

    Parameters:
        headline: the article headline.
        lines: raw clipping lines; lines[1] carries the "dd-mm-yyyy" date
            and an optional "(p.N)" page marker, lines[2:] the body text.

    Returns the populated Article.
    Raises ValueError if no date can be found in lines[1].
    """
    article = Article(headline=headline)

    # Keep only substantive lines (len > 2), then clean up artifacts of the
    # clipping format: rejoin hyphenated words, collapse double spaces, and
    # flatten everything onto a single line.
    text = "".join("\n" + line for line in lines[2:] if len(line) > 2)
    text = text.replace("-\n", "")   # undo end-of-line hyphenation
    text = text.replace("  ", " ")   # collapse double spaces (was a no-op)
    text = text.replace("\n", " ")
    article.text = text

    # Date appears as dd-mm-yyyy in the metadata line; fail loudly rather
    # than crash with AttributeError on a missing match.
    date_match = re.search(r"([0-9]{2})-([0-9]{2})-([0-9]{4})", lines[1])
    if date_match is None:
        raise ValueError("no dd-mm-yyyy date found in %r" % lines[1])
    article.date = date(
        int(date_match.group(3)),
        int(date_match.group(2)),
        int(date_match.group(1)),
    )

    # Optional page marker, e.g. "(p.4)" or "(p.4-5)"; the first number wins.
    page_match = re.search(r"\(p\.([0-9]+)([0-9-]+)?\)", lines[1])
    if page_match:
        article.pagenr = int(page_match.group(1))

    # Resolve the medium via the (headline, medium) index; substring match,
    # last matching entry wins (mirrors the original loop without break).
    for indexed_headline, medium in self.index:
        if article.headline.lower().strip() in indexed_headline.lower().strip():
            article.set_property("medium", self.get_medium(medium))

    return article
def getarticle(self, headline, lines):
    """Construct an Article from a clipping's headline and raw lines.

    lines[1] is the metadata line ("dd-mm-yyyy" date plus optional "(p.N)"
    page marker); lines[2:] are the body text.

    Returns the populated Article.
    Raises ValueError when the metadata line contains no parseable date.
    """
    article = Article(headline=headline)

    # Body text: drop near-empty lines, rejoin words hyphenated across line
    # breaks, collapse doubled spaces, and flatten to one line.
    parts = [line for line in lines[2:] if len(line) > 2]
    text = "".join("\n" + part for part in parts)
    text = text.replace("-\n", "")
    text = text.replace("  ", " ")   # was replace(" ", " ") — a no-op bug
    text = text.replace("\n", " ")
    article.text = text

    # Raw strings avoid the invalid "\-" escape of the original patterns.
    date_pattern = re.compile(r"([0-9]{2})-([0-9]{2})-([0-9]{4})")
    result = date_pattern.search(lines[1])
    if result is None:
        # The original dereferenced None here (AttributeError); raise a
        # clear error instead.
        raise ValueError("no dd-mm-yyyy date found in %r" % lines[1])
    article.date = date(
        int(result.group(3)), int(result.group(2)), int(result.group(1)))

    # Page number is optional; "." after "p" is now escaped so only a
    # literal "(p.N)" matches.
    pagenum_pattern = re.compile(r"\(p\.([0-9]+)([0-9-]+)?\)")
    result = pagenum_pattern.search(lines[1])
    if result:
        article.pagenr = int(result.group(1))

    # Attach the medium from the (headline, medium) index; substring match.
    for h, medium in self.index:
        if article.headline.lower().strip() in h.lower().strip():
            article.set_property("medium", self.get_medium(medium))

    return article
def scrape_1(self, _html, t):
    """format of mostly 2013

    Yields one Article per matching div in _html; `t` selects which export
    layout ('werkmap' or 'intranet/rss') to parse.
    Raises ValueError when `t` matches neither layout.
    """
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [
            div for div in _html.cssselect("#sort div")
            if "sort_" in div.get('id')
        ]
    else:
        # Without this guard `divs` is unbound and the loop below raises
        # UnboundLocalError.
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        article = Article(metastring={})
        article.metastring['html'] = div
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        # NOTE(review): stores the lxml element itself, not its text — the
        # sibling variant uses text_content(); confirm which one downstream
        # code expects.
        article.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(
                articlepage[0].text)
        article.medium = self.get_medium(
            div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.date = readDate(date_str)
        except ValueError:
            # Skip articles whose date we cannot parse; log for follow-up.
            log.error(
                "parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    # Choose the container nodes according to which export layout we got.
    if "werkmap" in t:
        candidates = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        candidates = [
            node for node in _html.cssselect("#sort div")
            if "sort_" in node.get('id')
        ]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

    for node in candidates:
        art = Article(metastring=node.text_content())
        art.headline = node.cssselect("#articleTitle")[0].text_content()
        art.text = node.cssselect("#articleIntro")[0].text_content()

        # Page/section metadata is optional.
        page_nodes = node.cssselect("#articlePage")
        if page_nodes:
            art.pagenr, art.section = self.get_pagenum(
                page_nodes[0].text_content())

        source_title = node.cssselect("#sourceTitle")[0].text_content()
        art.medium = self.get_medium(source_title)

        # An unparseable date means the article is logged and dropped.
        date_str = node.cssselect("#articleDate")[0].text_content()
        try:
            art.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
            continue
        yield art