def parse_item(self, item):
    # item: a list of html tags
    article = Article(metastring={})
    article.text = self._parse_text(item)
    for tag in item:
        if tag.tag == "h2":
            if tag.text:
                article.headline = tag.text
            else:
                article.headline = tag.cssselect("span")[0].text_content()
        elif tag.tag == "i" or (tag.tag == "p" and tag.cssselect("i")):
            article = self.parse_dateline(tag.text_content(), article)
    if not article.headline:
        raise Exception("Article has no headline")
    return article

def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        article = Article(metastring={})
        article.metastring['html'] = div
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text)
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article

def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        article = Article(metastring=div.text_content())
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0].text_content()
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text_content())
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text_content())
        date_str = div.cssselect("#articleDate")[0].text_content()
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article

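# get_pagenum is used by scrape_1 above but is not defined in this section.
# A minimal sketch, under the assumption that the "#articlePage" text looks
# like "pagina 12, Binnenland" (the real format is not shown here); assumes
# `re` is imported at module level, as in _scrape_unit further below:
def get_pagenum(self, text):
    """Hypothetical sketch: extract (page number, section) from page text."""
    match = re.search(r"(\d+)[,\s]*(.*)", text)
    if match:
        return int(match.group(1)), match.group(2).strip() or None
    return None, None
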
def parse_item(self, item):
    # item: a list of html tags
    article = Article()
    article.text = self._parse_text(item)
    headline_found = False
    dateline_found = False
    for tag in item:
        if tag.tag == "h2" and not headline_found:
            if tag.text:
                article.headline = tag.text
            else:
                article.headline = tag.cssselect("span")[0].text_content()
            headline_found = True
        # parenthesized so 'not dateline_found' guards both alternatives
        elif (tag.tag == "i" or (tag.tag == "p" and tag.cssselect("i"))) and not dateline_found:
            article = self.parse_dateline(tag.text_content(), article)
            dateline_found = True
    if not article.headline:
        raise Exception("Article has no headline")
    return article

def parse_item(self, item):
    # item: a list of html tags
    article = Article(metastring={})
    article.text = self._parse_text(item)
    for tag in item:
        if tag.tag == "h2":
            article.headline = tag.text
        elif tag.tag == "i":
            article = self.parse_dateline(tag.text_content(), article)
    return article

def parse_item(self, item):
    # item: a list of html tags
    article = Article(metastring={})
    for tag in item:
        if tag.tag in ("p", "div"):
            # append to the existing text, or start the list on the first tag
            if hasattr(article, 'text') and article.text:
                article.text.append(tag)
            else:
                article.text = [tag]
        elif tag.tag == "h2":
            article.headline = tag.text
        elif tag.tag == "i":
            article = self.parse_dateline(tag.text, article)
    # process html
    article.text = "\n".join([html2text(html.tostring(bit)) for bit in article.text])
    return article

def parse_item(self, item):
    # item: a list of html tags
    article = Article(metastring={})
    for tag in item:
        if tag.tag in ("p", "div"):
            # append to the existing text, or start the list on the first tag
            if hasattr(article, 'text') and article.text:
                article.text.append(tag)
            else:
                article.text = [tag]
        elif tag.tag == "h2":
            article.headline = tag.text
        elif tag.tag == "i":
            article = self.parse_dateline(tag.text_content(), article)
    # process html
    article.text = "\n".join([html2text(html.tostring(bit)) for bit in article.text])
    return article

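# parse_dateline is called by every parse_item variant above but is not
# defined in this section. A minimal sketch, assuming datelines of the form
# "PLACE, <date> ..." and the same readDate/log helpers used elsewhere here;
# the real method may extract more fields:
def parse_dateline(self, text, article):
    """Hypothetical sketch: read a date out of a 'PLACE, date' dateline."""
    parts = [p.strip() for p in text.split(",", 1)]
    if len(parts) == 2:
        try:
            article.date = readDate(parts[1])
        except ValueError:
            log.error("parsing dateline \"{text}\" failed".format(**locals()))
    return article
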
def _scrape_unit(self, _file):
    """unit: a pdf document"""
    res = ""
    parser = PDFParser()
    doc = parser.load_document(_file, self.options['pdf_password'])
    for page in parser.process_document(doc):
        page_txt = ""
        for line in parser.get_textlines(page):
            page_txt += line.get_text() + "\n"
        res += page_txt + "\n\n"
    article = Article(text=res)
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']
    if self.options['date']:
        article.date = self.options['date']
    else:
        article.date = date.today()
    yield article

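# getheadline is called above but not defined in this section. Since the only
# input is the uploaded file object, a plausible minimal sketch falls back to
# the file name (an assumption, not necessarily the project's actual logic):
import os

def getheadline(self, _file):
    """Hypothetical sketch: derive a headline from the file name."""
    name = getattr(_file, 'name', None) or "unknown document"
    # '/tmp/report-2014.pdf' -> 'report-2014'
    return os.path.splitext(os.path.basename(name))[0]
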
def scrape_3(self, _html):
    """Some ugly MS Word format, as of 2014-03-03"""
    # Partition articles on <hr> separators
    part = []
    articles = []
    for tag in _html.cssselect("body > div > *"):
        if tag.cssselect("hr"):
            articles.append(part)
            part = []
        else:
            part.append(tag)
    for tags in articles[1:]:
        article = Article()
        dateline = tags[1].text_content().strip()
        article = self.parse_dateline(dateline, article)
        # the headline precedes the dateline; the body starts at the third tag
        article.headline = tags[0].text_content().strip()
        html_str = "".join([html.tostring(t) for t in tags[2:]])
        article.text = html2text(html_str)
        article.metastring = {'html': html_str}
        yield article

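# The partition loop in scrape_3 only flushes a chunk when it sees an <hr>
# separator, so tags after the last <hr> are dropped and articles[1:] skips
# the preamble before the first one. The same pattern in isolation, with a
# hypothetical is_separator predicate:
def partition_on(elements, is_separator):
    """Split a flat list into chunks, starting a new chunk at each separator."""
    chunks, chunk = [], []
    for el in elements:
        if is_separator(el):
            chunks.append(chunk)
            chunk = []
        else:
            chunk.append(el)
    chunks.append(chunk)  # unlike scrape_3, keep the trailing chunk too
    return chunks

# partition_on(list("ab|cd|e"), "|".__eq__) == [['a', 'b'], ['c', 'd'], ['e']]
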
def _scrape_unit(self, _file):
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = readDate(file_date_line.split("Date:")[1])
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):
            # actual content starts
            lines.append("")
    article = Article(metastring={'mail_header': "".join(mail_header)})
    while True:
        # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():
            # headline
            article.headline = line
            break
        elif line:
            # first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
            if "=" in datestr:
                # if this is true, the year is not parsable;
                # we take the year the mail was sent, might fail around December
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = readDate(datestr)
                if (article.date - file_date).days > 200:
                    # likely a misparse, with the mail being sent the next year;
                    # timedelta has no 'years' argument, so subtract ~one year in days
                    article.date -= timedelta(days=365)
            else:
                article.date = readDate(datestr)
            # the medium name is in data[2]; map it through the alias table if present
            if data[2] in BZK_ALIASES.keys():
                medium_str = BZK_ALIASES[data[2]]
            else:
                medium_str = data[2]
            article.medium = Medium.get_or_create(medium_str)
            article.section = data[1]
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():
            # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]
    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of normal content
            break

    # Add non-ascii characters:
    # takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        char = r"\x{}".format(code).decode('string-escape').decode('latin-1')
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        return char

    article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
    yield article

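# The "=AB" escapes handled by character() above are MIME quoted-printable
# encoding. Apart from the "=92"/"=85" smart-quote special cases, the
# standard-library quopri module decodes the general case; a minimal sketch,
# matching the surrounding code's Python 2 idioms:
import quopri

def decode_quoted_printable(text):
    """Decode '=AB'-style quoted-printable escapes to latin-1 text."""
    return quopri.decodestring(text).decode('latin-1')

# decode_quoted_printable("caf=E9") == u"caf\xe9"  # i.e. 'café'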