def getarticle(self, headline, lines):
    article = Article(headline=headline)

    # Join the body lines, undoing end-of-line hyphenation and line breaks.
    text = ""
    for line in lines[2:]:
        if len(line) > 2:
            text += "\n" + line
    text = text.replace("-\n", "")
    text = text.replace("  ", " ")  # collapse double spaces
    text = text.replace("\n", " ")
    article.text = text

    # The second line carries the metadata: a dd-mm-yyyy date and,
    # optionally, a "(p.<nr>)" page indication.
    date_pattern = re.compile(r"([0-9]{2})-([0-9]{2})-([0-9]{4})")
    result = date_pattern.search(lines[1])
    article.date = date(int(result.group(3)),
                        int(result.group(2)),
                        int(result.group(1)))

    pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
    result = pagenum_pattern.search(lines[1])
    if result:
        article.pagenr = int(result.group(1))

    # Match the headline against the (headline, medium) index to find the medium.
    for h, medium in self.index:
        if article.headline.lower().strip() in h.lower().strip():
            article.set_property("medium", self.get_medium(medium))

    return article
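For reference, the two patterns above expect the metadata line (lines[1]) to look roughly like the sample below. The sample string is an assumption; the real export may differ slightly.

import re
from datetime import date

meta = "De Volkskrant, 12-03-2013 (p.4-5)"  # hypothetical metadata line

m = re.search(r"([0-9]{2})-([0-9]{2})-([0-9]{4})", meta)
print(date(int(m.group(3)), int(m.group(2)), int(m.group(1))))  # 2013-03-12

m = re.search(r"\(p.([0-9]+)([0-9\-]+)?\)", meta)
print(int(m.group(1)))  # 4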
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        # Guard against divs without an id attribute (get() returns None).
        divs = [div for div in _html.cssselect("#sort div")
                if "sort_" in (div.get('id') or "")]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

    for div in divs:
        article = Article(metastring=div.text_content())
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0].text_content()
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(
                articlepage[0].text_content())
        article.medium = self.get_medium(
            div.cssselect("#sourceTitle")[0].text_content())
        date_str = div.cssselect("#articleDate")[0].text_content()
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error('parsing date "{date_str}" failed'.format(**locals()))
        else:
            yield article
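get_pagenum is not shown in this section; the sketch below is a hypothetical reconstruction, assuming the #articlePage field looks like "4 (Binnenland)". Both the input format and the helper body are assumptions, not the scraper's documented behaviour.

import re

def get_pagenum(text):
    """Hypothetical helper: split '4 (Binnenland)' into (4, 'Binnenland').
    The input format is an assumption."""
    m = re.match(r"\s*(\d+)\s*(?:\((.+)\))?\s*$", text)
    if m:
        return int(m.group(1)), m.group(2)
    return None, None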
def _scrape_unit(self, _file):
    """unit: a pdf document"""
    res = ""
    parser = PDFParser()
    doc = parser.load_document(_file, self.options['pdf_password'])
    for page in parser.process_document(doc):
        page_txt = ""
        for line in parser.get_textlines(page):
            page_txt += line.get_text() + "\n"
        res += page_txt + "\n\n"

    article = Article(text=res)
    article.headline = self.getheadline(_file)
    article.medium = self.options['medium']
    article.section = self.options['section']
    if self.options['date']:
        article.date = self.options['date']
    else:
        article.date = date.today()
    yield article
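PDFParser here is the scraping framework's own wrapper, not the class of the same name in pdfminer. As a rough stand-in, the same all-pages text extraction can be sketched with pdfminer.six's high-level API; the wrapper's exact behaviour is an assumption, and the file name and password are illustrative.

from pdfminer.high_level import extract_text

# Concatenates the text of all pages, roughly what the
# load_document/process_document/get_textlines loop above produces.
with open("clippings.pdf", "rb") as f:  # hypothetical file
    res = extract_text(f, password="secret")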
def _scrape_unit(self, _file):
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = read_date(file_date_line.split("Date:")[1])

    # Split the mail into its header and the content, which starts
    # at the line beginning with "1red".
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:  # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():  # headline
            article.title = line
            break
        elif line:  # first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Expand a two-digit year: "...'13" -> "...2013".
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
            if "=" in datestr:
                # The year is not parsable; take the year the mail was
                # sent instead. This may fail around December.
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = read_date(datestr)
                if (article.date - file_date).days > 200:
                    # Likely a misparse, with the mail being sent the
                    # next year. (timedelta has no years= argument, so
                    # approximate one year as 365 days.)
                    article.date -= timedelta(days=365)
            else:
                article.date = read_date(datestr)
            if data[2] in BZK_ALIASES:
                # NOTE: the original looked up data[1] here while testing
                # data[2]; looking up the tested key seems intended.
                medium_str = BZK_ALIASES[data[2]]
            else:
                medium_str = data[2]
            article.set_property("medium", medium_str)
            article.set_property("section", data[1])

    # Gather paragraphs: blank lines end a paragraph, uppercase lines
    # are subheaders, and a trailing "=" marks a soft line break.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of the normal content
            break

    # Add non-ascii characters:
    # takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        char = r"\x{}".format(code).decode('string-escape').decode('latin-1')
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        return char

    article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
    yield article
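Those "=AB" escapes are quoted-printable encoding; the standard library's quopri module decodes them wholesale and could replace the hand-rolled character() above, except for the =92/=85 special cases (cp1252 curly quote and ellipsis) which the code maps by hand. The sample line is hypothetical.

import quopri

raw = "een caf=E9 in Den Haag"  # hypothetical quoted-printable input
text = quopri.decodestring(raw.encode("ascii")).decode("latin-1")
print(text)  # een café in Den Haag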
def _scrape_unit(self, _file):
    readlines = _file.readlines()
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = readDate(file_date_line.split("Date:")[1])

    # Split the mail into its header and the content, which starts
    # at the line beginning with "1red".
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")

    article = Article(metastring={'mail_header': "".join(mail_header)})

    while True:  # loop through lines up to and including the headline
        line = lines.pop(0)
        if line.isupper():  # headline
            article.headline = line
            break
        elif line:  # first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Expand a two-digit year: "...'13" -> "...2013".
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
            if "=" in datestr:
                # The year is not parsable; take the year the mail was
                # sent instead. This may fail around December.
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = readDate(datestr)
                if (article.date - file_date).days > 200:
                    # Likely a misparse, with the mail being sent the
                    # next year. (timedelta has no years= argument, so
                    # approximate one year as 365 days.)
                    article.date -= timedelta(days=365)
            else:
                article.date = readDate(datestr)
            if data[2] in BZK_ALIASES:
                # NOTE: the original looked up data[1] here while testing
                # data[2]; looking up the tested key seems intended.
                medium_str = BZK_ALIASES[data[2]]
            else:
                medium_str = data[2]
            article.medium = Medium.get_or_create(medium_str)
            article.section = data[1]

    # Gather paragraphs: blank lines end a paragraph, uppercase lines
    # are subheaders, and a trailing "=" marks a soft line break.
    paragraphs = []
    paragraph = ""
    while True:
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]

    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        if p.startswith("(") and len(p.split(",")) > 1:
            # last line of the normal content
            break

    yield article
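A quick worked example of the 200-day sanity check used above, with illustrative dates: a clipping dated "30-12" inside a mail sent on 2013-01-02 is first parsed with the mail's year, landing almost a year in the future, and is then pulled back.

from datetime import date, timedelta

file_date = date(2013, 1, 2)   # illustrative: when the mail was sent
parsed = date(2013, 12, 30)    # "30-12" plus the mail's year: wrong side of New Year
if (parsed - file_date).days > 200:
    parsed -= timedelta(days=365)  # the one-year correction
print(parsed)  # 2012-12-30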