def _is_date(string):
    """Return whether toolkit.read_date accepts *string* as a date."""
    try:
        toolkit.read_date(string)
    except ValueError:
        return False
    else:
        return True
def _is_date(string, language_pool=None):
    """Return whether *string* can be parsed as a date.

    Strings without any digit are rejected up front: a date always contains
    a number, and the date parser is very slow on non-matches.

    :param string: candidate date string
    :param language_pool: optional language pool forwarded to toolkit.read_date
    """
    # Raw string fixes the invalid "\d" escape sequence warning.
    if not re.search(r"\d", string):
        # no number = no date; optimization because dateparse is very slow on non-matches
        return False
    try:
        toolkit.read_date(string, language_pool=language_pool)
    except ValueError:
        return False
    return True
def _is_date(string):
    """Return whether *string* can be parsed as a date by toolkit.read_date.

    Strings without any digit are rejected up front: a date always contains
    a number, and the date parser is very slow on non-matches.
    """
    # Raw string fixes the invalid "\d" escape sequence warning.
    if not re.search(r"\d", string):
        # no number = no date; optimization because dateparse is very slow on non-matches
        return False
    try:
        toolkit.read_date(string)
    except ValueError:
        return False
    return True
def parse_dateline(cls, text, article):
    """Parse a dateline of the form "<medium> <date>" into *article*.

    Recognizes two date layouts: a single dash-separated final token
    (e.g. "31-12-2013") or three trailing tokens of which the first and
    last are numeric (e.g. "31 december 2013"). Anything else is treated
    as a medium-only dateline, leaving the date as None.

    Returns the (mutated) article dict.
    """
    bits = text.split()
    # Guard against empty/short datelines: the original indexed bits[-1]
    # and bits[-3] unconditionally, raising IndexError on < 3 tokens.
    if bits and "-" in bits[-1]:
        article["date"] = read_date(bits[-1])
        article["medium"] = cls.get_medium(" ".join(bits[:-1]))
    elif len(bits) >= 3 and bits[-1].isdigit() and bits[-3].isdigit():
        article["date"] = read_date(" ".join(bits[-3:]))
        article["medium"] = cls.get_medium(" ".join(bits[:-3]))
    else:
        article["medium"] = cls.get_medium(" ".join(bits))
        article["date"] = None
    return article
def parse_dateline(cls, text, article):
    """Parse a dateline of the form "<medium> <date>" into *article*.

    Recognizes two date layouts: a single dash-separated final token
    (e.g. "31-12-2013") or three trailing tokens of which the first and
    last are numeric (e.g. "31 december 2013"). Anything else is treated
    as a medium-only dateline, leaving the date as None.

    :raises ParseError: if the dateline is empty.
    """
    bits = text.split()
    if not bits:
        raise ParseError("Couldn't find date in article: {}".format(article['title']))
    if "-" in bits[-1]:
        article["date"] = read_date(bits[-1])
        article["medium"] = cls.get_medium(" ".join(bits[:-1]))
    # Length guard prevents IndexError on bits[-3] for 1-2 token datelines.
    elif len(bits) >= 3 and bits[-1].isdigit() and bits[-3].isdigit():
        article["date"] = read_date(" ".join(bits[-3:]))
        article["medium"] = cls.get_medium(" ".join(bits[:-3]))
    else:
        article["medium"] = cls.get_medium(" ".join(bits))
        article["date"] = None
    return article
def test_readdate(self):
    """toolkit.read_date should handle a wide variety of date formats."""
    cases = [
        ("22 maart 1980", datetime.datetime(1980, 3, 22)),
        ("22 mrt 1980", datetime.datetime(1980, 3, 22)),
        ("22/3/1980", datetime.datetime(1980, 3, 22)),
        ("1980-05-02", datetime.datetime(1980, 5, 2)),
        ("1980-3-22", datetime.datetime(1980, 3, 22)),
        ("1980-3-22T01:00:05", datetime.datetime(1980, 3, 22, 1, 0, 5)),
        ("1980-3-22 01:00", datetime.datetime(1980, 3, 22, 1, 0)),
        ("1980-3-22 01:00 PM", datetime.datetime(1980, 3, 22, 13, 0)),
        ("1/1/98", datetime.datetime(1998, 1, 1)),
        ("1/1/04", datetime.datetime(2004, 1, 1)),
        ("31/12/72", datetime.datetime(1972, 12, 31)),
        ("1/2/1972", datetime.datetime(1972, 2, 1)),
        ("30.09.2008", datetime.datetime(2008, 9, 30)),
        ("02.09.2008", datetime.datetime(2008, 9, 2)),
        ("30-09-2008", datetime.datetime(2008, 9, 30)),
        ("02-09-2008", datetime.datetime(2008, 9, 2)),
        ("31. Januar 2009", datetime.datetime(2009, 1, 31)),
        ("March 31, 2003", datetime.datetime(2003, 3, 31)),
        ("December 31, 2009 Thursday", datetime.datetime(2009, 12, 31)),
        (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30)),
        ('31. Maerz 2003', datetime.datetime(2003, 3, 31)),
        ('September 1, 2008 Monday 12:44 PM AEST', datetime.datetime(2008, 9, 1, 12, 44)),
        ('23aug2013', datetime.datetime(2013, 8, 23)),
    ]
    for raw, expected in cases:
        parsed = toolkit.read_date(raw)
        self.assertEqual(expected, parsed, "while parsing {}".format(repr(raw)))
def test_readdate(self):
    """Each known input format should parse to the expected datetime."""
    for raw_input, want in (
        ("22 maart 1980", datetime.datetime(1980, 3, 22, 0, 0, 0)),
        ("22 mrt 1980", datetime.datetime(1980, 3, 22, 0, 0, 0)),
        ("22/3/1980", datetime.datetime(1980, 3, 22, 0, 0, 0)),
        ("1980-05-02", datetime.datetime(1980, 5, 2, 0, 0, 0)),
        ("1980-3-22", datetime.datetime(1980, 3, 22, 0, 0, 0)),
        ("1980-3-22T01:00:05", datetime.datetime(1980, 3, 22, 1, 0, 5)),
        ("1980-3-22 01:00", datetime.datetime(1980, 3, 22, 1, 0, 0)),
        ("1980-3-22 01:00 PM", datetime.datetime(1980, 3, 22, 13, 0, 0)),
        ("1/1/98", datetime.datetime(1998, 1, 1, 0, 0, 0)),
        ("1/1/04", datetime.datetime(2004, 1, 1, 0, 0, 0)),
        ("31/12/72", datetime.datetime(1972, 12, 31, 0, 0, 0)),
        ("1/2/1972", datetime.datetime(1972, 2, 1, 0, 0, 0)),
        ("30.09.2008", datetime.datetime(2008, 9, 30, 0, 0, 0)),
        ("02.09.2008", datetime.datetime(2008, 9, 2, 0, 0, 0)),
        ("30-09-2008", datetime.datetime(2008, 9, 30, 0, 0, 0)),
        ("02-09-2008", datetime.datetime(2008, 9, 2, 0, 0, 0)),
        ("31. Januar 2009", datetime.datetime(2009, 1, 31, 0, 0, 0)),
        ("March 31, 2003", datetime.datetime(2003, 3, 31, 0, 0, 0)),
        ("December 31, 2009 Thursday", datetime.datetime(2009, 12, 31, 0, 0, 0)),
        (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30, 0, 0, 0)),
        ('31. Maerz 2003', datetime.datetime(2003, 3, 31, 0, 0, 0)),
        ('September 1, 2008 Monday 12:44 PM AEST', datetime.datetime(2008, 9, 1, 12, 44)),
        ('23aug2013', datetime.datetime(2013, 8, 23, 0, 0, 0)),
    ):
        got = toolkit.read_date(raw_input)
        self.assertEqual(want, got, "while parsing {}".format(repr(raw_input)))
def scrape_1(cls, _html, t):
    """format of mostly 2013"""
    # Two layouts exist: "werkmap" pages keep articles in #articleTable,
    # intranet/rss pages keep them in #sort with "sort_<n>" element ids.
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        # keep the raw text of the whole div for reference/debugging
        article = {"html": div.text_content()}
        article["title"] = div.cssselect("#articleTitle")[0].text_content()
        article["text"] = div.cssselect("#articleIntro")[0].text_content()
        # page number / section are optional
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article["pagenr"], article["section"] = cls.get_pagenum(articlepage[0].text_content())
        article["medium"] = cls.get_medium(div.cssselect("#sourceTitle")[0].text_content())
        date_str = div.cssselect("#articleDate")[0].text_content()
        try:
            article["date"] = read_date(date_str)
        except ValueError:
            # articles with unparsable dates are logged and skipped
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def scrape_2(cls, _html):
    """New format as of 2014 and a few days before"""
    title = _html.cssselect("h1")[0]
    # the <h1> text is sometimes nested in a <span>
    if not title.text:
        title = title.cssselect("span")[0]
    # document-level date, used as fallback for items without their own date
    docdate = read_date(title.text.split("-")[1])
    # split body by <hr>
    items = []
    item = []
    if len(_html.cssselect("body > hr")) == 0:
        # select MS Word div wrapper
        tags = _html.cssselect("body > div.WordSection1 > *")
        if len(tags) == 0:
            raise ParseError("Document format is not supported")
    else:
        tags = _html.cssselect("body > *")
    for child in tags:
        # an <hr> (possibly wrapped in div > span) terminates the current item
        if child.tag == "hr" or (child.tag == "div" and child.cssselect("span > hr")):
            items.append(item)
            item = []
        else:
            item.append(child)
    # first item is the index
    items = items[1:]
    for item in items:
        article = cls.parse_item(item)
        if not article["date"]:
            article["date"] = docdate
        yield article
def scrape_2(cls, _html):
    """New format as of 2014 and a few days before"""
    title = _html.cssselect("h1")[0]
    # fall back to a nested <span> when the <h1> itself carries no text
    if not title.text:
        title = title.cssselect("span")[0]
    # date of the whole document; items lacking a date inherit it below
    docdate = read_date(title.text.split("-")[1])
    # split body by <hr>
    items = []
    item = []
    if len(_html.cssselect("body > hr")) == 0:
        # select MS Word div wrapper
        tags = _html.cssselect("body > div.WordSection1 > *")
        if len(tags) == 0:
            raise ParseError("Document format is not supported")
    else:
        tags = _html.cssselect("body > *")
    for child in tags:
        # an <hr> (bare, or wrapped as div > span > hr) ends the current item
        if child.tag == "hr" or (child.tag == "div" and child.cssselect("span > hr")):
            items.append(item)
            item = []
        else:
            item.append(child)
    # first item is the index
    items = items[1:]
    for item in items:
        article = cls.parse_item(item)
        if not article["date"]:
            article["date"] = docdate
        yield article
def scrape_1(cls, _html, t):
    """format of mostly 2013"""
    # choose the article container based on which page variant this is
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        # store the raw div text alongside the parsed fields
        article = {"html": div.text_content()}
        article["title"] = div.cssselect("#articleTitle")[0].text_content()
        article["text"] = div.cssselect("#articleIntro")[0].text_content()
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            # page element is optional; parses into (pagenr, section)
            article["pagenr"], article["section"] = cls.get_pagenum(articlepage[0].text_content())
        article["medium"] = cls.get_medium(div.cssselect("#sourceTitle")[0].text_content())
        date_str = div.cssselect("#articleDate")[0].text_content()
        try:
            article["date"] = read_date(date_str)
        except ValueError:
            # skip (but log) articles whose date cannot be parsed
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def parse_document(self, file):
    """Build an Article from an uploaded file, deriving missing metadata
    from the file's path.

    If no date option was given, the filename must start with
    "<datestring>_"; the remainder becomes the headline fallback.
    .docx/.doc files are converted to text (preferring the converter
    matching the extension); other files are used as-is.
    """
    dirname, filename = os.path.split(file.name)
    filename, ext = os.path.splitext(filename)
    metadata = dict((k, v) for (k, v) in self.options.items()
                    if k in ["medium", "headline", "project", "date", "section"])
    if not metadata["date"]:
        # filename convention: "<datestring>_<headline>"
        datestring, filename = filename.split("_", 1)
        metadata["date"] = toolkit.read_date(datestring)
    # fall back to the filename / directory for a missing headline / section
    # (the original repeated the headline check twice; the second was a no-op)
    if not metadata["headline"].strip():
        metadata["headline"] = filename
    if not metadata["section"].strip():
        metadata["section"] = dirname
    convertors = None
    if ext.lower() == ".docx":
        convertors = [_convert_docx, _convert_doc]
    elif ext.lower() == ".doc":
        convertors = [_convert_doc, _convert_docx]
    if convertors:
        text = _convert_multiple(file, convertors)
    else:
        text = file.text
    return Article(text=text, **metadata)
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    article = test_article()
    post_result = self._post_articles(article)
    # POST should only return IDs
    self.assertEqual({'id'}, set(post_result.keys()))
    fetched = self._get_article(aid=post_result['id'])
    self.assertEqual(fetched["title"], article['title'])
    self.assertEqual(toolkit.read_date(fetched["date"]), toolkit.read_date(article['date']))
    # text is omitted unless explicitly requested, but a hash is present
    self.assertNotIn("text", fetched.keys())
    self.assertIsNotNone(fetched["hash"])
    with_text = self._get_article(aid=fetched['id'], text=True)
    self.assertEqual(with_text["text"], article['text'])
    listing = self._get_articles()["results"]
    self.assertEqual(1, len(listing))
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    posted = test_article()
    ids = self._post_articles(posted)
    self.assertEqual(set(ids.keys()), {'id'})  # POST should only return IDs
    # Retrieve without text: metadata matches, text absent, hash present.
    retrieved = self._get_article(aid=ids['id'])
    self.assertEqual(retrieved["title"], posted['title'])
    self.assertEqual(toolkit.read_date(retrieved["date"]),
                     toolkit.read_date(posted['date']))
    self.assertNotIn("text", retrieved.keys())
    self.assertIsNotNone(retrieved["hash"])
    # Retrieve with text: body round-trips.
    retrieved = self._get_article(aid=retrieved['id'], text=True)
    self.assertEqual(retrieved["text"], posted['text'])
    # Exactly one article exists in the listing.
    listing = self._get_articles()["results"]
    self.assertEqual(len(listing), 1)
def parse_page(doc_elements):
    """Parses an APA page given in a list of Etree elements."""
    doc, elements = doc_elements
    elements = [e for e in elements if not isinstance(e, lxml.html.HtmlComment)]
    # Bold elements are headline candidates, italic elements metadata
    # candidates; everything else is body text.
    headline = set(get_descendants(doc.cssselect("b"))) & set(elements)
    meta = (set(get_descendants(doc.cssselect("i"))) & set(elements)) - headline
    text = set(elements) - (headline | meta)
    headline = sorted(get_roots(headline), key=lambda e:elements.index(e))
    if not headline:
        raise ValueError("No possible headlines found.")
    remove_tree(meta, ["b"])
    remove_tree(text, ["b", "i"])
    # Some text in italics is no metadata. We only use text before headline elements
    # for metadata.
    lesser_than_headline = lambda e:elements.index(e) <= elements.index(headline[0])
    meta = get_nonempty(filter(lesser_than_headline, meta))
    # Parse metadata
    metadata = {}
    for el in list(meta):
        if get_metadata(metadata, el):
            meta.remove(el)
    # anything left unparsed is treated as the byline
    if meta:
        metadata["byline"] = " - ".join(m.text for m in meta)
    # Convert date properties to datetime object
    year, month, day = metadata["year"], metadata["month"], metadata["day"]
    hour, minute = metadata.get("hour"), metadata.get("minute")
    datestring = "{day} {month} {year}"
    if hour is not None and minute is not None:
        datestring += ", {hour}:{minute}"
    metadata["date"] = read_date(datestring.format(**locals()))
    for prop in ("year", "month", "day", "hour", "minute"):
        if prop in metadata:
            del metadata[prop]
    # Clean data and get headline
    metadata["medium"] = metadata.get("medium", "APA - Unknown").strip().strip('"')
    medium, headline = metadata["medium"], "".join(["".join(e.itertext()) for e in headline])
    # drop a leading "<medium> - " prefix from the headline, if present
    if medium in headline:
        headline = headline.split("-", medium.count("-") + 1)[-1]
    metadata["headline"] = headline
    # Get text. Since ordering is lost in sets, restore original order of elements
    return metadata, "".join(get_text(sorted(text, key=lambda e:elements.index(e)))).strip()
def parse_page(doc_elements):
    """Parses an APA page given in a list of Etree elements."""
    doc, elements = doc_elements
    elements = [e for e in elements if not isinstance(e, lxml.html.HtmlComment)]
    # Partition elements: bold => headline candidates, italic => metadata
    # candidates, remainder => body text.
    headline = set(get_descendants(doc.cssselect("b"))) & set(elements)
    meta = (set(get_descendants(doc.cssselect("i"))) & set(elements)) - headline
    text = set(elements) - (headline | meta)
    headline = sorted(get_roots(headline), key=lambda e: elements.index(e))
    if not headline:
        raise ValueError("No possible headlines found.")
    remove_tree(meta, ["b"])
    remove_tree(text, ["b", "i"])
    # Some text in italics is no metadata. We only use text before headline elements
    # for metadata.
    lesser_than_headline = lambda e: elements.index(e) <= elements.index(headline[0])
    meta = get_nonempty(filter(lesser_than_headline, meta))
    # Parse metadata
    metadata = {}
    for el in list(meta):
        if get_metadata(metadata, el):
            meta.remove(el)
    # whatever metadata elements remain unrecognized become the byline
    if meta:
        metadata["byline"] = " - ".join(m.text for m in meta)
    # Convert date properties to datetime object
    year, month, day = metadata["year"], metadata["month"], metadata["day"]
    hour, minute = metadata.get("hour"), metadata.get("minute")
    datestring = "{day} {month} {year}"
    if hour is not None and minute is not None:
        datestring += ", {hour}:{minute}"
    metadata["date"] = read_date(datestring.format(**locals()))
    for prop in ("year", "month", "day", "hour", "minute"):
        if prop in metadata:
            del metadata[prop]
    # Clean data and get headline
    metadata["medium"] = metadata.get("medium", "APA - Unknown").strip().strip('"')
    medium, headline = metadata["medium"], "".join(["".join(e.itertext()) for e in headline])
    # strip a leading "<medium> - " style prefix from the headline
    if medium in headline:
        headline = headline.split("-", medium.count("-") + 1)[-1]
    metadata["headline"] = headline
    # Get text. Since ordering is lost in sets, restore original order of elements
    return metadata, "".join(get_text(sorted(text, key=lambda e: elements.index(e)))).strip()
def test_get(self):
    """A single project should be retrievable with the expected fields."""
    project = amcattest.create_test_project(name="testnaam",
                                            description="testdescription",
                                            insert_date='2012-01-01')
    response = self.get(ProjectResource, id=project.id)
    results = response.pop("results")
    self.assertEqual(len(results), 1)
    result = results[0]
    insert_date = result.pop('insert_date')
    read_date(insert_date)  # check valid date, not much more to check here?
    expected_results = {
        'insert_user': project.insert_user.id,
        'description': 'testdescription',
        'name': u'testnaam',
        'guest_role': 11,
        'owner': project.owner.id,
        'active': True,
        'id': project.id,
        'last_visited_at': "Never",
        'favourite': False,
        "display_columns": [],
        "r_plugins_enabled": False,
    }
    expected_meta = {
        'page': 1,
        'next': None,
        'previous': None,
        'per_page': 10,
        'total': 1,
        'pages': 1,
        'echo': None,
    }
    self.assertDictsEqual(response, expected_meta)
    self.assertDictsEqual(result, expected_results)
def parse_file(self, file: UploadedFile, _data):
    """Yield an Article for each record in *_data*.

    Each record is a list of {"path": ..., "content": ...} fields. The
    'field_map' option maps each article property to either a literal
    value or the content of one of these fields; string values for
    date-typed properties are parsed into datetimes.
    """
    for fields in _data:
        data = {f["path"]: f["content"] for f in fields}
        art = {}
        for field, setting in self.options['field_map'].items():
            datatype = get_property_primitive_type(field)
            value, typ = setting['value'], setting['type']
            val = data.get(value) if typ == 'field' else value
            if val:
                # isinstance (not type() is str) also accepts str subclasses
                if datatype is datetime.datetime and isinstance(val, str):
                    val = toolkit.read_date(val)
                art[field] = val
        yield Article(**art)
def parse_file(self, file: model_UploadedFile, data):
    """Yield an Article per record of a parsed LexisNexis export.

    *data* is a (query, articles) tuple; the query string is kept on the
    scraper (self.ln_query) for later use. Each record dict is mapped to
    article properties via the 'field_map' option; string values for
    date-typed properties are parsed into datetimes.
    """
    self.ln_query, arts = data
    for data in arts:
        art = {}
        for field, setting in self.options['field_map'].items():
            datatype = get_property_primitive_type(field)
            value, typ = setting['value'], setting['type']
            val = data.get(value) if typ == 'field' else value
            if val:
                # isinstance (not type() is str) also accepts str subclasses
                if datatype is datetime.datetime and isinstance(val, str):
                    val = toolkit.read_date(val)
                art[field] = val
        yield Article(**art)
def parse_file(self, file, encoding, data):
    """Yield an Article per record of a parsed LexisNexis export.

    *data* is a (query, articles) tuple; the query string is kept on the
    scraper (self.ln_query). Each record dict is mapped to article
    properties via the 'field_map' option; string values for date-typed
    properties are parsed into datetimes.
    """
    self.ln_query, arts = data
    for data in arts:
        art = {}
        for field, setting in self.options['field_map'].items():
            datatype = get_property_primitive_type(field)
            value, typ = setting['value'], setting['type']
            val = data.get(value) if typ == 'field' else value
            if val:
                # isinstance (not type() is str) also accepts str subclasses
                if datatype is datetime.datetime and isinstance(val, str):
                    val = toolkit.read_date(val)
                art[field] = val
        yield Article(**art)
def test_get(self):
    """Retrieving a project returns exactly its expected representation."""
    proj = amcattest.create_test_project(name="testnaam",
                                         description="testdescription",
                                         insert_date='2012-01-01')
    payload = self.get(ProjectResource, id=proj.id)
    result_list = payload.pop("results")
    self.assertEqual(1, len(result_list))
    record = result_list[0]
    # insert_date is dynamic; just verify it parses as a date
    read_date(record.pop('insert_date'))
    expected_results = {
        'insert_user': proj.insert_user.id,
        'description': 'testdescription',
        'name': u'testnaam',
        'guest_role': 11,
        'owner': proj.owner.id,
        'active': True,
        'id': proj.id,
        'last_visited_at': "Never",
        'favourite': False,
        "r_plugins_enabled": False,
    }
    expected_meta = {
        'page': 1,
        'next': None,
        'previous': None,
        'per_page': 10,
        'total': 1,
        'pages': 1,
        'echo': None,
    }
    self.assertDictsEqual(payload, expected_meta)
    self.assertDictsEqual(record, expected_results)
def parse_online_article(art):
    """Try to parse *art* as an online article in the specific 6-block
    format: medium / url / date / headline / word count / lead.

    Returns (headline, None, lead, date, medium, metadata) or None when
    the text does not match this format.
    """
    # First, test for online articles with specific format.
    # Raw strings fix the invalid "\n"/"\d" escape-sequence warnings.
    blocks = re.split(r"\n *\n\s*", _strip_article(art))
    if len(blocks) != 6:
        return
    medium, url, datestr, headline, nwords, lead = blocks
    if not (url.startswith("http://") or url.startswith("https://")):
        return
    if lead.startswith("Bewaar lees artikel"):
        lead = lead[len("Bewaar lees artikel"):]
    if not re.match(r"(\d+) words", nwords):
        return
    date = toolkit.read_date(datestr)
    return headline.strip(), None, lead.strip(), date, medium, {"length": nwords, "url": url}
def create_test_article(create=True, articleset=None, deduplicate=True, **kargs):
    """Create a test article"""
    from amcat.models.article import Article
    # parse a string date into a datetime before filling in defaults
    if "date" in kargs and isinstance(kargs["date"], str):
        kargs["date"] = read_date(kargs["date"])
    if "project" not in kargs:
        kargs["project"] = create_test_project()
    if "date" not in kargs:
        kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs:
        kargs["medium"] = create_test_medium()
    if 'headline' not in kargs:
        kargs['headline'] = 'test headline {} : {}'.format(_get_next_id(), uuid4())
    if 'text' not in kargs:
        kargs["text"] = 'test text {}'.format(_get_next_id())
    article = Article(**kargs)
    if create:
        Article.create_articles([article], articleset, deduplicate=deduplicate)
    return article
def parse_meta(text):
    """Parse a meta line like '"Medium" Nr. 12 vom 31.12.2013 12.30 Seite 4'
    into a (medium, date, page) tuple.

    :raises ValueError: if the line does not match the expected format.
    """
    m = re.match(r"(.*?)\s*(Nr. \d+)? vom (\d\d\.\d\d\.\d\d\d\d)( \d\d[.:]\d\d\b)?(.*)", text)
    if not m:
        raise ValueError("Cannot parse meta string {text!r}".format(**locals()))
    medium, nr, date, time, pagestr = m.groups()
    # strip surrounding quotes from the medium name
    if medium.startswith('"') and medium.endswith('"'):
        medium = medium[1:-1]
    if time:
        # normalize '12.30' to '12:30' and append to the date before parsing
        date = date + time.replace(".", ":")
    date = toolkit.read_date(date)
    # Raw string fixes the invalid "\d" escape sequence warning.
    m = re.search(r"Seite:? (\d+)", pagestr)
    if m:
        page = int(m.group(1))
    else:
        page = None
    return medium, date, page
def parse_online_article(art):
    """Try to parse *art* as an online article in the specific 6-block
    format: medium / url / date / title / word count / lead.

    Returns a dict with title, text, date, medium, length_int and url,
    or None when the text does not match this format.
    """
    # First, test for online articles with specific format.
    # Raw strings fix the invalid "\n"/"\d" escape-sequence warnings.
    blocks = re.split(r"\n *\n\s*", _strip_article(art))
    if len(blocks) != 6:
        return
    medium, url, datestr, title, nwords, lead = blocks
    if not (url.startswith("http://") or url.startswith("https://")):
        return
    if lead.startswith("Bewaar lees artikel"):
        lead = lead[len("Bewaar lees artikel"):]
    m = re.match(r"(\d+) words", nwords)
    if not m:
        return
    nwords = int(m.group(1))
    date = toolkit.read_date(datestr)
    return dict(title=title.strip(), text=lead.strip(), date=date, medium=medium,
                length_int=nwords, url=url)
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article"""
    from amcat.models.article import Article
    # a string date is parsed before defaults are applied (Python 2: basestring)
    if "date" in kargs and isinstance(kargs["date"], basestring):
        kargs["date"] = read_date(kargs["date"])
    if "project" not in kargs:
        kargs["project"] = create_test_project()
    if "date" not in kargs:
        kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs:
        kargs["medium"] = create_test_medium()
    if "id" not in kargs:
        kargs["id"] = _get_next_id()
    if 'headline' not in kargs:
        kargs['headline'] = 'test headline'
    if 'text' not in kargs:
        kargs["text"] = "\n\n".join(map(str, range(5)))
    article = Article(**kargs)
    if create:
        Article.create_articles([article], articleset,
                                check_duplicate=check_duplicate, create_id=True)
    return article
def parse_meta(text):
    """Parse a German press meta line into (medium, date, page).

    Expected shape: '"<medium>" [Nr. <n>] vom <dd.mm.yyyy> [<hh.mm>] ... [Seite <p>]'.

    :raises ValueError: if the line does not match the expected format.
    """
    m = re.match(r"(.*?)\s*(Nr. \d+)? vom (\d\d\.\d\d\.\d\d\d\d)( \d\d[.:]\d\d\b)?(.*)", text)
    if not m:
        raise ValueError("Cannot parse meta string {text!r}".format(**locals()))
    medium, nr, date, time, pagestr = m.groups()
    # the medium name may be quoted
    if medium.startswith('"') and medium.endswith('"'):
        medium = medium[1:-1]
    if time:
        # '12.30' -> '12:30', appended to the date string before parsing
        date = date + time.replace(".", ":")
    date = toolkit.read_date(date)
    # Raw string fixes the invalid "\d" escape sequence warning.
    m = re.search(r"Seite:? (\d+)", pagestr)
    if m:
        page = int(m.group(1))
    else:
        page = None
    return medium, date, page
def parse_online_article(art):
    """Try to parse *art* as an online article with the 6-block layout
    (medium / url / date / title / word count / lead).

    Returns a dict with title, text, date, medium, length_int and url,
    or None when the text does not match this layout.
    """
    # First, test for online articles with specific format.
    # Raw strings fix the invalid "\n"/"\d" escape-sequence warnings.
    blocks = re.split(r"\n *\n\s*", _strip_article(art))
    if len(blocks) != 6:
        return
    medium, url, datestr, title, nwords, lead = blocks
    if not (url.startswith("http://") or url.startswith("https://")):
        return
    if lead.startswith("Bewaar lees artikel"):
        lead = lead[len("Bewaar lees artikel"):]
    m = re.match(r"(\d+) words", nwords)
    if not m:
        return
    nwords = int(m.group(1))
    date = toolkit.read_date(datestr)
    return dict(title=title.strip(), text=lead.strip(), date=date, medium=medium,
                length_int=nwords, url=url)
def parse_doc(document):
    """Yield (field_name, value) metadata pairs and finally the article
    text for a single parsed document element.
    """
    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")
    # Check for author field. If present: remove from metadata fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        author = author_field.text_content().strip()
        # Strip a literal "Von " prefix. The original lstrip("Von") stripped
        # any leading 'V'/'o'/'n' CHARACTERS and could mangle names.
        if author.startswith("Von "):
            author = author[len("Von "):].strip()
        yield "author", author
        divs.remove(author_field)
    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]
    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(META, divs):
        if field_name is None:
            continue
        value = element.text_content().strip()
        if field_name == "length":
            # Remove the "words" SUFFIX; rstrip("words") stripped a
            # character set and could eat into the number for odd inputs.
            if value.endswith("words"):
                value = value[:-len("words")]
            value = int(value)
        elif field_name == "date":
            value = read_date(value)
        elif field_name == "page":
            if value.strip().isdigit():
                value = int(value.strip())
            else:
                continue
        yield field_name, value
    # Fetch text, which is
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    yield "text", ("\n\n".join(paragraphs)).strip()
def parse_document(self, file):
    """Build an Article from an uploaded file (or from the 'text' option
    when no file is given), deriving missing metadata from the path.

    If no date option was given, the filename must start with
    "<datestring>_". .docx/.doc/.pdf files are converted to text.
    """
    if file:
        dirname, filename = os.path.split(file.name)
        filename, ext = os.path.splitext(filename)
    else:
        dirname, filename, ext = None, None, None
    metadata = dict((k, v) for (k, v) in self.options.items()
                    if k in ["headline", "project", "date", "section"])
    metadata["medium"] = Medium.get_or_create(self.options['medium'])
    if not metadata["date"]:
        # filename convention: "<datestring>_<headline>"
        datestring, filename = filename.split("_", 1)
        metadata["date"] = toolkit.read_date(datestring)
    # fall back to the filename / directory for a missing headline / section
    # (the original repeated the headline check twice; the second was a no-op)
    if not metadata["headline"].strip():
        metadata["headline"] = filename
    if not metadata["section"].strip():
        metadata["section"] = dirname
    if file:
        convertors = None
        if ext.lower() == ".docx":
            convertors = [_convert_docx, _convert_doc]
        elif ext.lower() == ".doc":
            convertors = [_convert_doc, _convert_docx]
        elif ext.lower() == ".pdf":
            convertors = [_convert_pdf]
        if convertors:
            text = _convert_multiple(file, convertors)
        else:
            # NOTE(review): readlines() keeps trailing newlines, so this join
            # doubles line breaks -- possibly intentional paragraph splitting;
            # confirm before changing to file.read().
            text = "\n".join(file.readlines())
    else:
        text = self.options['text']
    return Article(text=text, **metadata)
def parse_doc(document):
    """Yield (field_name, value) metadata pairs plus the article text for
    one document element, using positional META field mapping.
    """
    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")
    # Check for author field. If present: remove from metadata fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        author = author_field.text_content().strip()
        # Strip the literal "Von " prefix instead of lstrip("Von"), which
        # strips a character set and could mangle author names.
        if author.startswith("Von "):
            author = author[len("Von "):].strip()
        yield "author", author
        divs.remove(author_field)
    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]
    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(META, divs):
        if field_name is None:
            continue
        value = element.text_content().strip()
        if field_name == "length":
            # Remove the "words" SUFFIX explicitly; rstrip("words") strips
            # a character set, not a suffix.
            if value.endswith("words"):
                value = value[:-len("words")]
            value = int(value)
        elif field_name == "date":
            value = read_date(value)
        elif field_name == "page":
            if value.strip().isdigit():
                value = int(value.strip())
            else:
                continue
        yield field_name, value
    # Fetch text, which is
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    yield "text", ("\n\n".join(paragraphs)).strip()
def parse_document(self, tupleText):
    """Parse a (meta, body) text tuple into an Article."""
    meta_text, body_text = tupleText
    header_lines = meta_text.strip().split('\n')
    # first header line: "<externalid>. <headline>"
    kargs = {
        'externalid': int(header_lines[0].split('.')[0].lstrip('?')),
        'headline': header_lines[0].partition('. ')[2],
    }
    # third header line: "<medium>, <date>, p.<nr>, <n>w."
    medium_name, date, pagenr, length = header_lines[2].split(', ')
    kargs['medium'] = Medium.get_or_create(medium_name)
    kargs['date'] = read_date(date)
    kargs['pagenr'] = int(pagenr.strip('p.'))
    kargs['length'] = int(length.strip('w.'))
    body_lines = body_text.split('\n')
    kargs['section'] = body_lines[2]
    kargs['text'] = '\n'.join(body_lines[5:])
    kargs['project'] = self.options['project']
    return Article(**kargs)
def parse_document(self, tupleText):
    """Turn a (meta, body) text pair into an Article instance."""
    raw_meta, raw_body = tupleText
    meta_lines = raw_meta.strip().split('\n')
    first_line = meta_lines[0]
    kargs = {}
    # "<externalid>. <headline>" on the first meta line
    kargs['externalid'] = int(first_line.split('.')[0].lstrip('?'))
    kargs['headline'] = first_line.partition('. ')[2]
    # "<medium>, <date>, p.<nr>, <n>w." on the third meta line
    medium_name, date, pagenr, length = meta_lines[2].split(', ')
    kargs['medium'] = Medium.get_or_create(medium_name)
    kargs['date'] = read_date(date)
    kargs['pagenr'] = int(pagenr.strip('p.'))
    kargs['length'] = int(length.strip('w.'))
    body_lines = raw_body.split('\n')
    kargs['section'] = body_lines[2]
    kargs['text'] = '\n'.join(body_lines[5:])
    kargs['project'] = self.options['project']
    return Article(**kargs)
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs): """Create a test article""" from amcat.models.article import Article if "date" in kargs and isinstance(kargs["date"], basestring): kargs["date"] = read_date(kargs["date"]) if "project" not in kargs: kargs["project"] = create_test_project() if "date" not in kargs: kargs["date"] = datetime.date(2000, 1, 1) if "medium" not in kargs: kargs["medium"] = create_test_medium() if "id" not in kargs: kargs["id"] = _get_next_id() if 'headline' not in kargs: kargs['headline'] = 'test headline' if 'text' not in kargs: kargs["text"] = "\n\n".join(map(str, range(5))) a = Article(**kargs) if create: Article.create_articles([a], articleset, check_duplicate=check_duplicate, create_id=True) return a
def _scrape_unit(self, _file):
    """Scrape a single BZK mail file, yielding one Article.

    The mail consists of a header (up to a line starting with "1red"),
    a metadata/headline section, and quoted-printable-like body paragraphs.
    """
    readlines = _file.readlines()
    # date the mail itself was sent; used as a fallback for unparsable years
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = read_date(file_date_line.split("Date:")[1])
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):
            #actual content starts
            lines.append("")
    article = Article(metastring={'mail_header': "".join(mail_header)})
    while True:
        #loop through lines up to and including headline
        line = lines.pop(0)
        if line.isupper():
            # an all-caps line is the headline; stop the metadata scan
            article.title = line
            break
        elif line:
            #first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # expand two-digit year: "12'13" -> "12 2013"-style string
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
            if "=" in datestr:
                # if this is true, the year is not parsable
                # we take the year the mail was sent, might fail around december
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = read_date(datestr)
                if (article.date - file_date).days > 200:
                    #likely a misparse, with the mail being sent the next year
                    # NOTE(review): datetime.timedelta has no 'years' argument,
                    # so this line raises TypeError when reached; probably meant
                    # article.date.replace(year=article.date.year - 1). Confirm.
                    article.date -= timedelta(years=1)
            else:
                article.date = read_date(datestr)
            # NOTE(review): membership test uses data[2] but the alias lookup
            # uses data[1] -- looks like an index mismatch; verify intent.
            if data[2] in BZK_ALIASES.keys():
                medium_str = BZK_ALIASES[data[1]]
            else:
                medium_str = data[2]
            article.set_property("medium", medium_str)
            article.set_property("section", data[1])
    paragraphs = []
    paragraph = ""
    while True:
        # soft line breaks end with "="; strip before concatenating
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():
            #subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]
    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        if p.startswith("(") and len(
                p.split(",")) > 1:
            # laatste regel van normale content (last line of normal content)
            break
    # Add non-ascii characters
    # Takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        # NOTE(review): the 'string-escape' codec is Python 2 only
        code = match.group()[1:]
        char = r"\x{}".format(code).decode('string-escape').decode(
            'latin-1')
        if code == "92":
            return "'"
        elif code == "85":
            return "..."
        return char
    article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
    yield article
def parse_article(art):
    """
    A lexis nexis article consists of five parts:
    1) a header
    2) the title and possibly a byline
    3) a block of meta fields
    4) the body
    5) a block of meta fields

    The header consists of 'centered' lines, ie starting with a whitespace character
    The title (and byline) are left justified non-marked lines before the first meta field
    The meta fields are of the form FIELDNAME: value and can contain various field names
    The body starts after either two blank lines, or if a line is not of the meta field form.
    The body ends with a 'load date', which is of form FIELDNAME: DATE ending with a four digit year

    Returns a dict of article fields (title, byline, text, date, medium, ...)
    with falsy values removed, or None when the stripped article is empty.
    """
    # Some articles are in an "online" layout handled by a separate parser.
    online = parse_online_article(art)
    if online:
        return online
    header, title, meta, body = [], [], [], []
    # Shared accumulator: _in_header moves candidate headline lines out of
    # `lines` and into this list as a side effect.
    header_headline = []

    def next_is_indented(lines, skipblank=True):
        # True if the line after lines[0] starts with a space,
        # optionally skipping over blank lines.
        if len(lines) <= 1:
            return False
        if not lines[1].strip():
            if not skipblank:
                return False
            return next_is_indented(lines[1:])
        return lines[1].startswith(" ")

    def followed_by_date_block(lines):
        # this text is followed by a date block
        # possibly, there is another line in the first block
        # (blank line)
        # indented date line
        # optional second indented date line
        # (blank line)
        if len(lines) < 5:
            return False
        if ((not lines[1].strip()) and lines[2].startswith(" ")
                and (not lines[3].strip())):
            return True
        if ((not lines[1].strip()) and lines[2].startswith(" ")
                and lines[2].startswith(" ") and (not lines[4].strip())):
            return True
        if not lines[1].strip():
            return False
        if lines[1].startswith(" "):
            return False
        return followed_by_date_block(lines[1:])

    def _in_header(lines):
        # Decide whether lines[0] still belongs to the header; may pop
        # headline candidates from `lines` into header_headline.
        # Falls through (returns None, i.e. falsy) when the line is body.
        if not lines:
            return False
        if not lines[0].strip():
            return True  # blank line
        # indented line spanning page width: header
        if (not lines[0].startswith(" ")
                and next_is_indented(lines, skipblank=False)
                and len(lines[0].strip()) > 75):
            return True
        # non-indented TITLE or normal line followed by indented line
        if (not lines[0].startswith(" ")) and next_is_indented(lines):
            header_headline.append(lines.pop(0))
        else:
            while (not lines[0].startswith(" ")) and followed_by_date_block(lines):
                header_headline.append(lines.pop(0))
        # check again after possible removal of header_headline
        if not lines:
            return False
        if not lines[0].strip():
            return True  # blank line
        if lines[0].startswith(" "):
            return True  # indented line

    def _get_header(lines) -> dict:
        """Consume and return all lines that are indented (ie the list is
        changed in place)

        NOTE(review): this is a generator of str, not a dict; the
        annotation is inherited and looks wrong."""
        while _in_header(lines):
            line = lines.pop(0)
            line = line.strip()
            if line:
                # Strip a leading "Copyright YYYY" prefix from header lines.
                if re.match('Copyright \d{4}', line):
                    line = line[len('Copyright xxxx'):]
                yield line

    def _get_headline(lines):
        """Return title and byline, consuming the lines"""
        headline, byline = [], []
        target = headline
        while lines:
            line = lines[0].strip()
            if RES.BODY_META.match(line):
                # A meta field before any headline: no headline present.
                return None, None
            if not line:
                # they thought of something new again...
                # title\n\nbyline\n\nLENGTH:
                # so empty line is not always the end
                if (len(lines) > 4 and (not lines[2]) and lines[1]
                        and RES.BODY_META.match(lines[3])
                        and (not RES.BODY_META.match(lines[1]))):
                    target = byline
                else:
                    break
            if line.endswith(";"):
                # A ";"-terminated line separates headline from byline.
                target.append(line[:-1])
                target = byline
            else:
                target.append(line)
            del lines[0]
        return (re.sub("\s+", " ", " ".join(x)) if x else None
                for x in (headline, byline))

    def _get_meta(lines) -> dict:
        """
        Return meta key-value pairs.
        Stop if body start criterion is found (eg two blank lines or non-meta line)

        NOTE(review): generator of (key, value) pairs, despite the
        inherited dict annotation.
        """
        while lines:
            line = lines[0].strip()
            next_line = lines[1].strip() if len(lines) >= 2 else None
            meta_match = RES.BODY_META.match(line)
            if ((not bool(line) and not bool(next_line))
                    or (line and not meta_match)):
                # either two blank lines or a non-meta line
                # indicate start of body, so end of meta
                break
            del lines[0]
            if meta_match:
                key, val = meta_match.groups()
                key = key.lower()
                key = BODY_KEYS_MAP.get(key, key)
                # multi-line meta: add following non-blank lines
                while lines and lines[0].strip():
                    val += " " + lines.pop(0)
                val = re.sub("\s+", " ", val)
                yield key, val.strip()

    def _get_body(lines):
        """split lines into body and postmatter"""
        # index of headline or end of body
        try:
            i = next(i for (i, line) in enumerate(lines)
                     if RES.BODY_END_OR_COPYRIGHT.match(line.strip()))
            return lines[:i], lines[i:]
        except StopIteration:
            return lines, []

    lines = _strip_article(art).split("\n")
    header = list(_get_header(lines))
    if not lines:
        # Something is wrong with this article, skip it
        return
    if header_headline:
        # Headline was extracted from the header block by _in_header.
        title = re.sub("\s+", " ", " ".join(header_headline)).strip()
        if ";" in title:
            title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            byline = None
        if re.match("[A-Z]+:", title):
            # Drop a leading "SECTION:"-style prefix.
            title = title.split(":", 1)[1]
    else:
        title, byline = _get_headline(lines)
    meta = dict(_get_meta(lines))
    if title is None:
        # Fall back to title-like meta fields ('kop' is Dutch for headline).
        if 'title' in meta:
            title = meta.pop('title')
        elif 'kop' in meta:
            title = meta.pop('kop')
    body, lines = _get_body(lines)
    # Post-body meta block (part 5).
    meta.update(dict(_get_meta(lines)))

    def _get_source(lines, i):
        # The medium/source is normally the header line before the date
        # (or the second line if the date is first); skip known publisher
        # holding names in favour of the actual title above them.
        source = lines[0 if i > 0 else 1]
        if source.strip() in ("PCM Uitgevers B.V.",
                              "De Persgroep Nederland BV"
                              ) and i > 2 and lines[i - 1].strip():
            source = lines[i - 1]
        return source

    date, dateline, source = None, None, None
    for i, line in enumerate(header):
        if _is_date(line):
            date = line
            dateline = i
            source = _get_source(header, i)
            break
    if date is None:
        # try looking for only month - year notation by prepending a 1
        for i, line in enumerate(header):
            line = "1 {line}".format(**locals())
            if _is_date(line):
                date = line
                source = _get_source(header, i)
    if date is None:
        # try looking for season names
        #TODO: Hack, reimplement more general!
        for i, line in enumerate(header):
            if line.strip() == "Winter 2008/2009":
                date = "2009-01-01"
                source = _get_source(header, i)

    def find_re_in(pattern, lines):
        # Return the first regex match of pattern in any line, else None.
        for line in lines:
            m = re.search(pattern, line)
            if m:
                return m

    if date is None:
        # Last resort: any line ending in a four-digit year.
        yearmatch = find_re_in("(.*)(\d{4})$", header)
        if yearmatch:
            month, year = yearmatch.groups()
            month = MONTHS.get(month.replace(",", "").strip().lower(), 1)
            date = "{year}-{month:02}-01".format(**locals())
            source = header[0]
            # this is probably a journal, let's see if we can find an issue
            issuematch = find_re_in("[-\d]+[^\d]+\d+", header)
            if issuematch:
                meta['issue'] = issuematch.group(0)
        elif [x.strip() for x in header] in (["India Today"],
                                             ["Business Today"]):
            date = meta.pop("load-date")
            source = header[0]
        else:
            raise ParseError(
                "Couldn't find date in header: {header!r}\n{art!r}".format(
                    **locals()))
    date = toolkit.read_date(date)
    if dateline is not None and len(header) > dateline + 1:
        # next line might contain time
        timeline = header[dateline + 1]
        m = re.search(r"\b\d?\d:\d\d\s(PM\b)?", timeline)
        if m and date.time().isoformat() == '00:00:00':
            # NOTE(review): `time` is assigned but never used.
            time = toolkit.read_date("1990-01-01 {}".format(m.group(0)))
            datestr = " ".join([date.isoformat()[:10], m.group(0)])
            date = toolkit.read_date(datestr)
    # Strip a leading "Copyright (c) YYYY" prefix from the source line.
    m = re.match("copyright\s\xa9?\s?(\d{4})?(.*)", source, re.I)
    if m:
        source = m.group(2)
    source = source.strip()
    text = "\n".join(body).strip()
    if 'graphic' in meta and (not text):
        # Caption-only articles: use the graphic caption as the text.
        text = meta.pop('graphic')
    if title is None:
        if 'headline' in meta and 'title' not in meta:
            meta['title'] = meta.pop('headline')
        if 'title' in meta:
            title = re.sub("\s+", " ", meta.pop('title')).strip()
            if ";" in title and not byline:
                title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            title = "No title found!"
    if 'byline' in meta:
        # An explicit byline meta field wins; fold the old one into the title.
        if byline:
            title += "; %s" % byline
        byline = meta.pop('byline')
    if 'length' in meta:
        meta['length_int'] = meta.pop('length')
    if 'length_int' in meta:
        # "123 words" -> 123
        meta['length_int'] = int(meta['length_int'].split()[0])
    meta.update(
        dict(title=title.strip(), byline=byline, text=text, date=date,
             medium=source))
    # Drop all falsy fields from the result.
    meta = {k: v for (k, v) in meta.items() if v}
    return meta
def parse_page(doc_elements):
    """Parses an APA page given in a list of Etree elements.

    doc_elements is a (document, elements) pair: the parsed lxml document
    plus the flat list of elements making up one article.  Returns a
    (metadata, text) tuple.  Raises ApaError when no headline candidate
    can be found.  May raise KeyError if the metadata lacks
    year/month/day — presumably guaranteed by get_metadata; TODO confirm.
    """
    doc, elements = doc_elements
    # HTML comments carry no article content.
    elements = [e for e in elements if not isinstance(e, lxml.html.HtmlComment)]
    element_set = set(elements)
    # Some pages use an alternative layout handled elsewhere.
    result = try_alternative(elements)
    if result is not None:
        return result
    source_tags = doc.cssselect('meta[name=author]')
    if source_tags:
        source = source_tags[0].get('content')
    else:
        source = None
    # Bold elements are headline candidates, italic ones metadata;
    # everything else is body text.
    headline = set(get_descendants(doc.cssselect("b"))) & element_set
    meta = (set(get_descendants(doc.cssselect("i"))) & element_set) - headline
    text = element_set - (headline | meta)
    # Restore document order (sets lose it).
    headline = sorted(get_roots(headline), key=lambda e: elements.index(e))
    # Some formats don't have a bold headline. Instead, the first line is the headline.
    first_line_is_headline = False
    if not headline and source == "AOMweb":
        first_line_is_headline = True
    if not headline and not first_line_is_headline:
        raise ApaError("No possible headlines found.")
    remove_tree(meta, ["b"])
    remove_tree(text, ["b", "i"])
    # Some text in italics is no metadata. We only use text before headline elements
    # for metadata.
    if not first_line_is_headline:
        lesser_than_headline = lambda e: elements.index(e) <= elements.index(headline[0])
        meta = get_nonempty(filter(lesser_than_headline, meta))
    else:
        meta = get_nonempty(meta)
    # Parse metadata
    metadata = {}
    for el in list(meta):
        # get_metadata mutates `metadata` and reports whether it consumed el.
        if get_metadata(metadata, el):
            meta.remove(el)
    if meta:
        # Leftover italic elements are treated as the byline.
        metadata["byline"] = " - ".join(m.text for m in meta)
    # Convert date properties to datetime object
    year, month, day = metadata["year"], metadata["month"], metadata["day"]
    hour, minute = metadata.get("hour"), metadata.get("minute")
    datestring = "{day}.{month}.{year}"
    if hour is not None and minute is not None:
        datestring += ", {hour}:{minute}"
    metadata["date"] = read_date(datestring.format(**locals()))
    for prop in ("year", "month", "day", "hour", "minute"):
        if prop in metadata:
            del metadata[prop]
    # Clean data and get headline
    metadata["medium"] = metadata.get("medium", "APA - Unknown").strip().strip('"')
    if first_line_is_headline:
        medium = metadata["medium"].strip()
    else:
        medium, headline = metadata["medium"].strip(), "".join(["".join(e.itertext()) for e in headline]).strip()
        if medium in headline:
            # Headline often repeats the medium as a "medium - headline"
            # prefix; split past any dashes inside the medium name itself.
            headline = headline.split("-", medium.count("-") + 1)[-1]
    if "section" in metadata and metadata["section"] is None:
        del metadata["section"]
    # Get text. Since ordering is lost in sets, restore original order of elements
    text = "".join(get_text(sorted(text, key=lambda e: elements.index(e)))).strip()
    if first_line_is_headline:
        # First paragraph (up to the first blank line) is the headline.
        headline, text = re.split("\n *\n", text, 1)
    metadata["title"] = headline
    # Word count over non-word separators.
    metadata["length"] = sum(1 for w in RE_NONWORD.split(text) if w)
    return metadata, text
def parse_date(d):
    """Normalise *d* to an ISO-8601 string.

    Accepts a datetime/date object, a parseable date string, or a
    one-element list wrapping either of those.
    """
    # Unwrap a single-element list first.
    if isinstance(d, list) and len(d) == 1:
        (d,) = d
    # Strings still need parsing; anything else must already expose isoformat().
    return (toolkit.read_date(d) if isinstance(d, str) else d).isoformat()
def parse_date(d):
    """Return the ISO-8601 representation of a date.

    *d* may be a datetime/date object, a date string, or a one-element
    list containing either.
    """
    # A singleton list is transparent: use its only element.
    value = d[0] if isinstance(d, list) and len(d) == 1 else d
    if isinstance(value, str):
        # Delegate string parsing to the shared toolkit parser.
        value = toolkit.read_date(value)
    return value.isoformat()
def _scrape_unit(self, _file):
    """Yield one Article parsed from a BZK e-mail digest file.

    Lines before the first "1red" line form the mail header; the digest
    content follows.  The first non-empty content line holds
    "<date>, <section>, <medium>" metadata, an ALL-CAPS line is the
    headline, and subsequent lines are body paragraphs using a
    quoted-printable-like encoding ("=" soft line breaks, "=XX" hex
    escapes for non-ASCII characters).
    """
    readlines = _file.readlines()
    # Use the mail's "Date:" header as the fallback year for dates whose
    # own year cannot be parsed.
    file_date_line = [l for l in readlines if l.startswith("Date:")][0]
    file_date = read_date(file_date_line.split("Date:")[1])
    lines = []
    mail_header = []
    for line in readlines:
        if lines:
            lines.append(line.rstrip("\r\n"))
        else:
            mail_header.append(line)
        if line.startswith("1red"):  # actual content starts
            lines.append("")
    article = Article(metastring={'mail_header': "".join(mail_header)})
    while True:  # loop through lines up to and including headline
        line = lines.pop(0)
        if line.isupper():
            article.title = line
            break
        elif line:  # first non-empty line, contains metadata
            data = line.split(", ")
            datestr = data[0]
            if "'" in datestr:
                # Turn a 'NN two-digit year into 20NN.
                split = datestr.split("'")
                datestr = split[0] + "20" + split[1]
            if "=" in datestr:
                # if this is true, the year is not parsable
                # we take the year the mail was sent, might fail around december
                datestr = datestr.split("=")[0] + str(file_date.year)
                article.date = read_date(datestr)
                if (article.date - file_date).days > 200:
                    # likely a misparse, with the mail being sent the next year.
                    # FIX: replaced `timedelta(years=1)` — timedelta does not
                    # accept a 'years' keyword and the original raised TypeError.
                    article.date = article.date.replace(year=article.date.year - 1)
            else:
                article.date = read_date(datestr)
            # FIX: the alias lookup must use the same key as the membership
            # test (data[2]); the original read BZK_ALIASES[data[1]] and
            # could raise KeyError.
            if data[2] in BZK_ALIASES:
                medium_str = BZK_ALIASES[data[2]]
            else:
                medium_str = data[2]
            article.set_property("medium", medium_str)
            article.set_property("section", data[1])
    paragraphs = []
    paragraph = ""
    while True:
        # Strip soft line breaks ("=" at end of line).
        line = lines.pop(0).rstrip("=")
        if not line:
            paragraphs.append(paragraph)
            paragraph = ""
        elif line.isupper():  # subheader
            paragraph += line + "\n"
        else:
            paragraph += line
        if not lines:
            break
    paragraphs = [p for p in paragraphs if p]
    article.text = ""
    for p in paragraphs:
        article.text += p + "\n\n"
        # "(source, date)"-style line marks the last line of normal content.
        if p.startswith("(") and len(p.split(",")) > 1:
            break

    # Add non-ascii characters
    # Takes the '=AB' occurrences and turns them into latin-1 characters.
    def character(match):
        code = match.group()[1:]
        if code == "92":  # windows-1252 right single quote
            return "'"
        elif code == "85":  # windows-1252 horizontal ellipsis
            return "..."
        try:
            # FIX: Python-3 equivalent of the original Python-2-only
            # r"\x{}".format(code).decode('string-escape').decode('latin-1').
            return bytes([int(code, 16)]).decode('latin-1')
        except ValueError:
            # Non-hex matches (e.g. "=XY") pass through unchanged.
            return match.group()

    article.text = re.sub("=[A-Z0-9]{2}", character, article.text)
    yield article
def parse_article(art):
    """
    A lexis nexis article consists of five parts:
    1) a header
    2) the title and possibly a byline
    3) a block of meta fields
    4) the body
    5) a block of meta fields

    The header consists of 'centered' lines, ie starting with a whitespace character
    The title (and byline) are left justified non-marked lines before the first meta field
    The meta fields are of the form FIELDNAME: value and can contain various field names
    The body starts after either two blank lines, or if a line is not of the meta field form.
    The body ends with a 'load date', which is of form FIELDNAME: DATE ending with a four digit year

    Returns a dict of article fields with falsy values removed, or None
    when the stripped article is empty.
    """
    # Articles in the "online" layout are handled by a separate parser.
    online = parse_online_article(art)
    if online:
        return online
    header, title, meta, body = [], [], [], []
    # Shared accumulators mutated by the nested helpers below:
    # header_headline collects headline candidates popped out of the header;
    # metadata_lang is set by _get_meta when a language-specific meta key
    # is recognised.
    header_headline = []
    metadata_lang = None

    def next_is_indented(lines, skipblank=True):
        # True if the line after lines[0] starts with a space,
        # optionally skipping blank lines.
        if len(lines) <= 1:
            return False
        if not lines[1].strip():
            if not skipblank:
                return False
            return next_is_indented(lines[1:])
        return lines[1].startswith(" ")

    def followed_by_date_block(lines):
        # this text is followed by a date block
        # possibly, there is another line in the first block
        # (blank line)
        # indented date line
        # optional second indented date line
        # (blank line)
        if len(lines) < 5:
            return False
        if ((not lines[1].strip()) and lines[2].startswith(" ")
                and (not lines[3].strip())):
            return True
        if ((not lines[1].strip()) and lines[2].startswith(" ")
                and lines[2].startswith(" ") and (not lines[4].strip())):
            return True
        if not lines[1].strip():
            return False
        if lines[1].startswith(" "):
            return False
        return followed_by_date_block(lines[1:])

    def _in_header(lines):
        # Decide whether lines[0] still belongs to the header; may move
        # headline candidates from `lines` into header_headline.
        # Falls through (returns None, falsy) when the line is body text.
        if not lines:
            return False
        if not lines[0].strip():
            return True  # blank line
        # indented line spanning page width: header
        if (not lines[0].startswith(" ")
                and next_is_indented(lines, skipblank=False)
                and len(lines[0].strip()) > 75):
            return True
        # non-indented TITLE or normal line followed by indented line
        if (not lines[0].startswith(" ")) and next_is_indented(lines):
            header_headline.append(lines.pop(0))
        else:
            while (not lines[0].startswith(" ")) and followed_by_date_block(lines):
                header_headline.append(lines.pop(0))
        # check again after possible removal of header_headline
        if not lines:
            return False
        if not lines[0].strip():
            return True  # blank line
        if lines[0].startswith(" "):
            return True  # indented line

    def _get_header(lines) -> dict:
        """Consume and return all lines that are indented (ie the list is
        changed in place)

        NOTE(review): this is a generator of str; the dict annotation is
        inherited and looks wrong."""
        while _in_header(lines):
            line = lines.pop(0)
            line = line.strip()
            if line:
                # Strip a leading "Copyright YYYY" prefix.
                if re.match('Copyright \d{4}', line):
                    line = line[len('Copyright xxxx'):]
                yield line

    def _get_headline(lines):
        """Return title and byline, consuming the lines"""
        headline, byline = [], []
        target = headline
        while lines:
            line = lines[0].strip()
            if RES.BODY_META.match(line):
                # Meta field before any headline: no headline present.
                return None, None
            if not line:
                # they thought of something new again...
                # title\n\nbyline\n\nLENGTH:
                # so empty line is not always the end
                if (len(lines) > 4 and (not lines[2]) and lines[1]
                        and RES.BODY_META.match(lines[3])
                        and (not RES.BODY_META.match(lines[1]))):
                    target = byline
                else:
                    break
            if line.endswith(";"):
                # ";" separates headline from byline.
                target.append(line[:-1])
                target = byline
            else:
                target.append(line)
            del lines[0]
        return (re.sub("\s+", " ", " ".join(x)) if x else None
                for x in (headline, byline))

    def _get_meta(lines, after_body=False) -> Iterable[Tuple[str, str, str]]:
        """
        Return meta key-value pairs.
        Stop if body start criterion is found (eg two blank lines or non-meta line)

        Yields (original_key, normalised_key, value) triples; the original
        key is kept so a mistakenly meta-parsed title can be recovered later.
        """
        nonlocal metadata_lang
        while lines:
            line = lines[0].strip()
            next_line = lines[1].strip() if len(lines) >= 2 else None
            meta_match = RES.BODY_META.match(line)
            if ((not bool(line) and not bool(next_line))
                    or (line and not meta_match)):
                # either two blank lines or a non-meta line
                # indicate start of body, so end of meta
                break
            if meta_match and not after_body:
                # if the key is not known, and the next non-empty line is body, treat this line as part of body
                key, val = meta_match.groups()
                if val.strip() and not key.lower() in WELL_KNOWN_BODY_KEYS:
                    def next_block(lines):
                        # First non-blank line after the next blank line,
                        # i.e. the start of the following block.
                        found_blank = False
                        for l in lines:
                            l = l.strip()
                            if not l:
                                found_blank = True
                            elif found_blank:
                                return l
                    next_line = next_block(lines)
                    if next_line and not RES.BODY_META.match(next_line):
                        break
            del lines[0]
            if meta_match:
                key, val = meta_match.groups()
                orig_key = key
                key = key.lower()
                # detect language before mapping to English
                if metadata_lang is None and key in METADATA_LANGUAGE_MAP:
                    metadata_lang = METADATA_LANGUAGE_MAP[key]
                key = BODY_KEYS_MAP.get(key, key)
                # multi-line meta: add following non-blank lines
                while lines and lines[0].strip():
                    val += " " + lines.pop(0)
                val = re.sub("\s+", " ", val)
                yield orig_key, key, val.strip()

    def _get_body(lines):
        """split lines into body and postmatter"""
        # index of headline or end of body
        try:
            i = next(i for (i, line) in enumerate(lines)
                     if RES.BODY_END_OR_COPYRIGHT.match(line.strip()))
            return lines[:i], lines[i:]
        except StopIteration:
            return lines, []

    lines = _strip_article(art).split("\n")
    header = list(_get_header(lines))
    if not lines:
        # Something is wrong with this article, skip it
        return
    if header_headline:
        # Headline was pulled out of the header block by _in_header.
        title = re.sub("\s+", " ", " ".join(header_headline)).strip()
        if ";" in title:
            title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            byline = None
        if re.match("[A-Z]+:", title):
            # Drop a leading "SECTION:"-style prefix.
            title = title.split(":", 1)[1]
    else:
        title, byline = _get_headline(lines)
    # Keep both the original meta keys (ordered) and the normalised
    # key -> value mapping.
    head_meta_fields = list(((ok, k), (k, v)) for ok, k, v in _get_meta(lines))
    orig_keys, meta = zip(*head_meta_fields) if head_meta_fields else ((), ())
    orig_keys = OrderedDict(orig_keys)
    meta = dict(meta)
    if title is None:
        # Fall back to title-like meta fields ('kop' is Dutch for headline).
        if 'title' in meta:
            title = meta.pop('title')
        elif 'kop' in meta:
            title = meta.pop('kop')
    body, lines = _get_body(lines)
    # Post-body meta block (part 5).
    meta.update({k: v for _, k, v in _get_meta(lines, after_body=True)})

    def _get_source(lines, i):
        # The medium/source is normally the header line before the date
        # (or the second line if the date comes first); skip known
        # publisher holding names in favour of the title above them.
        source = lines[0 if i > 0 else 1]
        if source.strip() in ("PCM Uitgevers B.V.",
                              "De Persgroep Nederland BV") and i > 2 and lines[i - 1].strip():
            source = lines[i - 1]
        return source

    def _get_date_languages(meta, metadata_lang, body):
        # Build the language pool for date parsing from the article's
        # 'language' meta field plus the detected metadata language;
        # None means "use parser defaults".
        article_langs = [lang.lower().strip()
                         for lang in RES.SPLIT_LANGUAGES.split(meta.get('language', ""))
                         if lang != ""]
        if metadata_lang is None:
            log.debug("Failed to detect metadata language. Falling back to defaults")
            return None
        if not article_langs:
            # failed to guess language, fall back to default
            return None
        article_langs.append(metadata_lang)
        return tuple(article_langs)

    lang_pool = _get_date_languages(meta, metadata_lang, body)
    date, dateline, source = None, None, None
    for i, line in enumerate(header):
        if _is_date(line, language_pool=lang_pool):
            date = line
            dateline = i
            source = _get_source(header, i)
            break
    if date is None:
        # try looking for only month - year notation by prepending a 1
        for i, line in enumerate(header):
            line = "1 {line}".format(**locals())
            if _is_date(line, language_pool=lang_pool):
                date = line
                source = _get_source(header, i)
    if date is None:
        # try looking for season names
        # TODO: Hack, reimplement more general!
        for i, line in enumerate(header):
            if line.strip() == "Winter 2008/2009":
                date = "2009-01-01"
                source = _get_source(header, i)

    def find_re_in(pattern, lines):
        # Return the first regex match of pattern in any line, else None.
        for line in lines:
            m = re.search(pattern, line)
            if m:
                return m

    if date is None:
        # Last resort: any header line ending in a four-digit year.
        yearmatch = find_re_in("(.*)(\d{4})$", header)
        if yearmatch:
            month, year = yearmatch.groups()
            month = MONTHS.get(month.replace(",", "").strip().lower(), 1)
            date = "{year}-{month:02}-01".format(**locals())
            source = header[0]
            # this is probably a journal, let's see if we can find an issue
            issuematch = find_re_in("[-\d]+[^\d]+\d+", header)
            if issuematch:
                meta['issue'] = issuematch.group(0)
        elif [x.strip() for x in header] in (["India Today"], ["Business Today"]):
            date = meta.pop("load-date")
            source = header[0]
        else:
            raise ParseError("Couldn't find date in header: {header!r}\n{art!r}".format(**locals()))
    date = toolkit.read_date(date)
    if dateline is not None and len(header) > dateline + 1:
        # next line might contain time
        timeline = header[dateline + 1]
        m = re.search(r"\b\d?\d:\d\d\s(PM\b)?", timeline)
        if m and date.time().isoformat() == '00:00:00':
            # NOTE(review): `time` is assigned but never used.
            time = toolkit.read_date("1990-01-01 {}".format(m.group(0)))
            datestr = " ".join([date.isoformat()[:10], m.group(0)])
            date = toolkit.read_date(datestr)
    # Strip a leading "Copyright (c) YYYY" prefix from the source line.
    m = re.match("copyright\s\xa9?\s?(\d{4})?(.*)", source, re.I)
    if m:
        source = m.group(2)
    source = source.strip()
    text = "\n".join(body).strip()
    if 'graphic' in meta and (not text):
        # Caption-only articles: use the graphic caption as the text.
        text = meta.pop('graphic')
    if title is None:
        if 'headline' in meta and 'title' not in meta:
            meta['title'] = meta.pop('headline')
        if 'title' in meta:
            title = re.sub("\s+", " ", meta.pop('title')).strip()
            if ";" in title and not byline:
                title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            # test if title was mistakenly parsed as a meta fields.
            title_mistake = next(iter(orig_keys.items()))
            if title_mistake[0].lower() not in WELL_KNOWN_BODY_KEYS:
                val = meta.pop(title_mistake[1])
                title = "{}: {}".format(title_mistake[1], val)
            else:
                title = "No title found!"
    if 'byline' in meta:
        # An explicit byline meta field wins; fold the old one into the title.
        if byline:
            title += "; %s" % byline
        byline = meta.pop('byline')
    if 'length' in meta:
        meta['length_int'] = meta.pop('length')
    if 'length_int' in meta:
        # "123 words" -> 123
        meta['length_int'] = int(meta['length_int'].split()[0])
    meta.update(dict(title=title.strip(), byline=byline, text=text, date=date, medium=source))
    # Drop all falsy fields from the result.
    meta = {k: v for (k, v) in meta.items() if v}
    return meta