def _is_date(string):
    try:
        toolkit.readDate(string)
    except ValueError:
        return False
    return True
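# Hedged usage sketch (not part of the original source): assuming toolkit.readDate
# raises ValueError on strings it cannot parse, as the except clause above implies,
# _is_date can be used to filter candidate strings before further processing.
# The helper name below is hypothetical.
def _keep_parseable_dates(strings):
    # keep only the strings that readDate accepts as a date
    return [s for s in strings if _is_date(s)]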
def get_comments(self, page):
    for li in page.doc.cssselect("ul.commentlist li.comment"):
        comment = HTMLDocument()
        comment.parent = page
        try:
            dateauthor = li.cssselect("div.commentsbox")[0].text_content()
        except IndexError:
            comment.props.author = li.text_content().split(":")[0]
            comment.props.date = readDate(":".join(li.text_content().split(":")[1:2]))
            try:
                comment.props.text = li.cssselect("div.comment-text-reply")[0]
            except UnicodeDecodeError:
                continue
        else:
            comment.props.author = dateauthor.split("Geplaatst door")[1].split(" op ")[0]
            try:
                li.cssselect("div.commentsbox a")[0].drop_tree()
            except:
                pass
            comment.props.date = readDate(dateauthor.split(" op ")[1])
            try:
                comment.props.text = li.cssselect("div.comment-text")[0]
            except UnicodeDecodeError:
                continue
        yield comment
def find_start(self, n_articles):
    """Intelligently find the page at which the articles are for the given date, saves hours"""
    jump_distance = n_articles / 4.
    index = n_articles / 2
    offset = int(math.ceil((index) / 10) * 10)
    #find an article with the right date
    while True:
        offset = int(math.ceil(index / 10) * 10)
        docs = self.getresponse(offset)["docs"]
        dates = [readDate(d["date"]).date() for d in docs]
        if self.options['date'] in dates:
            break
        elif self.options['date'] > dates[0]:
            index -= jump_distance
        elif self.options['date'] < dates[0]:
            index += jump_distance
        if jump_distance < 10:
            return 0
        jump_distance /= 2.
    #go back to first occurrence
    i = 0
    while self.options['date'] in dates:
        i += 1
        offset -= 10 * i
        if offset < 0:
            return 0
        docs = self.getresponse(offset)["docs"]
        dates = [readDate(d["date"]).date() for d in docs]
    return offset
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    self.set_up()
    p = amcattest.create_test_project(owner=self.user)
    s = amcattest.create_test_set(project=p)
    a = {
        'date': datetime.datetime.now().isoformat(),
        'headline': 'Test child',
        'medium': 'Fantasy',
        'text': 'Hello Universe',
        'pagenr': 1,
        'url': 'http://example.org',
        'uuid': 'c691fadf-3c45-4ed6-93fe-f035b5f500af',
    }
    url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(**locals())
    self.post(url, a, self.user)
    amcates.ES().flush()
    res = self.get(url)["results"]
    self.assertEqual(len(res), 1)
    self.assertEqual(res[0]["headline"], a['headline'])
    self.assertEqual(toolkit.readDate(res[0]["date"]), toolkit.readDate(a['date']))
    self.assertEqual(res[0]["uuid"], a['uuid'])
def parse_item(self, item):
    #item: a list of html tags
    article = HTMLDocument()
    for tag in item:
        if tag.tag == "p":
            if hasattr(article.props, 'text'):
                article.props.text.append(tag)
            else:
                article.props.text = [tag]
        elif tag.tag == "h2":
            article.props.headline = tag.text
        elif tag.tag == "i":
            bits = tag.text.split()
            if not bits:
                return  # empty clipping ("knipsel")
            if "-" in bits[-1]:
                try:
                    article.props.date = readDate(bits[-1])
                except ValueError:
                    article.props.date = None
                article.props.medium = self.get_medium(" ".join(bits[:-1]))
            elif bits[-1].isdigit():
                try:
                    article.props.date = readDate(" ".join(bits[-3:]))
                except ValueError:
                    article.props.date = None
                article.props.medium = self.get_medium(" ".join(bits[:-3]))
            else:
                article.props.medium = self.get_medium(" ".join(bits))
                article.props.date = None
    return article
def _scrape_unit(self, li):
    a = li.cssselect("li > a")[0]
    article = HTMLDocument(url=urljoin(self.index_url, a.get('href')))
    article.props.headline = a.text
    article.props.kicker = li.cssselect("div.infoboard a.kicker")[0].text
    article.props.intro = li.cssselect("p")
    article.props.date = readDate(li.cssselect("div.infoboard span.time")[0].text_content())
    article.prepare(self)
    articletime = article.doc.cssselect("p.articletime")[0].text_content()
    if len(articletime.split("|")) > 2:
        article.props.date = readDate(" ".join(articletime.split("|")[:-1]))
        article.props.author = articletime.split("|")[-1]
    else:
        article.props.author = articletime.strip()
    if " Korrespondent" in article.props.author:
        article.props.author = article.props.author.split("Korrespondent")[1].strip()
    for ad in article.doc.cssselect("div.noprint"):
        ad.drop_tree()
    article.props.text = article.doc.cssselect("p.articlelead, #articletext")
    article.props.section = article.doc.cssselect("div.headtop span.sitetop")[0].text_content()
    yield article
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    a = test_article()
    res = self._post_articles(a)
    self.assertEqual(set(res.keys()), {'id'})  # POST should only return IDs
    res = self._get_article(aid=res['id'])
    self.assertEqual(res["headline"], a['headline'])
    self.assertEqual(toolkit.readDate(res["date"]), toolkit.readDate(a['date']))
    self.assertNotIn("text", res.keys())
    self.assertIsNotNone(res["uuid"])
    res = self._get_article(aid=res['id'], text=True)
    self.assertEqual(res["text"], a['text'])
    res = self._get_articles()["results"]
    self.assertEqual(len(res), 1)
    # can we post explicit UUID?
    self.setUp_set()
    a['uuid'] = str(uuid4())
    self._post_articles(a)
    res = self._get_articles()["results"]
    self.assertEqual(res[0]["uuid"], a['uuid'])
def test_get(self):
    p1 = amcattest.create_test_project(name="testnaam", description="testdescription", insert_date='2012-01-01')
    actual = self.get(ProjectResource, id=p1.id)
    actual_results = actual.pop("results")
    self.assertEqual(len(actual_results), 1)
    actual_results = actual_results[0]
    date = actual_results.pop('insert_date')
    readDate(date)  # check valid date, not much more to check here?
    expected_results = {
        u'insert_user': p1.insert_user.id,
        u'description': 'testdescription',
        u'name': u'testnaam',
        u'guest_role': 11,
        u'owner': p1.owner.id,
        u'active': True,
        u'id': p1.id,
        u'favourite': False,
    }
    expected_meta = {
        u'page': 1,
        u'next': None,
        u'previous': None,
        u'per_page': 10,
        u'total': 1,
        u'pages': 1,
        u'echo': None,
    }
    self.assertDictsEqual(actual, expected_meta)
    self.assertDictsEqual(actual_results, expected_results)
def _scrape_unit(self, page):
    page.prepare(self)
    if page.doc.cssselect("form#_caps_form"):
        return
    header = page.doc.cssselect("div.time_post")[0].text_content()
    pattern = re.compile(r'(Bewerkt door:)?([a-zA-Z0-9 ]+)?(\u2212)?\n((\d{2,2}/){2,2}\d{2,2}), \d{2,2}:\d{2,2}\n(\xa0\u2212\xa0bron: ([A-Za-z0-9 ,]+))?')
    try:
        groups = pattern.search(header).groups()
    except AttributeError:
        #rare error where regex fails
        page.props.date = readDate(header)
    else:
        page.props.date = readDate(groups[3])
        if groups[0] or (not groups[1]):
            page.props.author = groups[-1]
        elif groups[1]:
            page.props.author = groups[1]
    if not hasattr(page.props, "author") and page.doc.cssselect("span.author"):
        page.props.author = page.doc.cssselect("span.author")[0].text_content()
    if hasattr(page.props, "author"):
        if page.props.author:
            page.props.author = page.props.author[:98]
    page.props.text = page.doc.cssselect("#art_box2 p")
    page.props.html = html.tostring(page.doc)
    try:
        page.props.section = page.doc.cssselect("#subnav_nieuws li span.nieuws")[0].text_content()
    except IndexError:
        if page.doc.cssselect("div.dos_default h2"):
            page.props.section = "dossier: {}".format(page.doc.cssselect("div.dos_default h2")[0].text)
    yield page
def test_dates(self):
    """Test whether date deserialization works, see #66"""
    for d in ('2001-01-01', '1992-12-31T23:59', '2012-02-29T12:34:56.789', datetime.datetime.now()):
        a = amcattest.create_test_article(date=d)
        amcates.ES().flush()
        res = self.get("/api/v4/search", ids=a.id)
        self.assertEqual(toolkit.readDate(res['results'][0]['date']), toolkit.readDate(str(d)))
def parse_dateline(self, text, article):
    bits = text.split()
    if "-" in bits[-1]:
        article.date = readDate(bits[-1])
        article.medium = self.get_medium(" ".join(bits[:-1]))
    elif bits[-1].isdigit() and bits[-3].isdigit():
        article.date = readDate(" ".join(bits[-3:]))
        article.medium = self.get_medium(" ".join(bits[:-3]))
    else:
        article.medium = self.get_medium(" ".join(bits))
        article.date = None
    return article
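# Hedged illustration (hypothetical dateline strings, not from the original source):
# the three branches of parse_dateline above correspond to inputs shaped like these.
EXAMPLE_DATELINES = (
    "Some Newspaper 22-3-1980",      # "-" in the last token: parse it as the date, the rest is the medium
    "Some Newspaper 22 maart 1980",  # last and third-to-last tokens are digits: last three tokens form the date
    "Some Newspaper",                # no recognizable date: medium only, article.date is set to None
)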
def get_comments(self, doc):
    for div in doc.cssselect("#commentsList div.topDivider"):
        comment = HTMLDocument()
        comment.props.text = div.cssselect("div.wordBreak")[0]
        spans = div.cssselect("div.fBold span")
        try:
            comment.props.date = readDate(spans[1].text_content().split(" ")[1])
        except ValueError:
            comment.props.date = readDate(spans[1].text_content())
        comment.props.author = spans[0].text_content().strip()
        comment.props.url = doc.url
        yield comment
def parse_dateline(self, text, article):
    bits = text.split()
    if "-" in bits[-1]:
        article.date = readDate(bits[-1])
        article.medium = self.get_medium(" ".join(bits[:-1]))
    elif bits[-1].isdigit():
        article.date = readDate(" ".join(bits[-3:]))
        article.medium = self.get_medium(" ".join(bits[:-3]))
    else:
        article.medium = self.get_medium(" ".join(bits))
        article.date = None
    return article
def get_comments(self, doc):
    for div in doc.cssselect("#commentsList div.topDivider"):
        comment = HTMLDocument()
        comment.props.text = div.cssselect("div.wordBreak")[0]
        spans = div.cssselect("div.fBold span")
        try:
            comment.props.date = readDate(spans[1].text_content().split(" ")[1])
        except ValueError:
            comment.props.date = readDate(spans[1].text_content())
        comment.props.author = spans[0].text_content().strip()
        yield comment
def _extract(self, doc):
    #get articles from section page. return False if out of date bounds
    for li in doc.cssselect("#content ul li"):
        if "short-news" in doc.url:
            url = li.cssselect("div.text-holder a")[0].get('href')
            date = readDate(self.getdoc(url).cssselect("#content em.date a")[0].text)
        else:
            url = li.cssselect("div.heading a")[0].get('href')
            date = readDate(li.cssselect("em.date a")[0].text)
        if date.date() < self.options['date']:
            yield False
        if date.date() == self.options['date']:
            yield url
def _get_units(self):
    initial_url = self.search_url.format(p=1)
    initial_doc = self.getdoc(initial_url)
    dates = [
        readDate(article.cssselect("span.date")[0].text).date()
        for article in initial_doc.cssselect("div.subarticle")
    ]
    self.maxdate = max(dates)
    n_results = int(initial_doc.cssselect("#searchlist header h1")[0].text.strip().split(" ")[-1])
    for page in self.pinpoint_pages(n_results):
        for div in page.cssselect("div.subarticle"):
            date = readDate(div.cssselect("span.date")[0].text).date()
            if date == self.options["date"]:
                url = div.cssselect("h2 a")[0].get("href")
                yield url
def get_article_dict(art, sets=None):
    date = art.date
    if date:
        if isinstance(art.date, (str, unicode)):
            date = toolkit.readDate(date)
        date = date.isoformat()
    d = dict(
        # dublin core elements
        id=art.id,
        headline=_clean(art.headline),
        text=_clean(art.text),
        date=date,
        creator=_clean(art.author),
        # other elements
        projectid=art.project_id,
        mediumid=art.medium_id,
        medium=art.medium.name,
        byline=_clean(art.byline),
        section=_clean(art.section),
        page=art.pagenr,
        addressee=_clean(art.addressee),
        length=art.length,
        sets=sets,
    )
    d['hash'] = _get_hash(d)
    return d
def _scrape_unit(self, topic_url):
    #navigate to last page, then navigate back until comments are no longer recent
    doc = self.getdoc(topic_url)
    headline = "".join(doc.cssselect("title")[0].text_content().split("-")[:-1])
    topic_date = readDate(doc.cssselect("span#pt1")[0].text_content().strip())
    try:
        parent = Article.objects.get(headline=headline, date=topic_date)
    except Article.MultipleObjectsReturned:
        #duplicate in 99.99% of the cases
        parents = Article.objects.filter(headline=headline, date=topic_date)
        min_id = min([parent.id for parent in parents])  #deduplicate usually keeps the lowest id
        parent = parents.get(pk=min_id)
    except Article.DoesNotExist:
        parent = HTMLDocument(url=topic_url)
        parent.props.headline = headline
        parent.props.date = topic_date
        parent.props.text = doc.cssselect("div.postmain_right")[0]
        parent.props.author = doc.cssselect("span.post_sub a.username")[0].text_content().strip()
        parent.props.section = self.current_section
    for post in self.get_posts(doc):
        post.props.parent = parent
        post.props.url = hasattr(parent, 'props') and parent.props.url or parent.url
        yield post
    if isinstance(parent, Document):
        yield parent
def _scrape_unit(self, article_id):
    article = HTMLDocument(url=self.article_url.format(**locals()))
    article.prepare(self)
    article.props.text = article.doc.cssselect("font.artbody")
    if len("".join([t.text_content() for t in article.props.text])) < 100:
        return
    for i, table in enumerate(article.doc.cssselect("table")):
        if table.get('class') == "body":
            table_after_body = article.doc.cssselect("table")[i + 1]
            page_date = re.search(
                "Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
                table_after_body.text_content())
            article.props.pagenr = page_date.group(1)
            article.props.date = readDate(page_date.group(2))
    article.props.section = self.current_section
    article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
    if article.doc.cssselect(".artsubheader"):
        article.props.byline = article.doc.cssselect(".artsubheader")[0]
    if article.doc.cssselect("td.artauthor"):
        article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
    dateline_match = re.search(
        "^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
        "\n".join([n.text_content() for n in article.props.text]).strip())
    if dateline_match:
        article.props.dateline = dateline_match.group(1)
    yield article
def _scrape_unit(self, unit):
    url, section = unit
    if not section:
        section = url.split("/")[3]
    doc = self.getdoc(url)
    try:
        headline = doc.cssselect("#artikel h1")[0].text_content()
    except IndexError:
        return  #no headline, no article
    article_dict = {
        'url': url,
        'text': doc.cssselect("#broodtekst")[0],
        'headline': headline,
        'section': section,
        'author': doc.cssselect("div.author") and doc.cssselect("div.author a")[0].text or None,
        'date': readDate(doc.cssselect("#midden time")[0].get('datetime')),
        'children': []
    }
    article = HTMLDocument(**article_dict)
    article.props.html = html.tostring(doc)
    yield article
    for c in self.get_comments(article):
        c.is_comment = True
        c.parent = article
        yield c
def scrape_file(self, _html, t):
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    for div in divs:
        article = HTMLDocument()
        article.props.html = div
        article.props.headline = div.cssselect("#articleTitle")[0].text_content()
        article.props.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.props.pagenr, article.props.section = self.get_pagenum(articlepage[0].text)
        if not div.cssselect("#sourceTitle")[0].text:
            article.props.medium = Medium.get_or_create("unknown medium")
        else:
            article.props.medium = Medium.get_or_create(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.props.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def _get_units(self):
    for section in self.sections:
        page = 1
        url = self.page_url.format(**locals())
        date = _date.today()
        ipage = self.getdoc(url)
        while date >= self.options['date']:
            if not ipage.cssselect("#main ul.snelnieuws_list li.item"):
                print("\nNo articles found as far back as given date\n")
                break
            for unit in ipage.cssselect('#main ul.snelnieuws_list li.item'):
                href = unit.cssselect('a')[0].get('href')
                article = HTMLDocument(url=href)
                article.prepare(self)
                try:
                    date = readDate(article.doc.cssselect("span.datum")[0].text).date()
                except IndexError:
                    continue
                if date == self.options['date']:
                    yield article
                elif date < self.options['date']:
                    break
            page += 1
            nxt_url = self.page_url.format(**locals())
            ipage = self.getdoc(nxt_url)
def _get_units(self):
    self.open("http://www.powned.tv")
    self.open("http://cookies.publiekeomroep.nl/accept/")
    d = self.options['date']
    docs = []
    for x in range(d.day - 7, d.day + 7):
        archive_url = ARCHIVE_URL.format(**locals())
        try:
            doc = self.getdoc(archive_url)
        except HTTPError:
            pass
        else:
            docs.append(doc)
    entries = set([])
    for doc in docs:
        for li in doc.cssselect("ul.articlelist li"):
            _date = readDate(" ".join(li.cssselect("span.t")[0].text.split()[:2]) + " " + str(self.options['date'].year)).date()
            url = urljoin(archive_url, li.cssselect("a")[0].get('href'))
            entries.add((_date, url))
    for _date, url in entries:
        if _date == self.options['date']:
            article = HTMLDocument(date=_date, url=url)
            yield article
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    for div in divs:
        article = Article(metastring={})
        article.metastring['html'] = div
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text)
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def get_article(self, page):
    span = page.doc.cssselect("#detail_content span.author")[0]
    page.props.date = readDate(tostring(span).split("<br/>")[1])
    try:
        page.props.author = span.cssselect("a")[0].text
    except IndexError:
        try:
            page.props.author = tostring(span).split("<br/>")[0].split("oor:")[1].strip()[0:98]
        except IndexError:
            page.props.author = "unknown"
    try:
        page.props.source = tostring(span).split("<br/>")[1].split("bron:")[1]
    except IndexError:
        pass
    page.props.headline = page.doc.cssselect("h1")[0].text
    try:
        page.props.text = [
            page.doc.cssselect("#detail_content p.intro")[0],
            page.doc.cssselect("section.clear")[0]
        ]
    except IndexError:
        page.props.text = page.doc.cssselect("#detail_content")[0]
    return page
def _scrape_unit(self, page):
    page.prepare(self)
    page.doc = self.getdoc(page.props.url)
    author = page.doc.cssselect("div.nieuws_box p")[2]
    for script in author.cssselect("script"):
        script.drop_tree()
    try:
        page.props.author = author.cssselect("a")[0].text
    except IndexError:
        page.props.author = author.text_content().split(":")[1].strip()
    if len(page.props.author) >= 99:
        page.props.author = "author protected"
    page.props.headline = page.doc.cssselect("#container_content div.content h2")[0].text
    page.props.text = page.doc.cssselect("div.nieuws_tekst")[0]
    info = page.doc.cssselect("div.nieuws_box p")
    for p in info:
        if "Plaatsingsdatum" in p.cssselect("b")[0].text:
            page.props.date = readDate(p.text_content().split(":")[1])
            break
    for comment in self.scrape_comments(page):
        comment.is_comment = True
        yield comment
    yield page
def scrape_media(self, doc, _type):
    scrn = HTMLDocument()
    scrn.doc = doc
    try:
        scrn.props.text = scrn.doc.cssselect("div.mediaDescription")[0]
    except IndexError:
        scrn.props.text = "none"
    try:
        scrn.props.headline = "{} {}".format(scrn.doc.cssselect("div.screenshotAppName")[0].text, _type)
    except IndexError:
        scrn.props.headline = "unknown"
    author_url = "/".join(scrn.doc.cssselect("div.linkAuthor a")[0].get('href').split("/")[:-2])
    scrn = self.get_author_props(scrn, author_url)
    for obj in scrn.doc.cssselect("div.rightDetailsBlock div.detailsStatRight"):
        try:
            scrn.props.date = readDate(obj.text)
        except ValueError:
            continue
        else:
            break
    if not scrn.doc.cssselect("div.commentthread_paging"):
        yield scrn
        return
    if not scrn.doc.cssselect("div.commentthread_header div.commentthread_paging span")[1].text_content():
        for comment in self.scrape_comments(scrn):
            yield comment
    else:
        raise NotImplementedError
    yield scrn
def _scrape_unit(self, article):
    article.prepare(self)
    article.props.date = readDate(article.doc.cssselect("#datetime")[0].text_content())
    article.props.section = " > ".join(article.props.url.split("/")[4:-1])
    article.props.headline = article.doc.cssselect("#headline")[0].text_content().strip()
    article.props.text = article.doc.cssselect("#teaser") + article.doc.cssselect("#main > p")
    yield article
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        article = Article(metastring=div.text_content())
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0].text_content()
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text_content())
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text_content())
        date_str = div.cssselect("#articleDate")[0].text_content()
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def get(self, page):
    medium = unicode(getm(page.props.mid)).lower()
    headline = page.props.headline
    date = toolkit.readDate(headline).strftime('%a %d %b %Y')
    sid = NAME_SERIES_MAP[medium]
    url = SEARCH_URL % (sid, urllib.quote(date))
    try:
        episode = EPISODE_RE.search(self.getdoc(url, lxml=False)).groups(1)[0]
    except:
        #print(url)
        return []
    page.props.episode_url = urlparse.urljoin(SEARCH_URL, '/afleveringen/%s' % episode)
    url = OEMBED_URL % urlparse.urljoin(SEARCH_URL, '/afleveringen/%s' % episode)
    page.props.embed_url = url
    #print(self.getdoc(url, lxml=False))
    page.props.embed_flash = OBJECT_RE.search(self.getdoc(url, lxml=False)).groups()[0]
    del page.props.headline
    del page.props.mid
    return [page]
def filters_from_form(form_data):
    if form_data.get('datetype') == 'on':
        d = readDate(form_data.get('on_date'))
        yield 'start_date', d.isoformat()
        yield 'end_date', (d + relativedelta(days=1)).isoformat()
    elif form_data.get('datetype') == 'between':
        yield 'start_date', form_data.get('start_date')
        yield 'end_date', form_data.get('end_date')
    elif form_data.get('datetype') == 'after':
        yield 'start_date', form_data.get('start_date')
    elif form_data.get('datetype') == 'before':
        yield 'end_date', form_data.get('end_date')
    for k in form_data.keys():
        if k in FILTER_FIELDS:
            try:
                vals = form_data.getlist(k)
            except AttributeError:
                vals = form_data[k]
            # make sure vals is a list
            if isinstance(vals, (str, unicode)) or not isinstance(vals, collections.Iterable):
                vals = [vals]
            vals = [_serialize(v) for v in vals if v]
            if vals:
                yield FILTER_FIELDS[k], vals
    if 'articlesets' not in form_data:
        # filter on all sets in project
        p = Project.objects.get(pk=form_data['projects'])
        sets = [s.id for s in p.all_articlesets()]
        yield "sets", sets
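# Hedged usage sketch (not part of the original source): filters_from_form yields
# (field, value) pairs, so a caller can materialize them into a single mapping.
# The helper name below is hypothetical.
def build_filter_dict(form_data):
    # collect the yielded pairs into the filter mapping handed to the search backend
    return dict(filters_from_form(form_data))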
def scrape_2(self, _html):
    """New format as of 2014 and a few days before"""
    title = _html.cssselect("h1")[0]
    if not title.text:
        title = title.cssselect("span")[0]
    docdate = readDate(title.text.split("-")[1])
    # split body by <hr>
    items = []
    item = []
    if len(_html.cssselect("body > hr")) == 0:
        # select MS Word div wrapper
        tags = _html.cssselect("body > div.WordSection1 > *")
        if len(tags) == 0:
            raise ParseError("Document format is not supported")
    else:
        tags = _html.cssselect("body > *")
    for child in tags:
        if child.tag == "hr" or (child.tag == "div" and child.cssselect("span > hr")):
            items.append(item)
            item = []
        else:
            item.append(child)
    # first item is the index
    items = items[1:]
    for item in items:
        article = self.parse_item(item)
        if not article.date:
            article.date = docdate
        yield article
def scrape_2(self, _html):
    """New format as of 2014 and a few days before"""
    docdate = readDate(_html.cssselect("h1")[0].text.split("-")[1])
    #split body by <hr>
    items = []
    item = []
    if len(_html.cssselect("body > *")) == 1:
        tags = _html.cssselect("body > div > *")  #extra div wrapper as of 2014-04-08
    else:
        tags = _html.cssselect("body > *")
    for child in tags:
        if child.tag == "hr":
            items.append(item)
            item = []
        else:
            item.append(child)
    #first item is the index
    items = items[1:]
    for item in items:
        article = self.parse_item(item)
        if not article.date:
            article.date = docdate
        yield article
def _get_units(self):
    for page in self.search_result_pages():
        n = 0
        for table in page.cssselect("#containerContent table"):
            try:
                onclick = table.cssselect("td.result a")[0].get('onclick')
            except IndexError:
                continue
            article_id = onclick.split("('")[1].split("',")[0]
            try:
                right_td = [td for td in table.cssselect("td") if td.get('align') == 'right'][0]
                date = readDate(right_td.text_content())
            except IndexError:
                continue
            n += 1
            footer = table.cssselect("span i nobr")[0].text_content()
            pagenr_section_pattern = re.compile(
                "\({self.paper_full_name} +([a-zA-Z ]+) +, blz ([0-9]+)\)".format(**locals()))
            section, pagenr = pagenr_section_pattern.search(footer).groups()
            headline = table.cssselect("td.result a")[0].text_content().strip()
            yield (headline, date, pagenr, section.strip(), self.pdf_url.format(**locals()))
        if n == 0:
            break
def test_readdate(self):
    for s, date, american, lax in (
        ("22 maart 1980", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("22 mrt 1980", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("22/3/1980", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("1980-3-22", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("1980-3-22T01:00:05", datetime.datetime(1980, 3, 22, 1, 0, 5), False, True),
        ("1980-3-22 01:00", datetime.datetime(1980, 3, 22, 1, 0, 0), False, True),
        ("1980-3-22 01:00 PM", datetime.datetime(1980, 3, 22, 13, 0, 0), False, True),
        ("1980-3-22 01:00:00:00", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),  #time->0
        ("1980-13-22 01:00:00:00", None, False, True),  # illegal date --> None
        ("1980-13-22 01:00:00", ValueError, False, False),  # illegal date --> Error
        ("1980-3-22 27:00:00", ValueError, False, False),  # illegal time --> Error
        ("1980-3-22 23:00:00:00", ValueError, False, False),  # illegal time --> Error
        ("Sun Sep 29 18:21:12 +0000 2013", datetime.datetime(2013, 9, 29, 18, 21, 12), False, False),  # twitter (??)
        ("1/1/98", datetime.datetime(1998, 1, 1, 0, 0, 0), False, True),
        ("1/1/04", datetime.datetime(2004, 1, 1, 0, 0, 0), False, True),
        ("31/12/72", datetime.datetime(1972, 12, 31, 0, 0, 0), False, True),
        ("12/31/72", datetime.datetime(1972, 12, 31, 0, 0, 0), True, True),
        ("1/2/1972", datetime.datetime(1972, 2, 1, 0, 0, 0), False, True),
        ("1/2/1972", datetime.datetime(1972, 1, 2, 0, 0, 0), True, True),
        ("1/2/1972", datetime.datetime(1972, 1, 2, 0, 0, 0), True, True),
        ("30.09.2008", datetime.datetime(2008, 9, 30, 0, 0, 0), False, False),
        ("31. Januar 2009", datetime.datetime(2009, 1, 31, 0, 0, 0), False, True),
        ("December 31, 2009 Thursday", datetime.datetime(2009, 12, 31, 0, 0, 0), False, False),
        (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30, 0, 0, 0), False, False),
        ('31. Maerz 2003', datetime.datetime(2003, 3, 31, 0, 0, 0), False, False),
        ('September 1, 2008 Monday 12:44 PM AEST', datetime.datetime(2008, 9, 1, 12, 44), False, False),
    ):
        if inspect.isclass(date) and issubclass(date, Exception):
            self.assertRaises(date, toolkit.readDate, s, lax=False, american=american)
        else:
            date2 = toolkit.readDate(s, lax=lax, american=american)
            self.assertEqual(date2, date)
def _get_units(self):
    """
    PhpBB forum scraper
    """
    index = self.getdoc(self.index_url)
    for cat_title, cat_doc in self.get_categories(index):
        for page in self.get_pages(cat_doc):
            for fbg in page.cssselect('.forumbg'):
                for li in fbg.cssselect('.topics > li'):
                    url = urljoin(self.index_url, li.cssselect("a.topictitle")[0].get('href'))
                    _date = etree.tostring(li.cssselect("dd.lastpost")[0]).split("br />")[1]
                    date = toolkit.readDate(_date)
                    yield {
                        'date': date,
                        'object': HTMLDocument(
                            headline=li.cssselect("a.topictitle")[0].text,
                            url=url,
                            category=cat_title)
                    }
def _scrape_unit(self, thread):
    fipo = True  # First post
    thread.doc = self.getdoc(thread.props.url)
    for page in self.get_pages(thread.doc):
        for post in page.cssselect('.post'):
            ca = thread if fipo else thread.copy(parent=thread)
            ca.props.date = toolkit.readDate(post.cssselect('.author')[0].text_content()[-22:])
            ca.props.text = post.cssselect('.content')
            title = post.cssselect('.postbody h3 a')[0].text
            if fipo:
                optitle = title
            if title:
                ca.props.headline = title
            else:
                ca.props.headline = 're: {}'.format(optitle)
            try:
                ca.props.author = post.cssselect('.author strong')[0].text_content()
            except:
                try:
                    ca.props.author = post.cssselect('.author a')[0].text_content()
                except:
                    # Least reliable method
                    ca.props.author = post.cssselect('.author')[0].text_content().split()[0]
            yield ca
            fipo = False
def get_article(self, page):
    postinfo = page.doc.cssselect("div.postInfo")[0].text
    page.props.date = readDate(postinfo.split(" op ")[1].split(",")[0])
    page.props.headline = page.doc.cssselect("div.postInner h1")[0].text_content()
    page.props.text = page.doc.cssselect("div.postEntry")[0]
    page.props.author = postinfo.split(" op ")[0].split("Door")[1]
    return page
def filters_from_form(form_data):
    if form_data.get('datetype') == 'on':
        d = readDate(form_data.get('on_date'))
        yield 'start_date', d.isoformat()
        yield 'end_date', (d + relativedelta(days=2)).isoformat()
    elif form_data.get('datetype') == 'between':
        yield 'start_date', form_data.get('start_date')
        yield 'end_date', form_data.get('end_date')
    elif form_data.get('datetype') == 'after':
        yield 'start_date', form_data.get('start_date')
    elif form_data.get('datetype') == 'before':
        yield 'end_date', form_data.get('end_date')
    for k in form_data.keys():
        if k in FILTER_FIELDS:
            try:
                vals = form_data.getlist(k)
            except AttributeError:
                vals = form_data[k]
            # make sure vals is a list
            if isinstance(vals, (str, unicode)) or not isinstance(vals, collections.Iterable):
                vals = [vals]
            vals = [_serialize(v) for v in vals if v]
            if vals:
                yield FILTER_FIELDS[k], vals
    if 'articlesets' not in form_data:
        # filter on all sets in project
        p = Project.objects.get(pk=form_data['projects'])
        sets = [s.id for s in p.all_articlesets()]
        yield "sets", sets
def _scrape_unit(self, thread):
    thread = thread['object']
    fipo = True  # First post
    thread.doc = self.getdoc(thread.props.url)
    for page in self.get_pages(thread.doc):
        for post in page.cssselect('.post'):
            ca = thread if fipo else thread.copy(parent=thread)
            ca.props.date = toolkit.readDate(post.cssselect('.author')[0].text_content()[-22:])
            ca.props.text = post.cssselect('.content')
            title = unicode(post.cssselect('.postbody h3 a')[0].text)
            if fipo and title:
                optitle = title
            elif fipo:
                raise Exception("No op title found")
            if title:
                ca.props.headline = title
            else:
                ca.props.headline = 'Re: {}'.format(optitle)
            try:
                ca.props.author = unicode(post.cssselect('.author strong')[0].text_content())
            except:
                try:
                    ca.props.author = unicode(post.cssselect('.author a')[0].text_content())
                except:
                    # Least reliable method
                    ca.props.author = unicode(post.cssselect('.author')[0].text_content().split()[0])
            yield ca
            fipo = False
def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    content = article.doc.cssselect("#content-column")[0]
    article.props.date = readDate(content.cssselect("p.article-meta")[0].text.split("|")[1])
    article.props.headline = content.cssselect("h1")[0].text
    for x in [content.cssselect("h1")[0], content.cssselect("p.article-meta")[0], content.cssselect("p.sharing")[0]]:
        x.drop_tree()
    article.props.text = content.text_content()
    for block in article.doc.cssselect("#aside-column div.block"):
        title = block.cssselect("h2")[0].text
        if "Verantwoordelijk" in title and "ministerie" in title:
            article.props.author = "; ".join([a.text for a in block.cssselect("ul.list-common li a")])
            break
    try:
        if len(article.props.author) > 100:
            article.props.author = article.props.author[:100]
    except AttributeError:
        pass
    yield article
def get_article_dict(art, sets=None):
    date = art.date
    if isinstance(art.date, (str, unicode)):
        date = toolkit.readDate(date)
    date = date.isoformat()
    d = dict(
        # dublin core elements
        id=art.id,
        headline=_clean(art.headline),
        text=_clean(art.text),
        date=date,
        creator=_clean(art.author),
        # other elements
        projectid=art.project_id,
        mediumid=art.medium_id,
        medium=art.medium.name,
        byline=_clean(art.byline),
        section=_clean(art.section),
        page=art.pagenr,
        addressee=_clean(art.addressee),
        length=art.length,
        sets=sets)
    d['hash'] = _get_hash(d)
    return d
def scrape_2(self, _html):
    """New format as of 2014 and a few days before"""
    docdate = readDate(_html.cssselect("h1")[0].text.split("-")[1])
    #split body by <hr>
    items = []
    item = []
    tags = set()
    for child in _html.cssselect("body > *"):
        tags.add(child.tag)
        if child.tag == "hr":
            items.append(item)
            item = []
        else:
            item.append(child)
    #first item is the index
    items = items[1:]
    for item in items:
        article = self.parse_item(item)
        if not article.date:
            article.date = docdate
        yield article
def scrape_comments(self, page):
    for li in page.doc.cssselect("ul.uiList li.fbFeedbackPost"):
        comment = HTMLDocument(parent=page, url=page.url)
        comment.props.text = li.cssselect("div.postText")[0].text
        comment.props.author = li.cssselect("a.profileName")[0].text
        comment.props.date = readDate(li.cssselect("abbr.timestamp")[0].get('title'))
        yield comment
def scrape_comments(self, page):
    nxt = page.doc
    if len(nxt.cssselect("div.pages a.next")) >= 1:
        while len(nxt.cssselect("div.pages a.next")) >= 1:
            try:
                nxt = self.getdoc(nxt.cssselect("div.pages a.next")[0].get('href'))
            except ValueError:
                nxt = self.getdoc(urljoin(INDEX_URL, nxt.cssselect("div.pages a.next")[0].get('href')))
            for li in nxt.cssselect("ol.reacties li.hidenum"):
                comment = HTMLDocument(parent=page)
                if not ("<b>Reageer als eerste op dit bericht</b>" in etree.tostring(li) or "gebruiker verwijderd" in etree.tostring(li)):
                    try:
                        comment.props.text = li.cssselect("div.reactie-body")[0]
                        comment.props.author = li.cssselect("strong")[0].text
                        comment.props.date = readDate(li.cssselect("span.tijdsverschil")[0].get('publicationdate'))
                    except IndexError:
                        pass
                    else:
                        if comment.props.date.date() == self.options['date']:
                            yield comment
    else:
        for li in nxt.cssselect("ol.reacties li.hidenum"):
            comment = HTMLDocument(parent=page)
            if not "<b>Reageer als eerste op dit bericht</b>" in etree.tostring(li):
                try:
                    comment.props.text = li.cssselect("div.reactie-body")[0]
                    comment.props.author = li.cssselect("strong")[0].text
                    comment.props.date = readDate(li.cssselect("span.tijdsverschil")[0].get('publicationdate'))
                    if comment.props.date.date() == self.options['date']:
                        yield comment
                except IndexError:
                    pass
def _get_units(self):
    for page in self.get_pages():
        for post in page.cssselect("div.post"):
            article = HTMLDocument(
                url=post.cssselect("span.h3 a")[0].get('href'),
                headline=post.cssselect("span.h3")[0].text_content().strip(),
                date=readDate(post.cssselect("span.comments span")[0].text.replace(".", "-").split(" ")[1]))
            yield article
def get_comments(self, page):
    for li in page.doc.cssselect("#detail_reactions #reaction ul.clear li"):
        comment = HTMLDocument()
        comment.props.author = li.cssselect("cite")[0].text.strip()
        comment.props.text = li.cssselect("blockquote")[0]
        comment.props.date = readDate(li.cssselect("span.time")[0].text)
        comment.parent = page
        yield comment
def get_comments(self, page):
    for article in page.doc.cssselect("#comments article"):
        comment = HTMLDocument(parent=page)
        footer = article.cssselect("footer")[0].text_content().split(" | ")
        comment.props.date = readDate(footer[1])
        comment.props.author = footer[0]
        comment.props.text = article.cssselect("p")
        yield comment
def get_article(self, page):
    page.props.date = readDate(page.doc.cssselect("#pt1")[0].text_content())
    page.props.author = page.doc.cssselect("span.post_sub a.username")[0].text
    page.props.headline = page.doc.cssselect("div.fieldholder h1")[0].text_content()
    page.props.text = page.doc.cssselect("div.postmain_right")[0]
    page.coords = ''
    return page