Example #1
def _is_date(string):
    try:
        toolkit.readDate(string)
    except ValueError:
        return False

    return True
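A minimal usage sketch for the helper above; it assumes only that toolkit.readDate raises ValueError for strings it cannot parse, which is exactly what the except clause relies on. The inputs are hypothetical:

# Hypothetical calls; accepted formats are whatever toolkit.readDate supports.
_is_date("2012-01-01")      # -> True, ISO dates parse
_is_date("22 maart 1980")   # -> True, Dutch month names parse (see the readDate tests further down)
_is_date("not a date")      # -> False, readDate raises ValueError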
Example #2
    def get_comments(self, page):
        for li in page.doc.cssselect("ul.commentlist li.comment"):
            comment = HTMLDocument()
            comment.parent = page
            try:
                dateauthor = li.cssselect("div.commentsbox")[0].text_content()
            except IndexError:
                comment.props.author = li.text_content().split(":")[0]

                comment.props.date = readDate(":".join(li.text_content().split(":")[1:2]))
                try:
                    comment.props.text = li.cssselect("div.comment-text-reply")[0]
                except UnicodeDecodeError:
                    continue
            else:
                comment.props.author = dateauthor.split("Geplaatst door")[1].split(" op ")[0]
                try:
                    li.cssselect("div.commentsbox a")[0].drop_tree()
                except:
                    pass
                comment.props.date = readDate(dateauthor.split(" op ")[1])
                try:
                    comment.props.text = li.cssselect("div.comment-text")[0]
                except UnicodeDecodeError:
                    continue
            yield comment
Example #3
    def find_start(self, n_articles):
        """Intelligently find the page at which the articles are for the given date, saves hours"""
        jump_distance = n_articles / 4.
        index = n_articles / 2
        offset = int(math.ceil((index) / 10) * 10)
        #find an article with the right date
        while True:

            offset = int(math.ceil(index / 10) * 10)
            docs = self.getresponse(offset)["docs"]
            dates = [readDate(d["date"]).date() for d in docs]

            if self.options['date'] in dates:
                break
            elif self.options['date'] > dates[0]:
                index -= jump_distance
            elif self.options['date'] < dates[0]:
                index += jump_distance

            if jump_distance < 10: return 0
            jump_distance /= 2.

        #go back to first occurrence
        i = 0
        while self.options['date'] in dates:
            i += 1
            offset -= 10 * i
            if offset < 0: return 0
            docs = self.getresponse(offset)["docs"]
            dates = [readDate(d["date"]).date() for d in docs]
        return offset
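The loop above is essentially a coarse binary search over a date-sorted, paginated result list: probe an offset, compare that page's dates to the target, and halve the jump until the step drops below one page. A simplified standalone sketch of the probing step (the original additionally walks back to the first page containing the date), with a hypothetical get_dates(offset) callable standing in for self.getresponse(offset)["docs"] and assuming the listing runs from newest to oldest:

import math

def find_start_offset(get_dates, n_articles, target_date):
    # get_dates(offset) is assumed to return the article dates on the page at that offset.
    jump = n_articles / 4.0
    index = n_articles / 2.0
    while True:
        offset = int(math.ceil(index / 10) * 10)  # offsets are requested in steps of 10
        dates = get_dates(offset)
        if target_date in dates:
            return offset              # a page containing the target date was found
        elif target_date > dates[0]:
            index -= jump              # target is newer than this page: move towards the front
        else:
            index += jump              # target is older than this page: move towards the back
        if jump < 10:
            return 0                   # step size below one page: give up, start from offset 0
        jump /= 2.0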
Example #4
    def test_post(self):
        """Test whether posting and retrieving an article works correctly"""
        self.set_up()

        p = amcattest.create_test_project(owner=self.user)
        s = amcattest.create_test_set(project=p)
        a = {
            'date': datetime.datetime.now().isoformat(),
            'headline': 'Test child',
            'medium': 'Fantasy',
            'text': 'Hello Universe',
            'pagenr': 1,
            'url': 'http://example.org',
            'uuid': 'c691fadf-3c45-4ed6-93fe-f035b5f500af',
        }

        url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(**locals())
        self.post(url, a, self.user)
        amcates.ES().flush()

        res = self.get(url)["results"]
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0]["headline"], a['headline'])
        self.assertEqual(toolkit.readDate(res[0]["date"]), toolkit.readDate(a['date']))
        self.assertEqual(res[0]["uuid"], a['uuid'])
Example #5
 def parse_item(self, item):
     #item: a list of html tags
     article = HTMLDocument()
     for tag in item:
         if tag.tag == "p":
             if hasattr(article.props, 'text'):
                 article.props.text.append(tag)
             else:
                 article.props.text = [tag]
         elif tag.tag == "h2":
             article.props.headline = tag.text
         elif tag.tag == "i":
             bits = tag.text.split()
             if not bits: # empty clipping (knipsel)
                 return
             if "-" in bits[-1]:
                 try:
                     article.props.date = readDate(bits[-1])
                 except ValueError:
                     article.props.date = None
                 article.props.medium = self.get_medium(" ".join(bits[:-1]))
             elif bits[-1].isdigit():
                 try:
                     article.props.date = readDate(" ".join(bits[-3:]))
                 except ValueError:
                     article.props.date = None
                 article.props.medium = self.get_medium(" ".join(bits[:-3]))
             else:
                 article.props.medium = self.get_medium(" ".join(bits))
                 article.props.date = None
     return article
Example #6
    def test_post(self):
        """Test whether posting and retrieving an article works correctly"""
        self.set_up()

        p = amcattest.create_test_project(owner=self.user)
        s = amcattest.create_test_set(project=p)
        a = {
            'date': datetime.datetime.now().isoformat(),
            'headline': 'Test child',
            'medium': 'Fantasy',
            'text': 'Hello Universe',
            'pagenr': 1,
            'url': 'http://example.org',
            'uuid': 'c691fadf-3c45-4ed6-93fe-f035b5f500af',
        }

        url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(
            **locals())
        self.post(url, a, self.user)
        amcates.ES().flush()

        res = self.get(url)["results"]
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0]["headline"], a['headline'])
        self.assertEqual(toolkit.readDate(res[0]["date"]),
                         toolkit.readDate(a['date']))
        self.assertEqual(res[0]["uuid"], a['uuid'])
Example #7
    def get_comments(self, page):
        for li in page.doc.cssselect("ul.commentlist li.comment"):
            comment = HTMLDocument()
            comment.parent = page
            try:
                dateauthor = li.cssselect("div.commentsbox")[0].text_content()
            except IndexError:
                comment.props.author = li.text_content().split(":")[0]

                comment.props.date = readDate(":".join(
                    li.text_content().split(":")[1:2]))
                try:
                    comment.props.text = li.cssselect(
                        "div.comment-text-reply")[0]
                except UnicodeDecodeError:
                    continue
            else:
                comment.props.author = dateauthor.split(
                    "Geplaatst door")[1].split(" op ")[0]
                try:
                    li.cssselect("div.commentsbox a")[0].drop_tree()
                except:
                    pass
                comment.props.date = readDate(dateauthor.split(" op ")[1])
                try:
                    comment.props.text = li.cssselect("div.comment-text")[0]
                except UnicodeDecodeError:
                    continue
            yield comment
Example #8
    def _scrape_unit(self, li):
        a = li.cssselect("li > a")[0]
        article = HTMLDocument(url=urljoin(self.index_url, a.get('href')))
        article.props.headline = a.text
        article.props.kicker = li.cssselect("div.infoboard a.kicker")[0].text
        article.props.intro = li.cssselect("p")
        article.props.date = readDate(
            li.cssselect("div.infoboard span.time")[0].text_content())
        article.prepare(self)
        articletime = article.doc.cssselect("p.articletime")[0].text_content()
        if len(articletime.split("|")) > 2:
            article.props.date = readDate(" ".join(
                articletime.split("|")[:-1]))
            article.props.author = articletime.split("|")[-1]
        else:
            article.props.author = articletime.strip()
            if " Korrespondent" in article.props.author:
                article.props.author = article.props.author.split(
                    "Korrespondent")[1].strip()

        for ad in article.doc.cssselect("div.noprint"):
            ad.drop_tree()
        article.props.text = article.doc.cssselect(
            "p.articlelead, #articletext")
        article.props.section = article.doc.cssselect(
            "div.headtop span.sitetop")[0].text_content()
        yield article
Example #9
    def test_post(self):
        """Test whether posting and retrieving an article works correctly"""
        a = test_article()
        
        res = self._post_articles(a)
        self.assertEqual(set(res.keys()), {'id'}) # POST should only return IDs

        res = self._get_article(aid=res['id'])
        self.assertEqual(res["headline"], a['headline'])
        self.assertEqual(toolkit.readDate(res["date"]), toolkit.readDate(a['date']))
        self.assertNotIn("text", res.keys())
        self.assertIsNotNone(res["uuid"])
        
        res = self._get_article(aid=res['id'], text=True)
        self.assertEqual(res["text"], a['text'])

        res = self._get_articles()["results"]
        self.assertEqual(len(res), 1)
       

        # can we post explicit UUID?
        self.setUp_set()
        a['uuid'] = str(uuid4())
        self._post_articles(a)
        res = self._get_articles()["results"]
        self.assertEqual(res[0]["uuid"], a['uuid'])
Example #10
    def test_get(self):
        p1 = amcattest.create_test_project(name="testnaam", description="testdescription", insert_date='2012-01-01')

        actual = self.get(ProjectResource, id=p1.id)

        actual_results = actual.pop("results")
        self.assertEqual(len(actual_results), 1)
        actual_results = actual_results[0]

        date = actual_results.pop('insert_date')
        readDate(date)# check valid date, not much more to check here?

        expected_results={u'insert_user': p1.insert_user.id,
                          u'description': 'testdescription',
                          u'name': u'testnaam',
                          u'guest_role': 11,
                          u'owner': p1.owner.id,
                          u'active': True,
                           u'id': p1.id,
                          u'favourite' : False,
        }

        expected_meta = {
            u'page' : 1,
            u'next' : None,
            u'previous' : None,
            u'per_page' : 10,
            u'total' : 1,
            u'pages' : 1,
            u'echo' : None,
            }

        self.assertDictsEqual(actual, expected_meta)
        self.assertDictsEqual(actual_results, expected_results)
Example #11
    def _scrape_unit(self, page): 
        page.prepare(self)
        if page.doc.cssselect("form#_caps_form"):
            return
        header = page.doc.cssselect("div.time_post")[0].text_content()
        pattern = re.compile(r'(Bewerkt door:)?([a-zA-Z0-9 ]+)?(\u2212)?\n((\d{2,2}/){2,2}\d{2,2}), \d{2,2}:\d{2,2}\n(\xa0\u2212\xa0bron: ([A-Za-z0-9 ,]+))?')
        try:
            groups = pattern.search(header).groups()
        except AttributeError: #rare error where regex fails
            page.props.date = readDate(header)
        else:
            page.props.date = readDate(groups[3])
            if groups[0] or (not groups[1]):
                page.props.author = groups[-1]
            elif groups[1]:
                page.props.author = groups[1]

        if not hasattr(page.props,"author") and page.doc.cssselect("span.author"):
            page.props.author = page.doc.cssselect("span.author")[0].text_content()

        if hasattr(page.props,"author"):
            if page.props.author:
                page.props.author = page.props.author[:98]

        page.props.text = page.doc.cssselect("#art_box2 p")
        page.props.html = html.tostring(page.doc)
        try:
            page.props.section = page.doc.cssselect("#subnav_nieuws li span.nieuws")[0].text_content()
        except IndexError:
            if page.doc.cssselect("div.dos_default h2"):
                page.props.section = "dossier: {}".format(page.doc.cssselect("div.dos_default h2")[0].text)
        yield page
Example #12
def _is_date(string):
    try:
        toolkit.readDate(string)
    except ValueError:
        return False

    return True
Example #13
    def find_start(self, n_articles):
        """Intelligently find the page at which the articles are for the given date, saves hours"""
        jump_distance = n_articles / 4.
        index = n_articles / 2
        offset = int(math.ceil((index) / 10) * 10)
        #find an article with the right date
        while True:

            offset = int(math.ceil(index / 10) * 10)
            docs = self.getresponse(offset)["docs"]
            dates = [readDate(d["date"]).date() for d in docs]

            if self.options['date'] in dates:
                break
            elif self.options['date'] > dates[0]:
                index -= jump_distance
            elif self.options['date'] < dates[0]:
                index += jump_distance

            if jump_distance < 10: return 0
            jump_distance /= 2.

        #go back to first occurrence
        i = 0
        while self.options['date'] in dates:
            i += 1
            offset -= 10 * i 
            if offset < 0: return 0
            docs = self.getresponse(offset)["docs"]
            dates = [readDate(d["date"]).date() for d in docs]
        return offset
Example #14
 def test_dates(self):
     """Test whether date deserialization works, see #66"""
     for d in ('2001-01-01', '1992-12-31T23:59', '2012-02-29T12:34:56.789', datetime.datetime.now()):
         a = amcattest.create_test_article(date=d)
         amcates.ES().flush()
         res = self.get("/api/v4/search", ids=a.id)
         self.assertEqual(toolkit.readDate(res['results'][0]['date']), toolkit.readDate(str(d)))
Example #15
 def parse_dateline(self, text, article):
     bits = text.split()
     if "-" in bits[-1]:
         article.date = readDate(bits[-1])
         article.medium = self.get_medium(" ".join(bits[:-1]))
     elif bits[-1].isdigit() and bits[-3].isdigit():
         article.date = readDate(" ".join(bits[-3:]))
         article.medium = self.get_medium(" ".join(bits[:-3]))
     else:
         article.medium = self.get_medium(" ".join(bits))
         article.date = None
     return article
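A small runnable sketch of the three dateline shapes this method distinguishes, using hypothetical inputs and only the token handling (readDate and get_medium are applied to these tokens exactly as in the method itself):

# Hypothetical datelines illustrating each branch.
for text in ("De Volkskrant 31-12-2012",           # "-" in the last token
             "NRC Handelsblad 31 december 2012",   # day and year are plain digits
             "Trouw"):                             # no recognisable date
    bits = text.split()
    if "-" in bits[-1]:
        print("date:", bits[-1], "| medium:", " ".join(bits[:-1]))
    elif bits[-1].isdigit() and bits[-3].isdigit():
        print("date:", " ".join(bits[-3:]), "| medium:", " ".join(bits[:-3]))
    else:
        print("no date | medium:", " ".join(bits))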
Example #16
 def get_comments(self, doc):
     for div in doc.cssselect("#commentsList div.topDivider"):
         comment = HTMLDocument()
         comment.props.text = div.cssselect("div.wordBreak")[0]
         spans = div.cssselect("div.fBold span")
         try:
             comment.props.date = readDate(spans[1].text_content().split(" ")[1])
         except ValueError:
             comment.props.date = readDate(spans[1].text_content())
         comment.props.author = spans[0].text_content().strip()
         comment.props.url = doc.url
         yield comment
 def parse_dateline(self, text, article):
     bits = text.split()
     if "-" in bits[-1]:
         article.date = readDate(bits[-1])
         article.medium = self.get_medium(" ".join(bits[:-1]))
     elif bits[-1].isdigit():
         article.date = readDate(" ".join(bits[-3:]))
         article.medium = self.get_medium(" ".join(bits[:-3]))
     else:
         article.medium = self.get_medium(" ".join(bits))
         article.date = None
     return article
Example #18
 def get_comments(self, doc):
     for div in doc.cssselect("#commentsList div.topDivider"):
         comment = HTMLDocument()
         comment.props.text = div.cssselect("div.wordBreak")[0]
         spans = div.cssselect("div.fBold span")
         try:
             comment.props.date = readDate(
                 spans[1].text_content().split(" ")[1])
         except ValueError:
             comment.props.date = readDate(spans[1].text_content())
         comment.props.author = spans[0].text_content().strip()
         yield comment
Example #19
 def _extract(self, doc):
     #get articles from section page. return False if out of date bounds
     for li in doc.cssselect("#content ul li"):
         if "short-news" in doc.url:
             url = li.cssselect("div.text-holder a")[0].get('href')
             date = readDate(self.getdoc(url).cssselect("#content em.date a")[0].text)
         else:
             url = li.cssselect("div.heading a")[0].get('href')
             date = readDate(li.cssselect("em.date a")[0].text)
         if date.date() < self.options['date']:
             yield False
         if date.date() == self.options['date']:
             yield url
Example #20
 def _get_units(self):
     initial_url = self.search_url.format(p=1)
     initial_doc = self.getdoc(initial_url)
     dates = [
         readDate(article.cssselect("span.date")[0].text).date()
         for article in initial_doc.cssselect("div.subarticle")
     ]
     self.maxdate = max(dates)
     n_results = int(initial_doc.cssselect("#searchlist header h1")[0].text.strip().split(" ")[-1])
     for page in self.pinpoint_pages(n_results):
         for div in page.cssselect("div.subarticle"):
             date = readDate(div.cssselect("span.date")[0].text).date()
             if date == self.options["date"]:
                 url = div.cssselect("h2 a")[0].get("href")
                 yield url
Example #21
def get_article_dict(art, sets=None):
    date = art.date
    if date:
        if isinstance(art.date, (str, unicode)):
            date = toolkit.readDate(date)
        date = date.isoformat()
    d = dict(
        # dublin core elements
        id = art.id,
        headline=_clean(art.headline),
        text=_clean(art.text),
        date=date,
        creator=_clean(art.author),

        # other elements
        projectid=art.project_id,
        mediumid=art.medium_id,
        medium=art.medium.name,
        byline=_clean(art.byline),
        section=_clean(art.section),
        page=art.pagenr,
        addressee=_clean(art.addressee),
        length=art.length,
        sets = sets
        )

    d['hash'] = _get_hash(d)
    return d
Example #22
    def _scrape_unit(self, topic_url):
        #navigate to last page, then navigate back until comments are no longer recent
        doc = self.getdoc(topic_url)
        headline = "".join(
            doc.cssselect("title")[0].text_content().split("-")[:-1])
        topic_date = readDate(
            doc.cssselect("span#pt1")[0].text_content().strip())
        try:
            parent = Article.objects.get(headline=headline, date=topic_date)
        except Article.MultipleObjectsReturned:  #duplicate in 99.99% of the cases
            parents = Article.objects.filter(headline=headline,
                                             date=topic_date)
            min_id = min([parent.id for parent in parents
                          ])  #deduplicate usually keeps the lowest id
            parent = parents.get(pk=min_id)
        except Article.DoesNotExist:
            parent = HTMLDocument(url=topic_url)
            parent.props.headline = headline
            parent.props.date = topic_date
            parent.props.text = doc.cssselect("div.postmain_right")[0]
            parent.props.author = doc.cssselect(
                "span.post_sub a.username")[0].text_content().strip()
            parent.props.section = self.current_section

        for post in self.get_posts(doc):
            post.props.parent = parent
            post.props.url = hasattr(
                parent, 'props') and parent.props.url or parent.url
            yield post

        if isinstance(parent, Document):
            yield parent
Example #23
    def _scrape_unit(self, article_id):
        article = HTMLDocument(url=self.article_url.format(**locals()))
        article.prepare(self)
        article.props.text = article.doc.cssselect("font.artbody")
        if len("".join([t.text_content() for t in article.props.text])) < 100:
            return
        for i, table in enumerate(article.doc.cssselect("table")):
            if table.get('class') == "body":
                table_after_body = article.doc.cssselect("table")[i + 1]
        page_date = re.search(
            "Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
            table_after_body.text_content())
        article.props.pagenr = page_date.group(1)
        article.props.date = readDate(page_date.group(2))
        article.props.section = self.current_section
        article.props.headline = article.doc.cssselect(
            "td.artheader")[0].text_content().strip()
        if article.doc.cssselect(".artsubheader"):
            article.props.byline = article.doc.cssselect(".artsubheader")[0]
        if article.doc.cssselect("td.artauthor"):
            article.props.author = article.doc.cssselect(
                "td.artauthor")[0].text.split(":")[1].strip()
        dateline_match = re.search(
            "^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
            "\n".join([n.text_content() for n in article.props.text]).strip())
        if dateline_match:
            article.props.dateline = dateline_match.group(1)

        yield article
Example #24
    def _scrape_unit(self, unit): 
        url, section = unit
        if not section:
            section = url.split("/")[3]
        doc = self.getdoc(url)

        try:
            headline = doc.cssselect("#artikel h1")[0].text_content()
        except IndexError:
            return #no headline, no article

        article_dict = {
            'url' : url,
            'text' : doc.cssselect("#broodtekst")[0],
            'headline' : headline,
            'section' : section,
            'author' : doc.cssselect("div.author") and doc.cssselect("div.author a")[0].text or None,
            'date' : readDate(doc.cssselect("#midden time")[0].get('datetime')),
            'children' : []
            }

        article = HTMLDocument(**article_dict)
        article.props.html = html.tostring(doc)
        yield article
        
        for c in self.get_comments(article):
            c.is_comment = True
            c.parent = article
            yield c
Example #25
    def scrape_file(self, _html, t):
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')
            ]

        for div in divs:
            article = HTMLDocument()
            article.props.html = div
            article.props.headline = div.cssselect(
                "#articleTitle")[0].text_content()
            article.props.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.props.pagenr, article.props.section = self.get_pagenum(
                    articlepage[0].text)

            if not div.cssselect("#sourceTitle")[0].text:
                article.props.medium = Medium.get_or_create("unknown medium")
            else:
                article.props.medium = Medium.get_or_create(
                    div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.props.date = readDate(date_str)
            except ValueError:
                log.error(
                    "parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Example #26
    def _get_units(self):
        for section in self.sections:
            page = 1
            url = self.page_url.format(**locals())

            date = _date.today()
            ipage = self.getdoc(url)
            while date >= self.options['date']:
                if not ipage.cssselect("#main ul.snelnieuws_list li.item"):
                    print("\nNo articles found as far back as given date\n")
                    break
                for unit in ipage.cssselect(
                        '#main ul.snelnieuws_list li.item'):
                    href = unit.cssselect('a')[0].get('href')
                    article = HTMLDocument(url=href)
                    article.prepare(self)
                    try:
                        date = readDate(
                            article.doc.cssselect("span.datum")
                            [0].text).date()
                    except IndexError:
                        continue
                    if date == self.options['date']:
                        yield article
                    elif date < self.options['date']:
                        break

                page += 1
                nxt_url = self.page_url.format(**locals())
                ipage = self.getdoc(nxt_url)
Example #27
    def _get_units(self):
        self.open("http://www.powned.tv")
        self.open("http://cookies.publiekeomroep.nl/accept/")
        d = self.options['date']
        docs = []
        for x in range(d.day - 7, d.day + 7):
            archive_url = ARCHIVE_URL.format(**locals())
            try:
                doc = self.getdoc(archive_url)
            except HTTPError:
                pass
            else:
                docs.append(doc)

        entries = set([])
        for doc in docs:
            for li in doc.cssselect("ul.articlelist li"):

                _date = readDate(
                    " ".join(li.cssselect("span.t")[0].text.split()[:2]) +
                    " " + str(self.options['date'].year)).date()
                url = urljoin(archive_url, li.cssselect("a")[0].get('href'))
                entries.add((_date, url))

        for _date, url in entries:

            if _date == self.options['date']:
                article = HTMLDocument(date=_date, url=url)
                yield article
Example #28
    def scrape_1(self, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')
            ]

        for div in divs:
            article = Article(metastring={})
            article.metastring['html'] = div
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            article.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.pagenr, article.section = self.get_pagenum(
                    articlepage[0].text)

            article.medium = self.get_medium(
                div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error(
                    "parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Example #29
 def get_article(self, page):
     span = page.doc.cssselect("#detail_content span.author")[0]
     page.props.date = readDate(tostring(span).split("<br/>")[1])
     try:
         page.props.author = span.cssselect("a")[0].text
     except IndexError:
         try:
             page.props.author = tostring(span).split("<br/>")[0].split(
                 "oor:")[1].strip()[0:98]
         except IndexError:
             page.props.author = "unknown"
     try:
         page.props.source = tostring(span).split("<br/>")[1].split(
             "bron:")[1]
     except IndexError:
         pass
     page.props.headline = page.doc.cssselect("h1")[0].text
     try:
         page.props.text = [
             page.doc.cssselect("#detail_content p.intro")[0],
             page.doc.cssselect("section.clear")[0]
         ]
     except IndexError:
         page.props.text = page.doc.cssselect("#detail_content")[0]
     return page
Example #30
    def _scrape_unit(self, page):

        page.prepare(self)
        page.doc = self.getdoc(page.props.url)
        author = page.doc.cssselect("div.nieuws_box p")[2]
        for script in author.cssselect("script"):
            script.drop_tree()
        try:
            page.props.author = author.cssselect("a")[0].text
        except IndexError:
            page.props.author = author.text_content().split(":")[1].strip()
        if len(page.props.author) >= 99:
            page.props.author = "author protected"

        page.props.headline = page.doc.cssselect(
            "#container_content div.content h2")[0].text
        page.props.text = page.doc.cssselect("div.nieuws_tekst")[0]
        info = page.doc.cssselect("div.nieuws_box p")
        for p in info:
            if "Plaatsingsdatum" in p.cssselect("b")[0].text:
                page.props.date = readDate(p.text_content().split(":")[1])
                break

        for comment in self.scrape_comments(page):
            comment.is_comment = True
            yield comment

        yield page
Example #31
    def scrape_media(self,doc,_type):
        scrn = HTMLDocument()
        scrn.doc = doc
        try:
            scrn.props.text = scrn.doc.cssselect("div.mediaDescription")[0]
        except IndexError:
            scrn.props.text = "none"

        try:
            scrn.props.headline = "{} {}".format(scrn.doc.cssselect("div.screenshotAppName")[0].text,_type)
        except IndexError:
            scrn.props.headline = "unknown"

        author_url = "/".join(scrn.doc.cssselect("div.linkAuthor a")[0].get('href').split("/")[:-2])
        scrn = self.get_author_props(scrn, author_url)

        for obj in scrn.doc.cssselect("div.rightDetailsBlock div.detailsStatRight"):
            try:
                scrn.props.date = readDate(obj.text)
            except ValueError:
                continue
            else:
                break

        if not scrn.doc.cssselect("div.commentthread_paging"):
            yield scrn;return
        if not scrn.doc.cssselect("div.commentthread_header div.commentthread_paging span")[1].text_content():
            for comment in self.scrape_comments(scrn):
                yield comment
        else:
            raise NotImplementedError

        yield scrn
Example #32
 def _scrape_unit(self, article):
     article.prepare(self)
     article.props.date = readDate(article.doc.cssselect("#datetime")[0].text_content())
     article.props.section = " > ".join(article.props.url.split("/")[4:-1])
     article.props.headline = article.doc.cssselect("#headline")[0].text_content().strip()
     article.props.text = article.doc.cssselect("#teaser") + article.doc.cssselect("#main > p")
     yield article
Example #33
    def scrape_1(self, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
        else:
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = Article(metastring=div.text_content())
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            article.text = div.cssselect("#articleIntro")[0].text_content()
            articlepage = div.cssselect("#articlePage")

            if articlepage:
                article.pagenr, article.section = self.get_pagenum(articlepage[0].text_content())

            article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text_content())
            date_str = div.cssselect("#articleDate")[0].text_content()

            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error("parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Example #34
 def _scrape_unit(self, article_id):
     article = HTMLDocument(url = self.article_url.format(**locals()))
     article.prepare(self)
     article.props.text = article.doc.cssselect("font.artbody")
     if len("".join([t.text_content() for t in article.props.text])) < 100:
         return
     for i, table in enumerate(article.doc.cssselect("table")):
         if table.get('class') == "body":
             table_after_body = article.doc.cssselect("table")[i + 1]
     page_date = re.search(
         "Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
         table_after_body.text_content())
     article.props.pagenr = page_date.group(1)
     article.props.date = readDate(page_date.group(2))
     article.props.section = self.current_section
     article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
     if article.doc.cssselect(".artsubheader"):
         article.props.byline = article.doc.cssselect(".artsubheader")[0]
     if article.doc.cssselect("td.artauthor"):
         article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
     dateline_match = re.search(
         "^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
         "\n".join([n.text_content() for n in article.props.text]).strip())
     if dateline_match:
         article.props.dateline = dateline_match.group(1)
                                       
     yield article
    def get(self, page):
        medium = unicode(getm(page.props.mid)).lower()
        headline = page.props.headline

        date = toolkit.readDate(headline).strftime('%a %d %b %Y')
        sid = NAME_SERIES_MAP[medium]

        url = SEARCH_URL % (sid, urllib.quote(date))

        try:
            episode = EPISODE_RE.search(self.getdoc(url, lxml=False)).groups(1)[0]
        except:
            #print(url)
            return []

        page.props.episode_url = urlparse.urljoin(SEARCH_URL, '/afleveringen/%s' % episode)
        
        url = OEMBED_URL % urlparse.urljoin(SEARCH_URL, '/afleveringen/%s' % episode)
        page.props.embed_url = url

        #print(self.getdoc(url, lxml=False))

        page.props.embed_flash = OBJECT_RE.search(self.getdoc(url, lxml=False)).groups()[0]

        del page.props.headline
        del page.props.mid

        return [page,]
Example #36
def filters_from_form(form_data):
    if form_data.get('datetype') == 'on':
        d = readDate(form_data.get('on_date'))
        yield 'start_date', d.isoformat()
        yield 'end_date', (d + relativedelta(days=1)).isoformat()
    elif form_data.get('datetype') == 'between':
        yield 'start_date', form_data.get('start_date')
        yield 'end_date', form_data.get('end_date')
    elif form_data.get('datetype') == 'after':
        yield 'start_date', form_data.get('start_date')
    elif form_data.get('datetype') == 'before':
        yield 'end_date', form_data.get('end_date')
        
    
    for k in form_data.keys():
        if  k in FILTER_FIELDS:
            try:
                vals = form_data.getlist(k)
            except AttributeError:
                vals = form_data[k]
                # make sure vals is a list
                if isinstance(vals, (str, unicode)) or not isinstance(vals, collections.Iterable):
                    vals = [vals]
            vals = [_serialize(v) for v in vals if v]
            if vals:
                yield FILTER_FIELDS[k], vals
                
    if 'articlesets' not in form_data:
        # filter on all sets in project
        p = Project.objects.get(pk=form_data['projects'])
        sets = [s.id for s in p.all_articlesets()]
        yield "sets", sets
Example #37
    def scrape_2(self, _html):
        """New format as of 2014 and a few days before"""
        title = _html.cssselect("h1")[0]
        if not title.text:
            title = title.cssselect("span")[0]
        docdate = readDate(title.text.split("-")[1])

        # split body by <hr>
        items = []
        item = []
        
        if len(_html.cssselect("body > hr")) == 0:
            # select MS Word div wrapper
            tags = _html.cssselect("body > div.WordSection1 > *")
            if len(tags) == 0:
                    raise ParseError("Document format is not supported")

        else:
            tags = _html.cssselect("body > *")

        for child in tags:
            if child.tag == "hr" or (child.tag == "div" and child.cssselect("span > hr")):
                items.append(item)
                item = []
            else:
                item.append(child)

        # first item is the index
        items = items[1:]
        for item in items:
            article = self.parse_item(item)
            if not article.date:
                article.date = docdate
            yield article
Example #38
    def scrape_2(self, _html):
        """New format as of 2014 and a few days before"""
        docdate = readDate(_html.cssselect("h1")[0].text.split("-")[1])

        #split body by <hr>
        items = []
        item = []
        if len(_html.cssselect("body > *")) == 1:
            tags = _html.cssselect(
                "body > div > *")  #extra div wrapper as of 2014-04-08
        else:
            tags = _html.cssselect("body > *")

        for child in tags:
            if child.tag == "hr":
                items.append(item)
                item = []
            else:
                item.append(child)

        #first item is the index
        items = items[1:]
        for item in items:
            article = self.parse_item(item)
            if not article.date:
                article.date = docdate
            yield article
Example #39
    def _get_units(self):
        for section in self.sections:
            page = 1
            url = self.page_url.format(**locals())

            date = _date.today()
            ipage = self.getdoc(url)
            while date >= self.options['date']:
                if not ipage.cssselect("#main ul.snelnieuws_list li.item"):
                    print("\nNo articles found as far back as given date\n")
                    break
                for unit in ipage.cssselect('#main ul.snelnieuws_list li.item'):
                    href = unit.cssselect('a')[0].get('href')
                    article = HTMLDocument(url=href)
                    article.prepare(self)
                    try:
                        date = readDate(article.doc.cssselect("span.datum")[0].text).date()
                    except IndexError:
                        continue
                    if date == self.options['date']: 
                        yield article
                    elif date < self.options['date']:
                        break 

                page += 1
                nxt_url = self.page_url.format(**locals())
                ipage = self.getdoc(nxt_url)
Example #40
 def _get_units(self):
     for page in self.search_result_pages():
         n = 0
         for table in page.cssselect("#containerContent table"):
             try:
                 onclick = table.cssselect("td.result a")[0].get('onclick')
             except IndexError:
                 continue
             article_id = onclick.split("('")[1].split("',")[0]
             try:
                 right_td = [
                     td for td in table.cssselect("td")
                     if td.get('align') == 'right'
                 ][0]
                 date = readDate(right_td.text_content())
             except IndexError:
                 continue
             n += 1
             footer = table.cssselect("span i nobr")[0].text_content()
             pagenr_section_pattern = re.compile(
                 "\({self.paper_full_name} +([a-zA-Z ]+) +, blz ([0-9]+)\)".
                 format(**locals()))
             section, pagenr = pagenr_section_pattern.search(
                 footer).groups()
             headline = table.cssselect(
                 "td.result a")[0].text_content().strip()
             yield (headline, date, pagenr, section.strip(),
                    self.pdf_url.format(**locals()))
         if n == 0:
             break
Example #41
 def test_readdate(self):
     for s, date, american, lax in (
         ("22 maart 1980" , datetime.datetime(1980, 3, 22,0,0,0), False, True),
         ("22 mrt 1980" , datetime.datetime(1980, 3, 22,0,0,0), False, True),
         ("22/3/1980" , datetime.datetime(1980, 3, 22,0,0,0), False, True),
         ("1980-3-22" , datetime.datetime(1980, 3, 22,0,0,0), False, True),
         ("1980-3-22T01:00:05" , datetime.datetime(1980, 3, 22,1,0,5), False, True),
         ("1980-3-22 01:00" , datetime.datetime(1980, 3, 22,1,0,0), False, True),
         ("1980-3-22 01:00 PM" , datetime.datetime(1980, 3, 22,13,0,0), False, True),
         ("1980-3-22 01:00:00:00" , datetime.datetime(1980, 3, 22,0,0,0), False, True), #time->0
         ("1980-13-22 01:00:00:00" , None, False, True), # illegal date --> None
         ("1980-13-22 01:00:00" , ValueError, False, False), # illegal date --> Error
         ("1980-3-22 27:00:00" , ValueError, False, False), # illegal time --> Error
         ("1980-3-22 23:00:00:00" , ValueError, False, False), # illegal time --> Error
         ("Sun Sep 29 18:21:12 +0000 2013", datetime.datetime(2013,9,29,18,21,12), False, False), # twitter (??)
         ("1/1/98", datetime.datetime(1998, 1, 1,0,0,0), False, True),
         ("1/1/04", datetime.datetime(2004, 1, 1,0,0,0), False, True),
         ("31/12/72", datetime.datetime(1972, 12, 31,0,0,0), False, True),
         ("12/31/72", datetime.datetime(1972, 12, 31,0,0,0), True, True),
         ("1/2/1972", datetime.datetime(1972, 2, 1,0,0,0), False, True),
         ("1/2/1972", datetime.datetime(1972, 1, 2,0,0,0), True, True),
         ("1/2/1972", datetime.datetime(1972, 1, 2,0,0,0), True, True),
         ("30.09.2008", datetime.datetime(2008, 9, 30,0,0,0), False, False),
         ("31. Januar 2009", datetime.datetime(2009, 1, 31, 0, 0, 0), False, True),
         ("December 31, 2009 Thursday", datetime.datetime(2009, 12, 31, 0, 0, 0), False, False),
         (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30, 0, 0, 0), False, False),
         ('31. Maerz 2003', datetime.datetime(2003, 3, 31, 0, 0, 0), False, False),
         ('September 1, 2008 Monday 12:44 PM AEST', datetime.datetime(2008, 9, 1, 12, 44), False, False),
         ):
         if inspect.isclass(date) and issubclass(date, Exception):
             self.assertRaises(date, toolkit.readDate, s, lax=False, american=american)
         else:
             date2 = toolkit.readDate(s, lax=lax, american=american)
             self.assertEqual(date2, date)
    def get(self, page):
        medium = unicode(getm(page.props.mid)).lower()
        headline = page.props.headline

        date = toolkit.readDate(headline).strftime('%a %d %b %Y')
        sid = NAME_SERIES_MAP[medium]

        url = SEARCH_URL % (sid, urllib.quote(date))

        try:
            episode = EPISODE_RE.search(self.getdoc(url,
                                                    lxml=False)).groups(1)[0]
        except:
            #print(url)
            return []

        page.props.episode_url = urlparse.urljoin(SEARCH_URL,
                                                  '/afleveringen/%s' % episode)

        url = OEMBED_URL % urlparse.urljoin(SEARCH_URL,
                                            '/afleveringen/%s' % episode)
        page.props.embed_url = url

        #print(self.getdoc(url, lxml=False))

        page.props.embed_flash = OBJECT_RE.search(self.getdoc(
            url, lxml=False)).groups()[0]

        del page.props.headline
        del page.props.mid

        return [
            page,
        ]
Example #43
    def _get_units(self):
        """
        PhpBB forum scraper
        """
        index = self.getdoc(self.index_url)

        for cat_title, cat_doc in self.get_categories(index):
            for page in self.get_pages(cat_doc):
                for fbg in page.cssselect('.forumbg'):
                    for li in fbg.cssselect('.topics > li'):
                        url = urljoin(
                            self.index_url,
                            li.cssselect("a.topictitle")[0].get('href'))
                        _date = etree.tostring(
                            li.cssselect("dd.lastpost")[0]).split("br />")[1]
                        date = toolkit.readDate(_date)
                        yield {
                            'date':
                            date,
                            'object':
                            HTMLDocument(
                                headline=li.cssselect("a.topictitle")[0].text,
                                url=url,
                                category=cat_title)
                        }
Example #44
    def _scrape_unit(self, thread):
        fipo = True  # First post
        thread.doc = self.getdoc(thread.props.url)
        for page in self.get_pages(thread.doc):
            for post in page.cssselect('.post'):
                ca = thread if fipo else thread.copy(parent=thread)
                ca.props.date = toolkit.readDate(
                    post.cssselect('.author')[0].text_content()[-22:])
                ca.props.text = post.cssselect('.content')

                title = post.cssselect('.postbody h3 a')[0].text
                if fipo:
                    optitle = title
                if title:
                    ca.props.headline = title
                else:
                    ca.props.headline = 're: {}'.format(optitle)

                try:
                    ca.props.author = post.cssselect(
                        '.author strong')[0].text_content()
                except:
                    try:
                        ca.props.author = post.cssselect(
                            '.author a')[0].text_content()
                    except:
                        # Least reliable method
                        ca.props.author = post.cssselect(
                            '.author')[0].text_content().split()[0]

                yield ca

                fipo = False
Example #45
 def get_article(self, page):
     postinfo = page.doc.cssselect("div.postInfo")[0].text
     page.props.date = readDate(postinfo.split(" op ")[1].split(",")[0])
     page.props.headline = page.doc.cssselect("div.postInner h1")[0].text_content()
     page.props.text = page.doc.cssselect("div.postEntry")[0]
     page.props.author = postinfo.split(" op ")[0].split("Door")[1]
     return page
Example #46
def filters_from_form(form_data):
    if form_data.get('datetype') == 'on':
        d = readDate(form_data.get('on_date'))
        yield 'start_date', d.isoformat()
        yield 'end_date', (d + relativedelta(days=2)).isoformat()
    elif form_data.get('datetype') == 'between':
        yield 'start_date', form_data.get('start_date')
        yield 'end_date', form_data.get('end_date')
    elif form_data.get('datetype') == 'after':
        yield 'start_date', form_data.get('start_date')
    elif form_data.get('datetype') == 'before':
        yield 'end_date', form_data.get('end_date')
        
    
    for k in form_data.keys():
        if  k in FILTER_FIELDS:
            try:
                vals = form_data.getlist(k)
            except AttributeError:
                vals = form_data[k]
                # make sure vals is a list
                if isinstance(vals, (str, unicode)) or not isinstance(vals, collections.Iterable):
                    vals = [vals]
            vals = [_serialize(v) for v in vals if v]
            if vals:
                yield FILTER_FIELDS[k], vals
                
    if 'articlesets' not in form_data:
        # filter on all sets in project
        p = Project.objects.get(pk=form_data['projects'])
        sets = [s.id for s in p.all_articlesets()]
        yield "sets", sets
Example #47
    def _scrape_unit(self, thread):
        thread = thread['object']
        fipo = True # First post
        thread.doc = self.getdoc(thread.props.url)
        for page in self.get_pages(thread.doc):
            for post in page.cssselect('.post'):
                ca = thread if fipo else thread.copy(parent=thread)
                ca.props.date = toolkit.readDate(post.cssselect('.author')[0].text_content()[-22:])
                ca.props.text = post.cssselect('.content')
                
                title = unicode(post.cssselect('.postbody h3 a')[0].text)
                
                if fipo and title:
                    optitle = title
                elif fipo:
                    raise Exception("No op title found")
                if title:
                    ca.props.headline = title
                else:
                    ca.props.headline = 'Re: {}'.format( optitle )

                try:
                    ca.props.author = unicode(post.cssselect('.author strong')[0].text_content())
                except:
                    try:
                        ca.props.author = unicode(post.cssselect('.author a')[0].text_content())
                    except:
                        # Least reliable method
                        ca.props.author = unicode(post.cssselect('.author')[0].text_content().split()[0])

                yield ca

                fipo = False
Example #48
    def _scrape_unit(self, url):
        article = HTMLDocument(url=url)
        article.prepare(self)

        content = article.doc.cssselect("#content-column")[0]
        article.props.date = readDate(content.cssselect("p.article-meta")[0].text.split("|")[1])
        article.props.headline = content.cssselect("h1")[0].text
        
        for x in [
            content.cssselect("h1")[0],
            content.cssselect("p.article-meta")[0],
            content.cssselect("p.sharing")[0]
        ]:
            x.drop_tree()

        article.props.text = content.text_content()

        for block in article.doc.cssselect("#aside-column div.block"):
            title = block.cssselect("h2")[0].text
            if "Verantwoordelijk" in title and "ministerie" in title:
                article.props.author = "; ".join([a.text for a in block.cssselect("ul.list-common li a")])
                break
        
        try:
            if len(article.props.author) > 100:
                article.props.author = article.props.author[:100]
        except AttributeError:
            pass
        yield article
Example #49
def get_article_dict(art, sets=None):
    date = art.date
    if isinstance(art.date, (str, unicode)):
        date = toolkit.readDate(date)
    date = date.isoformat()

    d = dict(
        # dublin core elements
        id=art.id,
        headline=_clean(art.headline),
        text=_clean(art.text),
        date=date,
        creator=_clean(art.author),

        # other elements
        projectid=art.project_id,
        mediumid=art.medium_id,
        medium=art.medium.name,
        byline=_clean(art.byline),
        section=_clean(art.section),
        page=art.pagenr,
        addressee=_clean(art.addressee),
        length=art.length,
        sets=sets)

    d['hash'] = _get_hash(d)
    return d
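The string-date branch normalises through toolkit.readDate before calling isoformat(), so differently written but equivalent dates feed the same value into _get_hash. A small hedged check of that normalisation, assuming readDate parses both spellings as the tests above suggest:

from amcat.tools import toolkit  # assumed import path

# '2012-01-01' and '2012-1-1' should normalise to the same ISO string before hashing.
assert toolkit.readDate('2012-01-01').isoformat() == toolkit.readDate('2012-1-1').isoformat()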
Example #50
    def scrape_file(self, _html, t):
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
            
        for div in divs:
            article = HTMLDocument()
            article.props.html = div
            article.props.headline = div.cssselect("#articleTitle")[0].text_content()
            article.props.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.props.pagenr, article.props.section = self.get_pagenum(articlepage[0].text)

            if not div.cssselect("#sourceTitle")[0].text:
                article.props.medium = Medium.get_or_create("unknown medium")
            else:
                article.props.medium = Medium.get_or_create(div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.props.date = readDate(date_str)
            except ValueError:
                log.error("parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
    def scrape_2(self, _html):
        """New format as of 2014 and a few days before"""

        docdate = readDate(_html.cssselect("h1")[0].text.split("-")[1])

        #split body by <hr>
        items = []
        item = []
        tags = set()
        for child in _html.cssselect("body > *"):
            tags.add(child.tag)
            if child.tag == "hr":
                items.append(item)
                item = []
            else:
                item.append(child)

        #first item is the index
        items = items[1:]

        for item in items:
            article = self.parse_item(item)
            if not article.date:
                article.date = docdate
            yield article
Example #52
 def scrape_comments(self,page):
     for li in page.doc.cssselect("ul.uiList li.fbFeedbackPost"):
         comment = HTMLDocument(parent=page,url=page.url)
         comment.props.text = li.cssselect("div.postText")[0].text
         comment.props.author = li.cssselect("a.profileName")[0].text
         comment.props.date = readDate(li.cssselect("abbr.timestamp")[0].get('title'))
         yield comment
Example #53
    def _scrape_unit(self, page): 

        page.prepare(self)
        page.doc = self.getdoc(page.props.url)
        author = page.doc.cssselect("div.nieuws_box p")[2]
        for script in author.cssselect("script"):
            script.drop_tree()
        try:
            page.props.author = author.cssselect("a")[0].text
        except IndexError:
            page.props.author = author.text_content().split(":")[1].strip()
        if len(page.props.author) >=99:
            page.props.author="author protected"
        
        page.props.headline = page.doc.cssselect("#container_content div.content h2")[0].text
        page.props.text = page.doc.cssselect("div.nieuws_tekst")[0]
        info = page.doc.cssselect("div.nieuws_box p")
        for p in info:
            if "Plaatsingsdatum" in p.cssselect("b")[0].text:
                page.props.date = readDate(p.text_content().split(":")[1])
                break

            
        for comment in self.scrape_comments(page):
            comment.is_comment = True
            yield comment

        yield page
Example #54
 def scrape_comments(self, page):
     nxt = page.doc
     if len(nxt.cssselect("div.pages a.next")) >= 1:
         while len(nxt.cssselect("div.pages a.next")) >= 1:
             try:
                 nxt = self.getdoc(
                     nxt.cssselect("div.pages a.next")[0].get('href'))
             except ValueError:
                 nxt = self.getdoc(
                     urljoin(
                         INDEX_URL,
                         nxt.cssselect("div.pages a.next")[0].get('href')))
             for li in nxt.cssselect("ol.reacties li.hidenum"):
                 comment = HTMLDocument(parent=page)
                 if not ("<b>Reageer als eerste op dit bericht</b>"
                         in etree.tostring(li)
                         or "gebruiker verwijderd" in etree.tostring(li)):
                     try:
                         comment.props.text = li.cssselect(
                             "div.reactie-body")[0]
                         comment.props.author = li.cssselect(
                             "strong")[0].text
                         comment.props.date = readDate(
                             li.cssselect("span.tijdsverschil")[0].get(
                                 'publicationdate'))
                     except IndexError:
                         pass
                     else:
                         if comment.props.date.date(
                         ) == self.options['date']:
                             yield comment
     else:
         for li in nxt.cssselect("ol.reacties li.hidenum"):
             comment = HTMLDocument(parent=page)
             if not "<b>Reageer als eerste op dit bericht</b>" in etree.tostring(
                     li):
                 try:
                     comment.props.text = li.cssselect(
                         "div.reactie-body")[0]
                     comment.props.author = li.cssselect("strong")[0].text
                     comment.props.date = readDate(
                         li.cssselect("span.tijdsverschil")[0].get(
                             'publicationdate'))
                     if comment.props.date.date() == self.options['date']:
                         yield comment
                 except IndexError:
                     pass
Example #55
 def test_readdate(self):
     for s, date, american, lax in (
         ("22 maart 1980", datetime.datetime(1980, 3, 22, 0, 0,
                                             0), False, True),
         ("22 mrt 1980", datetime.datetime(1980, 3, 22, 0, 0,
                                           0), False, True),
         ("22/3/1980", datetime.datetime(1980, 3, 22, 0, 0,
                                         0), False, True),
         ("1980-3-22", datetime.datetime(1980, 3, 22, 0, 0,
                                         0), False, True),
         ("1980-3-22T01:00:05", datetime.datetime(1980, 3, 22, 1, 0,
                                                  5), False, True),
         ("1980-3-22 01:00", datetime.datetime(1980, 3, 22, 1, 0,
                                               0), False, True),
         ("1980-3-22 01:00 PM", datetime.datetime(1980, 3, 22, 13, 0,
                                                  0), False, True),
         ("1980-3-22 01:00:00:00", datetime.datetime(1980, 3, 22, 0, 0, 0),
          False, True),  #time->0
         ("1980-13-22 01:00:00:00", None, False,
          True),  # illegal date --> None
         ("1980-13-22 01:00:00", ValueError, False,
          False),  # illegal date --> Error
         ("1980-3-22 27:00:00", ValueError, False,
          False),  # illegal time --> Error
         ("1980-3-22 23:00:00:00", ValueError, False,
          False),  # illegal time --> Error
         ("Sun Sep 29 18:21:12 +0000 2013",
          datetime.datetime(2013, 9, 29, 18, 21,
                            12), False, False),  # twitter (??)
         ("1/1/98", datetime.datetime(1998, 1, 1, 0, 0, 0), False, True),
         ("1/1/04", datetime.datetime(2004, 1, 1, 0, 0, 0), False, True),
         ("31/12/72", datetime.datetime(1972, 12, 31, 0, 0,
                                        0), False, True),
         ("12/31/72", datetime.datetime(1972, 12, 31, 0, 0, 0), True, True),
         ("1/2/1972", datetime.datetime(1972, 2, 1, 0, 0, 0), False, True),
         ("1/2/1972", datetime.datetime(1972, 1, 2, 0, 0, 0), True, True),
         ("1/2/1972", datetime.datetime(1972, 1, 2, 0, 0, 0), True, True),
         ("30.09.2008", datetime.datetime(2008, 9, 30, 0, 0,
                                          0), False, False),
         ("31. Januar 2009", datetime.datetime(2009, 1, 31, 0, 0,
                                               0), False, True),
         ("December 31, 2009 Thursday",
          datetime.datetime(2009, 12, 31, 0, 0, 0), False, False),
         (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30, 0, 0,
                                                0), False, False),
         ('31. Maerz 2003', datetime.datetime(2003, 3, 31, 0, 0,
                                              0), False, False),
         ('September 1, 2008 Monday 12:44 PM AEST',
          datetime.datetime(2008, 9, 1, 12, 44), False, False),
     ):
         if inspect.isclass(date) and issubclass(date, Exception):
             self.assertRaises(date,
                               toolkit.readDate,
                               s,
                               lax=False,
                               american=american)
         else:
             date2 = toolkit.readDate(s, lax=lax, american=american)
             self.assertEqual(date2, date)
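Read as documentation, the table of cases above implies the calling pattern below; a hedged sketch, assuming the lax and american keywords behave exactly as the test exercises them:

import datetime
from amcat.tools import toolkit  # assumed import path

# lax=True turns an impossible date into None ...
assert toolkit.readDate("1980-13-22 01:00:00:00", lax=True, american=False) is None
# ... while lax=False raises ValueError instead.
try:
    toolkit.readDate("1980-3-22 27:00:00", lax=False, american=False)
except ValueError:
    pass
# american=True reads ambiguous slashed dates as month/day, american=False as day/month.
assert toolkit.readDate("1/2/1972", lax=True, american=True) == datetime.datetime(1972, 1, 2)
assert toolkit.readDate("1/2/1972", lax=True, american=False) == datetime.datetime(1972, 2, 1)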
Example #56
 def _get_units(self):
     for page in self.get_pages():
         for post in page.cssselect("div.post"):
             article = HTMLDocument(
                 url = post.cssselect("span.h3 a")[0].get('href'),
                 headline = post.cssselect("span.h3")[0].text_content().strip(),
                 date = readDate(post.cssselect("span.comments span")[0].text.replace(".","-").split(" ")[1]))
             yield article
Example #57
 def get_comments(self, page):
     for li in page.doc.cssselect("#detail_reactions #reaction ul.clear li"):
         comment = HTMLDocument()
         comment.props.author = li.cssselect("cite")[0].text.strip()
         comment.props.text = li.cssselect("blockquote")[0]
         comment.props.date = readDate(li.cssselect("span.time")[0].text)
         comment.parent = page
         yield comment
Example #58
 def get_comments(self,page):
     for article in page.doc.cssselect("#comments article"):
         comment = HTMLDocument(parent = page)
         footer = article.cssselect("footer")[0].text_content().split(" | ")
         comment.props.date = readDate(footer[1])
         comment.props.author = footer[0]
         comment.props.text = article.cssselect("p")
         yield comment
Example #59
 def get_article(self, page):
     page.props.date = readDate(page.doc.cssselect("#pt1")[0].text_content())
     page.props.author = page.doc.cssselect("span.post_sub a.username")[0].text
     page.props.headline = page.doc.cssselect("div.fieldholder h1")[0].text_content()
     
     page.props.text = page.doc.cssselect("div.postmain_right")[0]
     page.coords=''
     return page