def _is_date(string):
    try:
        toolkit.readDate(string)
    except ValueError:
        return False
    return True
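# Hedged usage sketch (not part of the original source): assuming toolkit.readDate
# raises ValueError on strings it cannot parse, as the except clause above implies,
# _is_date can be used to filter candidate strings before further processing.
# The helper name below is hypothetical.
def _keep_parseable_dates(strings):
    # keep only the strings that readDate accepts as a date
    return [s for s in strings if _is_date(s)]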
def get_comments(self, page):
    for li in page.doc.cssselect("ul.commentlist li.comment"):
        comment = HTMLDocument()
        comment.parent = page
        try:
            dateauthor = li.cssselect("div.commentsbox")[0].text_content()
        except IndexError:
            comment.props.author = li.text_content().split(":")[0]
            comment.props.date = readDate(":".join(li.text_content().split(":")[1:2]))
            try:
                comment.props.text = li.cssselect("div.comment-text-reply")[0]
            except UnicodeDecodeError:
                continue
        else:
            comment.props.author = dateauthor.split("Geplaatst door")[1].split(" op ")[0]
            try:
                li.cssselect("div.commentsbox a")[0].drop_tree()
            except:
                pass
            comment.props.date = readDate(dateauthor.split(" op ")[1])
            try:
                comment.props.text = li.cssselect("div.comment-text")[0]
            except UnicodeDecodeError:
                continue
        yield comment
def find_start(self, n_articles):
    """Intelligently find the page at which the articles are for the given date, saves hours"""
    jump_distance = n_articles / 4.
    index = n_articles / 2
    offset = int(math.ceil((index) / 10) * 10)
    #find an article with the right date
    while True:
        offset = int(math.ceil(index / 10) * 10)
        docs = self.getresponse(offset)["docs"]
        dates = [readDate(d["date"]).date() for d in docs]
        if self.options['date'] in dates:
            break
        elif self.options['date'] > dates[0]:
            index -= jump_distance
        elif self.options['date'] < dates[0]:
            index += jump_distance
        if jump_distance < 10:
            return 0
        jump_distance /= 2.
    #go back to first occurrence
    i = 0
    while self.options['date'] in dates:
        i += 1
        offset -= 10 * i
        if offset < 0:
            return 0
        docs = self.getresponse(offset)["docs"]
        dates = [readDate(d["date"]).date() for d in docs]
    return offset
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    self.set_up()
    p = amcattest.create_test_project(owner=self.user)
    s = amcattest.create_test_set(project=p)
    a = {
        'date': datetime.datetime.now().isoformat(),
        'headline': 'Test child',
        'medium': 'Fantasy',
        'text': 'Hello Universe',
        'pagenr': 1,
        'url': 'http://example.org',
        'uuid': 'c691fadf-3c45-4ed6-93fe-f035b5f500af',
    }
    url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(**locals())
    self.post(url, a, self.user)
    amcates.ES().flush()
    res = self.get(url)["results"]
    self.assertEqual(len(res), 1)
    self.assertEqual(res[0]["headline"], a['headline'])
    self.assertEqual(toolkit.readDate(res[0]["date"]), toolkit.readDate(a['date']))
    self.assertEqual(res[0]["uuid"], a['uuid'])
def parse_item(self, item):
    #item: a list of html tags
    article = HTMLDocument()
    for tag in item:
        if tag.tag == "p":
            if hasattr(article.props, 'text'):
                article.props.text.append(tag)
            else:
                article.props.text = [tag]
        elif tag.tag == "h2":
            article.props.headline = tag.text
        elif tag.tag == "i":
            bits = tag.text.split()
            if not bits:
                return  # empty clipping ("knipsel")
            if "-" in bits[-1]:
                try:
                    article.props.date = readDate(bits[-1])
                except ValueError:
                    article.props.date = None
                article.props.medium = self.get_medium(" ".join(bits[:-1]))
            elif bits[-1].isdigit():
                try:
                    article.props.date = readDate(" ".join(bits[-3:]))
                except ValueError:
                    article.props.date = None
                article.props.medium = self.get_medium(" ".join(bits[:-3]))
            else:
                article.props.medium = self.get_medium(" ".join(bits))
                article.props.date = None
    return article
def _scrape_unit(self, li):
    a = li.cssselect("li > a")[0]
    article = HTMLDocument(url=urljoin(self.index_url, a.get('href')))
    article.props.headline = a.text
    article.props.kicker = li.cssselect("div.infoboard a.kicker")[0].text
    article.props.intro = li.cssselect("p")
    article.props.date = readDate(li.cssselect("div.infoboard span.time")[0].text_content())
    article.prepare(self)
    articletime = article.doc.cssselect("p.articletime")[0].text_content()
    if len(articletime.split("|")) > 2:
        article.props.date = readDate(" ".join(articletime.split("|")[:-1]))
        article.props.author = articletime.split("|")[-1]
    else:
        article.props.author = articletime.strip()
    if " Korrespondent" in article.props.author:
        article.props.author = article.props.author.split("Korrespondent")[1].strip()
    for ad in article.doc.cssselect("div.noprint"):
        ad.drop_tree()
    article.props.text = article.doc.cssselect("p.articlelead, #articletext")
    article.props.section = article.doc.cssselect("div.headtop span.sitetop")[0].text_content()
    yield article
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    a = test_article()
    res = self._post_articles(a)
    self.assertEqual(set(res.keys()), {'id'})  # POST should only return IDs
    res = self._get_article(aid=res['id'])
    self.assertEqual(res["headline"], a['headline'])
    self.assertEqual(toolkit.readDate(res["date"]), toolkit.readDate(a['date']))
    self.assertNotIn("text", res.keys())
    self.assertIsNotNone(res["uuid"])
    res = self._get_article(aid=res['id'], text=True)
    self.assertEqual(res["text"], a['text'])
    res = self._get_articles()["results"]
    self.assertEqual(len(res), 1)
    # can we post explicit UUID?
    self.setUp_set()
    a['uuid'] = str(uuid4())
    self._post_articles(a)
    res = self._get_articles()["results"]
    self.assertEqual(res[0]["uuid"], a['uuid'])
def test_get(self):
    p1 = amcattest.create_test_project(name="testnaam", description="testdescription", insert_date='2012-01-01')
    actual = self.get(ProjectResource, id=p1.id)
    actual_results = actual.pop("results")
    self.assertEqual(len(actual_results), 1)
    actual_results = actual_results[0]
    date = actual_results.pop('insert_date')
    readDate(date)  # check valid date, not much more to check here?
    expected_results = {
        u'insert_user': p1.insert_user.id,
        u'description': 'testdescription',
        u'name': u'testnaam',
        u'guest_role': 11,
        u'owner': p1.owner.id,
        u'active': True,
        u'id': p1.id,
        u'favourite': False,
    }
    expected_meta = {
        u'page': 1,
        u'next': None,
        u'previous': None,
        u'per_page': 10,
        u'total': 1,
        u'pages': 1,
        u'echo': None,
    }
    self.assertDictsEqual(actual, expected_meta)
    self.assertDictsEqual(actual_results, expected_results)
def _scrape_unit(self, page):
    page.prepare(self)
    if page.doc.cssselect("form#_caps_form"):
        return
    header = page.doc.cssselect("div.time_post")[0].text_content()
    pattern = re.compile(r'(Bewerkt door:)?([a-zA-Z0-9 ]+)?(\u2212)?\n((\d{2,2}/){2,2}\d{2,2}), \d{2,2}:\d{2,2}\n(\xa0\u2212\xa0bron: ([A-Za-z0-9 ,]+))?')
    try:
        groups = pattern.search(header).groups()
    except AttributeError:
        #rare error where regex fails
        page.props.date = readDate(header)
    else:
        page.props.date = readDate(groups[3])
        if groups[0] or (not groups[1]):
            page.props.author = groups[-1]
        elif groups[1]:
            page.props.author = groups[1]
    if not hasattr(page.props, "author") and page.doc.cssselect("span.author"):
        page.props.author = page.doc.cssselect("span.author")[0].text_content()
    if hasattr(page.props, "author"):
        if page.props.author:
            page.props.author = page.props.author[:98]
    page.props.text = page.doc.cssselect("#art_box2 p")
    page.props.html = html.tostring(page.doc)
    try:
        page.props.section = page.doc.cssselect("#subnav_nieuws li span.nieuws")[0].text_content()
    except IndexError:
        if page.doc.cssselect("div.dos_default h2"):
            page.props.section = "dossier: {}".format(page.doc.cssselect("div.dos_default h2")[0].text)
    yield page
def test_dates(self):
    """Test whether date deserialization works, see #66"""
    for d in ('2001-01-01', '1992-12-31T23:59', '2012-02-29T12:34:56.789', datetime.datetime.now()):
        a = amcattest.create_test_article(date=d)
        amcates.ES().flush()
        res = self.get("/api/v4/search", ids=a.id)
        self.assertEqual(toolkit.readDate(res['results'][0]['date']), toolkit.readDate(str(d)))
def parse_dateline(self, text, article):
    bits = text.split()
    if "-" in bits[-1]:
        article.date = readDate(bits[-1])
        article.medium = self.get_medium(" ".join(bits[:-1]))
    elif bits[-1].isdigit() and bits[-3].isdigit():
        article.date = readDate(" ".join(bits[-3:]))
        article.medium = self.get_medium(" ".join(bits[:-3]))
    else:
        article.medium = self.get_medium(" ".join(bits))
        article.date = None
    return article
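# Hedged illustration (hypothetical dateline strings, not from the original source):
# the three branches of parse_dateline above correspond to inputs shaped like these.
EXAMPLE_DATELINES = (
    "Some Newspaper 22-3-1980",      # "-" in the last token: parse it as the date, the rest is the medium
    "Some Newspaper 22 maart 1980",  # last and third-to-last tokens are digits: last three tokens form the date
    "Some Newspaper",                # no recognizable date: medium only, article.date is set to None
)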
def get_comments(self, doc):
    for div in doc.cssselect("#commentsList div.topDivider"):
        comment = HTMLDocument()
        comment.props.text = div.cssselect("div.wordBreak")[0]
        spans = div.cssselect("div.fBold span")
        try:
            comment.props.date = readDate(spans[1].text_content().split(" ")[1])
        except ValueError:
            comment.props.date = readDate(spans[1].text_content())
        comment.props.author = spans[0].text_content().strip()
        comment.props.url = doc.url
        yield comment
def parse_dateline(self, text, article):
    bits = text.split()
    if "-" in bits[-1]:
        article.date = readDate(bits[-1])
        article.medium = self.get_medium(" ".join(bits[:-1]))
    elif bits[-1].isdigit():
        article.date = readDate(" ".join(bits[-3:]))
        article.medium = self.get_medium(" ".join(bits[:-3]))
    else:
        article.medium = self.get_medium(" ".join(bits))
        article.date = None
    return article
def get_comments(self, doc):
    for div in doc.cssselect("#commentsList div.topDivider"):
        comment = HTMLDocument()
        comment.props.text = div.cssselect("div.wordBreak")[0]
        spans = div.cssselect("div.fBold span")
        try:
            comment.props.date = readDate(spans[1].text_content().split(" ")[1])
        except ValueError:
            comment.props.date = readDate(spans[1].text_content())
        comment.props.author = spans[0].text_content().strip()
        yield comment
def _extract(self, doc):
    #get articles from section page. return False if out of date bounds
    for li in doc.cssselect("#content ul li"):
        if "short-news" in doc.url:
            url = li.cssselect("div.text-holder a")[0].get('href')
            date = readDate(self.getdoc(url).cssselect("#content em.date a")[0].text)
        else:
            url = li.cssselect("div.heading a")[0].get('href')
            date = readDate(li.cssselect("em.date a")[0].text)
        if date.date() < self.options['date']:
            yield False
        if date.date() == self.options['date']:
            yield url
def _get_units(self):
    initial_url = self.search_url.format(p=1)
    initial_doc = self.getdoc(initial_url)
    dates = [
        readDate(article.cssselect("span.date")[0].text).date()
        for article in initial_doc.cssselect("div.subarticle")
    ]
    self.maxdate = max(dates)
    n_results = int(initial_doc.cssselect("#searchlist header h1")[0].text.strip().split(" ")[-1])
    for page in self.pinpoint_pages(n_results):
        for div in page.cssselect("div.subarticle"):
            date = readDate(div.cssselect("span.date")[0].text).date()
            if date == self.options["date"]:
                url = div.cssselect("h2 a")[0].get("href")
                yield url
def get_article_dict(art, sets=None):
    date = art.date
    if date:
        if isinstance(art.date, (str, unicode)):
            date = toolkit.readDate(date)
        date = date.isoformat()
    d = dict(
        # dublin core elements
        id=art.id,
        headline=_clean(art.headline),
        text=_clean(art.text),
        date=date,
        creator=_clean(art.author),
        # other elements
        projectid=art.project_id,
        mediumid=art.medium_id,
        medium=art.medium.name,
        byline=_clean(art.byline),
        section=_clean(art.section),
        page=art.pagenr,
        addressee=_clean(art.addressee),
        length=art.length,
        sets=sets,
    )
    d['hash'] = _get_hash(d)
    return d
def _scrape_unit(self, topic_url):
    #navigate to last page, then navigate back until comments are no longer recent
    doc = self.getdoc(topic_url)
    headline = "".join(doc.cssselect("title")[0].text_content().split("-")[:-1])
    topic_date = readDate(doc.cssselect("span#pt1")[0].text_content().strip())
    try:
        parent = Article.objects.get(headline=headline, date=topic_date)
    except Article.MultipleObjectsReturned:
        #duplicate in 99.99% of the cases
        parents = Article.objects.filter(headline=headline, date=topic_date)
        min_id = min([parent.id for parent in parents])  #deduplicate usually keeps the lowest id
        parent = parents.get(pk=min_id)
    except Article.DoesNotExist:
        parent = HTMLDocument(url=topic_url)
        parent.props.headline = headline
        parent.props.date = topic_date
        parent.props.text = doc.cssselect("div.postmain_right")[0]
        parent.props.author = doc.cssselect("span.post_sub a.username")[0].text_content().strip()
        parent.props.section = self.current_section
    for post in self.get_posts(doc):
        post.props.parent = parent
        post.props.url = hasattr(parent, 'props') and parent.props.url or parent.url
        yield post
    if isinstance(parent, Document):
        yield parent
def _scrape_unit(self, article_id):
    article = HTMLDocument(url=self.article_url.format(**locals()))
    article.prepare(self)
    article.props.text = article.doc.cssselect("font.artbody")
    if len("".join([t.text_content() for t in article.props.text])) < 100:
        return
    for i, table in enumerate(article.doc.cssselect("table")):
        if table.get('class') == "body":
            table_after_body = article.doc.cssselect("table")[i + 1]
            page_date = re.search(
                "Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
                table_after_body.text_content())
            article.props.pagenr = page_date.group(1)
            article.props.date = readDate(page_date.group(2))
    article.props.section = self.current_section
    article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
    if article.doc.cssselect(".artsubheader"):
        article.props.byline = article.doc.cssselect(".artsubheader")[0]
    if article.doc.cssselect("td.artauthor"):
        article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
    dateline_match = re.search(
        "^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
        "\n".join([n.text_content() for n in article.props.text]).strip())
    if dateline_match:
        article.props.dateline = dateline_match.group(1)
    yield article
def _scrape_unit(self, unit):
    url, section = unit
    if not section:
        section = url.split("/")[3]
    doc = self.getdoc(url)
    try:
        headline = doc.cssselect("#artikel h1")[0].text_content()
    except IndexError:
        return  #no headline, no article
    article_dict = {
        'url': url,
        'text': doc.cssselect("#broodtekst")[0],
        'headline': headline,
        'section': section,
        'author': doc.cssselect("div.author") and doc.cssselect("div.author a")[0].text or None,
        'date': readDate(doc.cssselect("#midden time")[0].get('datetime')),
        'children': []
    }
    article = HTMLDocument(**article_dict)
    article.props.html = html.tostring(doc)
    yield article
    for c in self.get_comments(article):
        c.is_comment = True
        c.parent = article
        yield c
def scrape_file(self, _html, t):
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    for div in divs:
        article = HTMLDocument()
        article.props.html = div
        article.props.headline = div.cssselect("#articleTitle")[0].text_content()
        article.props.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.props.pagenr, article.props.section = self.get_pagenum(articlepage[0].text)
        if not div.cssselect("#sourceTitle")[0].text:
            article.props.medium = Medium.get_or_create("unknown medium")
        else:
            article.props.medium = Medium.get_or_create(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.props.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def _get_units(self):
    for section in self.sections:
        page = 1
        url = self.page_url.format(**locals())
        date = _date.today()
        ipage = self.getdoc(url)
        while date >= self.options['date']:
            if not ipage.cssselect("#main ul.snelnieuws_list li.item"):
                print("\nNo articles found as far back as given date\n")
                break
            for unit in ipage.cssselect('#main ul.snelnieuws_list li.item'):
                href = unit.cssselect('a')[0].get('href')
                article = HTMLDocument(url=href)
                article.prepare(self)
                try:
                    date = readDate(article.doc.cssselect("span.datum")[0].text).date()
                except IndexError:
                    continue
                if date == self.options['date']:
                    yield article
                elif date < self.options['date']:
                    break
            page += 1
            nxt_url = self.page_url.format(**locals())
            ipage = self.getdoc(nxt_url)
def _get_units(self):
    self.open("http://www.powned.tv")
    self.open("http://cookies.publiekeomroep.nl/accept/")
    d = self.options['date']
    docs = []
    for x in range(d.day - 7, d.day + 7):
        archive_url = ARCHIVE_URL.format(**locals())
        try:
            doc = self.getdoc(archive_url)
        except HTTPError:
            pass
        else:
            docs.append(doc)
    entries = set([])
    for doc in docs:
        for li in doc.cssselect("ul.articlelist li"):
            _date = readDate(" ".join(li.cssselect("span.t")[0].text.split()[:2]) + " " + str(self.options['date'].year)).date()
            url = urljoin(archive_url, li.cssselect("a")[0].get('href'))
            entries.add((_date, url))
    for _date, url in entries:
        if _date == self.options['date']:
            article = HTMLDocument(date=_date, url=url)
            yield article
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    for div in divs:
        article = Article(metastring={})
        article.metastring['html'] = div
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0]
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text)
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text)
        date_str = div.cssselect("#articleDate")[0].text
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def get_article(self, page):
    span = page.doc.cssselect("#detail_content span.author")[0]
    page.props.date = readDate(tostring(span).split("<br/>")[1])
    try:
        page.props.author = span.cssselect("a")[0].text
    except IndexError:
        try:
            page.props.author = tostring(span).split("<br/>")[0].split("oor:")[1].strip()[0:98]
        except IndexError:
            page.props.author = "unknown"
    try:
        page.props.source = tostring(span).split("<br/>")[1].split("bron:")[1]
    except IndexError:
        pass
    page.props.headline = page.doc.cssselect("h1")[0].text
    try:
        page.props.text = [
            page.doc.cssselect("#detail_content p.intro")[0],
            page.doc.cssselect("section.clear")[0]
        ]
    except IndexError:
        page.props.text = page.doc.cssselect("#detail_content")[0]
    return page
def _scrape_unit(self, page):
    page.prepare(self)
    page.doc = self.getdoc(page.props.url)
    author = page.doc.cssselect("div.nieuws_box p")[2]
    for script in author.cssselect("script"):
        script.drop_tree()
    try:
        page.props.author = author.cssselect("a")[0].text
    except IndexError:
        page.props.author = author.text_content().split(":")[1].strip()
    if len(page.props.author) >= 99:
        page.props.author = "author protected"
    page.props.headline = page.doc.cssselect("#container_content div.content h2")[0].text
    page.props.text = page.doc.cssselect("div.nieuws_tekst")[0]
    info = page.doc.cssselect("div.nieuws_box p")
    for p in info:
        if "Plaatsingsdatum" in p.cssselect("b")[0].text:
            page.props.date = readDate(p.text_content().split(":")[1])
            break
    for comment in self.scrape_comments(page):
        comment.is_comment = True
        yield comment
    yield page
def scrape_media(self, doc, _type):
    scrn = HTMLDocument()
    scrn.doc = doc
    try:
        scrn.props.text = scrn.doc.cssselect("div.mediaDescription")[0]
    except IndexError:
        scrn.props.text = "none"
    try:
        scrn.props.headline = "{} {}".format(scrn.doc.cssselect("div.screenshotAppName")[0].text, _type)
    except IndexError:
        scrn.props.headline = "unknown"
    author_url = "/".join(scrn.doc.cssselect("div.linkAuthor a")[0].get('href').split("/")[:-2])
    scrn = self.get_author_props(scrn, author_url)
    for obj in scrn.doc.cssselect("div.rightDetailsBlock div.detailsStatRight"):
        try:
            scrn.props.date = readDate(obj.text)
        except ValueError:
            continue
        else:
            break
    if not scrn.doc.cssselect("div.commentthread_paging"):
        yield scrn
        return
    if not scrn.doc.cssselect("div.commentthread_header div.commentthread_paging span")[1].text_content():
        for comment in self.scrape_comments(scrn):
            yield comment
    else:
        raise NotImplementedError
    yield scrn
def _scrape_unit(self, article):
    article.prepare(self)
    article.props.date = readDate(article.doc.cssselect("#datetime")[0].text_content())
    article.props.section = " > ".join(article.props.url.split("/")[4:-1])
    article.props.headline = article.doc.cssselect("#headline")[0].text_content().strip()
    article.props.text = article.doc.cssselect("#teaser") + article.doc.cssselect("#main > p")
    yield article
def scrape_1(self, _html, t):
    """format of mostly 2013"""
    if "werkmap" in t:
        divs = _html.cssselect("#articleTable div")
    elif "intranet/rss" in t:
        divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
    else:
        raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")
    for div in divs:
        article = Article(metastring=div.text_content())
        article.headline = div.cssselect("#articleTitle")[0].text_content()
        article.text = div.cssselect("#articleIntro")[0].text_content()
        articlepage = div.cssselect("#articlePage")
        if articlepage:
            article.pagenr, article.section = self.get_pagenum(articlepage[0].text_content())
        article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text_content())
        date_str = div.cssselect("#articleDate")[0].text_content()
        try:
            article.date = readDate(date_str)
        except ValueError:
            log.error("parsing date \"{date_str}\" failed".format(**locals()))
        else:
            yield article
def get(self, page):
    medium = unicode(getm(page.props.mid)).lower()
    headline = page.props.headline
    date = toolkit.readDate(headline).strftime('%a %d %b %Y')
    sid = NAME_SERIES_MAP[medium]
    url = SEARCH_URL % (sid, urllib.quote(date))
    try:
        episode = EPISODE_RE.search(self.getdoc(url, lxml=False)).groups(1)[0]
    except:
        #print(url)
        return []
    page.props.episode_url = urlparse.urljoin(SEARCH_URL, '/afleveringen/%s' % episode)
    url = OEMBED_URL % urlparse.urljoin(SEARCH_URL, '/afleveringen/%s' % episode)
    page.props.embed_url = url
    #print(self.getdoc(url, lxml=False))
    page.props.embed_flash = OBJECT_RE.search(self.getdoc(url, lxml=False)).groups()[0]
    del page.props.headline
    del page.props.mid
    return [page]
def filters_from_form(form_data):
    if form_data.get('datetype') == 'on':
        d = readDate(form_data.get('on_date'))
        yield 'start_date', d.isoformat()
        yield 'end_date', (d + relativedelta(days=1)).isoformat()
    elif form_data.get('datetype') == 'between':
        yield 'start_date', form_data.get('start_date')
        yield 'end_date', form_data.get('end_date')
    elif form_data.get('datetype') == 'after':
        yield 'start_date', form_data.get('start_date')
    elif form_data.get('datetype') == 'before':
        yield 'end_date', form_data.get('end_date')
    for k in form_data.keys():
        if k in FILTER_FIELDS:
            try:
                vals = form_data.getlist(k)
            except AttributeError:
                vals = form_data[k]
            # make sure vals is a list
            if isinstance(vals, (str, unicode)) or not isinstance(vals, collections.Iterable):
                vals = [vals]
            vals = [_serialize(v) for v in vals if v]
            if vals:
                yield FILTER_FIELDS[k], vals
    if 'articlesets' not in form_data:
        # filter on all sets in project
        p = Project.objects.get(pk=form_data['projects'])
        sets = [s.id for s in p.all_articlesets()]
        yield "sets", sets
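# Hedged usage sketch (not part of the original source): filters_from_form yields
# (field, value) pairs, so a caller can materialize them into a single mapping.
# The helper name below is hypothetical.
def build_filter_dict(form_data):
    # collect the yielded pairs into the filter mapping handed to the search backend
    return dict(filters_from_form(form_data))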
def scrape_2(self, _html):
    """New format as of 2014 and a few days before"""
    title = _html.cssselect("h1")[0]
    if not title.text:
        title = title.cssselect("span")[0]
    docdate = readDate(title.text.split("-")[1])
    # split body by <hr>
    items = []
    item = []
    if len(_html.cssselect("body > hr")) == 0:
        # select MS Word div wrapper
        tags = _html.cssselect("body > div.WordSection1 > *")
        if len(tags) == 0:
            raise ParseError("Document format is not supported")
    else:
        tags = _html.cssselect("body > *")
    for child in tags:
        if child.tag == "hr" or (child.tag == "div" and child.cssselect("span > hr")):
            items.append(item)
            item = []
        else:
            item.append(child)
    # first item is the index
    items = items[1:]
    for item in items:
        article = self.parse_item(item)
        if not article.date:
            article.date = docdate
        yield article
def scrape_2(self, _html):
    """New format as of 2014 and a few days before"""
    docdate = readDate(_html.cssselect("h1")[0].text.split("-")[1])
    #split body by <hr>
    items = []
    item = []
    if len(_html.cssselect("body > *")) == 1:
        tags = _html.cssselect("body > div > *")  #extra div wrapper as of 2014-04-08
    else:
        tags = _html.cssselect("body > *")
    for child in tags:
        if child.tag == "hr":
            items.append(item)
            item = []
        else:
            item.append(child)
    #first item is the index
    items = items[1:]
    for item in items:
        article = self.parse_item(item)
        if not article.date:
            article.date = docdate
        yield article
def _get_units(self):
    for page in self.search_result_pages():
        n = 0
        for table in page.cssselect("#containerContent table"):
            try:
                onclick = table.cssselect("td.result a")[0].get('onclick')
            except IndexError:
                continue
            article_id = onclick.split("('")[1].split("',")[0]
            try:
                right_td = [td for td in table.cssselect("td") if td.get('align') == 'right'][0]
                date = readDate(right_td.text_content())
            except IndexError:
                continue
            n += 1
            footer = table.cssselect("span i nobr")[0].text_content()
            pagenr_section_pattern = re.compile(
                "\({self.paper_full_name} +([a-zA-Z ]+) +, blz ([0-9]+)\)".format(**locals()))
            section, pagenr = pagenr_section_pattern.search(footer).groups()
            headline = table.cssselect("td.result a")[0].text_content().strip()
            yield (headline, date, pagenr, section.strip(), self.pdf_url.format(**locals()))
        if n == 0:
            break
def test_readdate(self):
    for s, date, american, lax in (
        ("22 maart 1980", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("22 mrt 1980", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("22/3/1980", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("1980-3-22", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),
        ("1980-3-22T01:00:05", datetime.datetime(1980, 3, 22, 1, 0, 5), False, True),
        ("1980-3-22 01:00", datetime.datetime(1980, 3, 22, 1, 0, 0), False, True),
        ("1980-3-22 01:00 PM", datetime.datetime(1980, 3, 22, 13, 0, 0), False, True),
        ("1980-3-22 01:00:00:00", datetime.datetime(1980, 3, 22, 0, 0, 0), False, True),  #time->0
        ("1980-13-22 01:00:00:00", None, False, True),  # illegal date --> None
        ("1980-13-22 01:00:00", ValueError, False, False),  # illegal date --> Error
        ("1980-3-22 27:00:00", ValueError, False, False),  # illegal time --> Error
        ("1980-3-22 23:00:00:00", ValueError, False, False),  # illegal time --> Error
        ("Sun Sep 29 18:21:12 +0000 2013", datetime.datetime(2013, 9, 29, 18, 21, 12), False, False),  # twitter (??)
        ("1/1/98", datetime.datetime(1998, 1, 1, 0, 0, 0), False, True),
        ("1/1/04", datetime.datetime(2004, 1, 1, 0, 0, 0), False, True),
        ("31/12/72", datetime.datetime(1972, 12, 31, 0, 0, 0), False, True),
        ("12/31/72", datetime.datetime(1972, 12, 31, 0, 0, 0), True, True),
        ("1/2/1972", datetime.datetime(1972, 2, 1, 0, 0, 0), False, True),
        ("1/2/1972", datetime.datetime(1972, 1, 2, 0, 0, 0), True, True),
        ("1/2/1972", datetime.datetime(1972, 1, 2, 0, 0, 0), True, True),
        ("30.09.2008", datetime.datetime(2008, 9, 30, 0, 0, 0), False, False),
        ("31. Januar 2009", datetime.datetime(2009, 1, 31, 0, 0, 0), False, True),
        ("December 31, 2009 Thursday", datetime.datetime(2009, 12, 31, 0, 0, 0), False, False),
        (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30, 0, 0, 0), False, False),
        ('31. Maerz 2003', datetime.datetime(2003, 3, 31, 0, 0, 0), False, False),
        ('September 1, 2008 Monday 12:44 PM AEST', datetime.datetime(2008, 9, 1, 12, 44), False, False),
    ):
        if inspect.isclass(date) and issubclass(date, Exception):
            self.assertRaises(date, toolkit.readDate, s, lax=False, american=american)
        else:
            date2 = toolkit.readDate(s, lax=lax, american=american)
            self.assertEqual(date2, date)
def _get_units(self):
    """
    PhpBB forum scraper
    """
    index = self.getdoc(self.index_url)
    for cat_title, cat_doc in self.get_categories(index):
        for page in self.get_pages(cat_doc):
            for fbg in page.cssselect('.forumbg'):
                for li in fbg.cssselect('.topics > li'):
                    url = urljoin(self.index_url, li.cssselect("a.topictitle")[0].get('href'))
                    _date = etree.tostring(li.cssselect("dd.lastpost")[0]).split("br />")[1]
                    date = toolkit.readDate(_date)
                    yield {
                        'date': date,
                        'object': HTMLDocument(
                            headline=li.cssselect("a.topictitle")[0].text,
                            url=url,
                            category=cat_title)
                    }
def _scrape_unit(self, thread):
    fipo = True  # First post
    thread.doc = self.getdoc(thread.props.url)
    for page in self.get_pages(thread.doc):
        for post in page.cssselect('.post'):
            ca = thread if fipo else thread.copy(parent=thread)
            ca.props.date = toolkit.readDate(post.cssselect('.author')[0].text_content()[-22:])
            ca.props.text = post.cssselect('.content')
            title = post.cssselect('.postbody h3 a')[0].text
            if fipo:
                optitle = title
            if title:
                ca.props.headline = title
            else:
                ca.props.headline = 're: {}'.format(optitle)
            try:
                ca.props.author = post.cssselect('.author strong')[0].text_content()
            except:
                try:
                    ca.props.author = post.cssselect('.author a')[0].text_content()
                except:
                    # Least reliable method
                    ca.props.author = post.cssselect('.author')[0].text_content().split()[0]
            yield ca
            fipo = False
def get_article(self, page):
    postinfo = page.doc.cssselect("div.postInfo")[0].text
    page.props.date = readDate(postinfo.split(" op ")[1].split(",")[0])
    page.props.headline = page.doc.cssselect("div.postInner h1")[0].text_content()
    page.props.text = page.doc.cssselect("div.postEntry")[0]
    page.props.author = postinfo.split(" op ")[0].split("Door")[1]
    return page
def filters_from_form(form_data):
    if form_data.get('datetype') == 'on':
        d = readDate(form_data.get('on_date'))
        yield 'start_date', d.isoformat()
        yield 'end_date', (d + relativedelta(days=2)).isoformat()
    elif form_data.get('datetype') == 'between':
        yield 'start_date', form_data.get('start_date')
        yield 'end_date', form_data.get('end_date')
    elif form_data.get('datetype') == 'after':
        yield 'start_date', form_data.get('start_date')
    elif form_data.get('datetype') == 'before':
        yield 'end_date', form_data.get('end_date')
    for k in form_data.keys():
        if k in FILTER_FIELDS:
            try:
                vals = form_data.getlist(k)
            except AttributeError:
                vals = form_data[k]
            # make sure vals is a list
            if isinstance(vals, (str, unicode)) or not isinstance(vals, collections.Iterable):
                vals = [vals]
            vals = [_serialize(v) for v in vals if v]
            if vals:
                yield FILTER_FIELDS[k], vals
    if 'articlesets' not in form_data:
        # filter on all sets in project
        p = Project.objects.get(pk=form_data['projects'])
        sets = [s.id for s in p.all_articlesets()]
        yield "sets", sets
def _scrape_unit(self, thread):
    thread = thread['object']
    fipo = True  # First post
    thread.doc = self.getdoc(thread.props.url)
    for page in self.get_pages(thread.doc):
        for post in page.cssselect('.post'):
            ca = thread if fipo else thread.copy(parent=thread)
            ca.props.date = toolkit.readDate(post.cssselect('.author')[0].text_content()[-22:])
            ca.props.text = post.cssselect('.content')
            title = unicode(post.cssselect('.postbody h3 a')[0].text)
            if fipo and title:
                optitle = title
            elif fipo:
                raise Exception("No op title found")
            if title:
                ca.props.headline = title
            else:
                ca.props.headline = 'Re: {}'.format(optitle)
            try:
                ca.props.author = unicode(post.cssselect('.author strong')[0].text_content())
            except:
                try:
                    ca.props.author = unicode(post.cssselect('.author a')[0].text_content())
                except:
                    # Least reliable method
                    ca.props.author = unicode(post.cssselect('.author')[0].text_content().split()[0])
            yield ca
            fipo = False
def _scrape_unit(self, url):
    article = HTMLDocument(url=url)
    article.prepare(self)
    content = article.doc.cssselect("#content-column")[0]
    article.props.date = readDate(content.cssselect("p.article-meta")[0].text.split("|")[1])
    article.props.headline = content.cssselect("h1")[0].text
    for x in [content.cssselect("h1")[0], content.cssselect("p.article-meta")[0], content.cssselect("p.sharing")[0]]:
        x.drop_tree()
    article.props.text = content.text_content()
    for block in article.doc.cssselect("#aside-column div.block"):
        title = block.cssselect("h2")[0].text
        if "Verantwoordelijk" in title and "ministerie" in title:
            article.props.author = "; ".join([a.text for a in block.cssselect("ul.list-common li a")])
            break
    try:
        if len(article.props.author) > 100:
            article.props.author = article.props.author[:100]
    except AttributeError:
        pass
    yield article
def get_article_dict(art, sets=None):
    date = art.date
    if isinstance(art.date, (str, unicode)):
        date = toolkit.readDate(date)
    date = date.isoformat()
    d = dict(
        # dublin core elements
        id=art.id,
        headline=_clean(art.headline),
        text=_clean(art.text),
        date=date,
        creator=_clean(art.author),
        # other elements
        projectid=art.project_id,
        mediumid=art.medium_id,
        medium=art.medium.name,
        byline=_clean(art.byline),
        section=_clean(art.section),
        page=art.pagenr,
        addressee=_clean(art.addressee),
        length=art.length,
        sets=sets)
    d['hash'] = _get_hash(d)
    return d
def scrape_2(self, _html):
    """New format as of 2014 and a few days before"""
    docdate = readDate(_html.cssselect("h1")[0].text.split("-")[1])
    #split body by <hr>
    items = []
    item = []
    tags = set()
    for child in _html.cssselect("body > *"):
        tags.add(child.tag)
        if child.tag == "hr":
            items.append(item)
            item = []
        else:
            item.append(child)
    #first item is the index
    items = items[1:]
    for item in items:
        article = self.parse_item(item)
        if not article.date:
            article.date = docdate
        yield article
def scrape_comments(self, page):
    for li in page.doc.cssselect("ul.uiList li.fbFeedbackPost"):
        comment = HTMLDocument(parent=page, url=page.url)
        comment.props.text = li.cssselect("div.postText")[0].text
        comment.props.author = li.cssselect("a.profileName")[0].text
        comment.props.date = readDate(li.cssselect("abbr.timestamp")[0].get('title'))
        yield comment
def scrape_comments(self, page):
    nxt = page.doc
    if len(nxt.cssselect("div.pages a.next")) >= 1:
        while len(nxt.cssselect("div.pages a.next")) >= 1:
            try:
                nxt = self.getdoc(nxt.cssselect("div.pages a.next")[0].get('href'))
            except ValueError:
                nxt = self.getdoc(urljoin(INDEX_URL, nxt.cssselect("div.pages a.next")[0].get('href')))
            for li in nxt.cssselect("ol.reacties li.hidenum"):
                comment = HTMLDocument(parent=page)
                if not ("<b>Reageer als eerste op dit bericht</b>" in etree.tostring(li) or "gebruiker verwijderd" in etree.tostring(li)):
                    try:
                        comment.props.text = li.cssselect("div.reactie-body")[0]
                        comment.props.author = li.cssselect("strong")[0].text
                        comment.props.date = readDate(li.cssselect("span.tijdsverschil")[0].get('publicationdate'))
                    except IndexError:
                        pass
                    else:
                        if comment.props.date.date() == self.options['date']:
                            yield comment
    else:
        for li in nxt.cssselect("ol.reacties li.hidenum"):
            comment = HTMLDocument(parent=page)
            if not "<b>Reageer als eerste op dit bericht</b>" in etree.tostring(li):
                try:
                    comment.props.text = li.cssselect("div.reactie-body")[0]
                    comment.props.author = li.cssselect("strong")[0].text
                    comment.props.date = readDate(li.cssselect("span.tijdsverschil")[0].get('publicationdate'))
                    if comment.props.date.date() == self.options['date']:
                        yield comment
                except IndexError:
                    pass
def _get_units(self):
    for page in self.get_pages():
        for post in page.cssselect("div.post"):
            article = HTMLDocument(
                url=post.cssselect("span.h3 a")[0].get('href'),
                headline=post.cssselect("span.h3")[0].text_content().strip(),
                date=readDate(post.cssselect("span.comments span")[0].text.replace(".", "-").split(" ")[1]))
            yield article
def get_comments(self, page):
    for li in page.doc.cssselect("#detail_reactions #reaction ul.clear li"):
        comment = HTMLDocument()
        comment.props.author = li.cssselect("cite")[0].text.strip()
        comment.props.text = li.cssselect("blockquote")[0]
        comment.props.date = readDate(li.cssselect("span.time")[0].text)
        comment.parent = page
        yield comment
def get_comments(self, page):
    for article in page.doc.cssselect("#comments article"):
        comment = HTMLDocument(parent=page)
        footer = article.cssselect("footer")[0].text_content().split(" | ")
        comment.props.date = readDate(footer[1])
        comment.props.author = footer[0]
        comment.props.text = article.cssselect("p")
        yield comment
def get_article(self, page):
    page.props.date = readDate(page.doc.cssselect("#pt1")[0].text_content())
    page.props.author = page.doc.cssselect("span.post_sub a.username")[0].text
    page.props.headline = page.doc.cssselect("div.fieldholder h1")[0].text_content()
    page.props.text = page.doc.cssselect("div.postmain_right")[0]
    page.coords = ''
    return page