Пример #1
0
    def _scrape_unit(self, result):
        """Wrap ``result`` in a Document and yield the units scraped from it.

        The new Document receives ``result.data`` as its ``doc``,
        ``result.type`` as ``props.type`` and the ``'id'`` entry of that data
        as ``props.fb_id``. It is then handed to ``scrape_post`` when the type
        is ``"post"`` or to ``scrape_page`` when the type is ``"page"``, and
        every item that scraper produces is yielded. Any other type yields
        nothing.
        """
        # Property reference carried over from the original notes:
        #   (i)page properties: bytes, page, category, any of the article props
        #   article properties: date, section, pagenr, headline, byline,
        #                       length (autogenerated), url (already present),
        #                       text, parent, medium (auto), author
        document = Document()
        document.doc = result.data
        document.props.type = result.type
        document.props.fb_id = document.doc['id']

        kind = result.type
        if kind == "post":
            scraped = self.scrape_post(document)
        elif kind == "page":
            scraped = self.scrape_page(document)
        else:
            # Unrecognised types produce no units.
            return

        for item in scraped:
            yield item
Пример #2
0
    def _get_paper(self, paper_id):
        """Yield one index Document per page of the requested paper.

        Sends a ``getPaper`` remoting message to the ``onlineFacade``
        destination and walks the ``'spreads'`` of the response. For each
        spread the ``'leftPage'`` and ``'rightPage'`` entries are visited in
        that order; entries that are ``None`` are skipped. Every yielded
        Document carries the date from ``self.options['date']``, the page's
        ``'section'`` and ``'nr'`` values, and the page itself as ``doc``.
        """
        paper_date = self.options['date']

        message = self.create_message(
            messaging.RemotingMessage,
            operation="getPaper",
            body=[self.paper_id, paper_id, self.context_id],
            destination="onlineFacade")
        request = self.create_request(message)
        envelope = self.create_envelope(request)
        response = self.apiget(envelope).bodies[0][1]

        for spread in response.body.body['spreads']:
            # Both sides are looked up before any page of the spread is yielded.
            sides = (spread.get('leftPage'), spread.get('rightPage'))
            for page in sides:
                if page is None:
                    continue

                page_doc = Document()
                page_doc.props.date = paper_date
                page_doc.props.section = page.get('section')
                # The page number is stored both in props and on the document.
                number = page.get('nr')
                page_doc.props.pagenr = number
                page_doc.page = number
                page_doc.doc = page

                yield page_doc
Пример #3
0
    def _get_paper(self, paper_id):
        """Fetch a paper via ``getPaper`` and yield a Document for each page.

        A remoting message for the ``onlineFacade`` destination is built,
        wrapped in a request and an envelope, and sent with ``apiget``. The
        ``'spreads'`` list of the response is then traversed; each spread
        contributes its ``'leftPage'`` and then its ``'rightPage'`` entry,
        unless that entry is ``None``. Each yielded Document has
        ``props.date`` taken from ``self.options['date']``, ``props.section``
        and ``props.pagenr`` / ``page`` taken from the page, and ``doc`` set
        to the page.
        """
        issue_date = self.options['date']

        remoting_msg = self.create_message(
            messaging.RemotingMessage,
            operation="getPaper",
            body=[self.paper_id, paper_id, self.context_id],
            destination="onlineFacade"
        )
        envelope = self.create_envelope(self.create_request(remoting_msg))
        reply = self.apiget(envelope).bodies[0][1]
        spreads = reply.body.body['spreads']

        for spread in spreads:
            # Resolve left and right pages up front, then emit the present ones.
            left_page = spread.get('leftPage')
            right_page = spread.get('rightPage')
            for page in (left_page, right_page):
                if page is not None:
                    entry = Document()
                    entry.props.date = issue_date
                    entry.props.section = page.get('section')
                    page_nr = page.get('nr')
                    # Same value is kept in props and directly on the entry.
                    entry.props.pagenr = page_nr
                    entry.page = page_nr
                    entry.doc = page

                    yield entry