def _scrape_unit(self, result):
    """Wrap ``result`` in a Document and yield the units scraped from it.

    The Document receives ``result.data`` as its ``doc`` and has
    ``props.type`` and ``props.fb_id`` (the ``'id'`` entry of the data)
    filled in.  It is then handed to ``scrape_post`` for ``"post"``
    results or to ``scrape_page`` for ``"page"`` results, and everything
    that scraper produces is yielded in order.  Any other type yields
    nothing.
    """
    # Property notes kept from the original author:
    #   (i)page properties:
    #     bytes, page, category, any of the article props
    #   article properties:
    #     date, section, pagenr, headline, byline, length (autogenerated),
    #     url (already present), text, parent, medium (auto), author
    document = Document()
    document.doc = result.data
    document.props.type = result.type
    document.props.fb_id = document.doc['id']

    kind = result.type
    if kind == "post":
        scraped = self.scrape_post(document)
    elif kind == "page":
        scraped = self.scrape_page(document)
    else:
        # Unknown result types produce no units.
        return

    for item in scraped:
        yield item
def _get_paper(self, paper_id):
    """Yield an index Document for every page of the requested paper.

    A ``getPaper`` remoting message addressed to ``onlineFacade`` is
    wrapped in a request and an envelope and sent through ``apiget``.
    The ``'spreads'`` of the response are then walked; each spread may
    carry a ``'leftPage'`` and a ``'rightPage'``, and missing pages are
    skipped.  Each yielded Document carries the date taken from
    ``self.options['date']``, the page's section and number, and the
    raw page as ``doc``.
    """
    date = self.options['date']

    message = self.create_message(
        messaging.RemotingMessage,
        operation="getPaper",
        body=[self.paper_id, paper_id, self.context_id],
        destination="onlineFacade"
    )
    request = self.create_request(message)
    envelope = self.create_envelope(request)
    response = self.apiget(envelope).bodies[0][1]

    for spread in response.body.body['spreads']:
        # Look up both sides before yielding anything for this spread.
        sides = (spread.get('leftPage'), spread.get('rightPage'))
        for page in sides:
            if page is not None:
                index = Document()
                index.props.date = date
                index.props.section = page.get('section')
                page_number = page.get('nr')
                index.props.pagenr = page_number
                index.page = page_number
                index.doc = page
                yield index
def _get_paper(self, paper_id):
    """Yield an index Document for every page of the requested paper.

    A ``getPaper`` remoting message addressed to ``onlineFacade`` is
    wrapped in a request and an envelope and sent through ``apiget``.
    The ``'spreads'`` of the response are then walked; each spread may
    carry a ``'leftPage'`` and a ``'rightPage'``, and missing pages are
    skipped.  Each yielded Document carries the date taken from
    ``self.options['date']``, the page's section and number, and the
    raw page as ``doc``.
    """
    date = self.options['date']

    message = self.create_message(
        messaging.RemotingMessage,
        operation="getPaper",
        body=[self.paper_id, paper_id, self.context_id],
        destination="onlineFacade"
    )
    request = self.create_request(message)
    envelope = self.create_envelope(request)
    response = self.apiget(envelope).bodies[0][1]

    for spread in response.body.body['spreads']:
        # Look up both sides before yielding anything for this spread.
        sides = (spread.get('leftPage'), spread.get('rightPage'))
        for page in sides:
            if page is not None:
                index = Document()
                index.props.date = date
                index.props.section = page.get('section')
                page_number = page.get('nr')
                index.props.pagenr = page_number
                index.page = page_number
                index.doc = page
                yield index