def parseJsonPage(self, site, doc, listurl):
    """Parse a JSON listing page and register one Item per list entry.

    :param site:    seed/site object; only ``getCharset()`` is used here.
    :param doc:     raw JSON text of the listing page.
    :param listurl: URL of the listing page, base for resolving entry links.
    :raises ValueError: when the document cannot be parsed/processed.

    Side effect: fills ``self.items[guid] = item`` for every entry found.
    """
    try:
        doc = json.loads(doc, encoding=site.getCharset())
        # The list rule may point at a sub-key holding the entry array;
        # otherwise the whole document is treated as the entry data.
        item = self.listRule.getEntryItem()
        if item and item in doc:
            data = doc[item]
        else:
            data = doc
        urlParent = self.listRule.getContentUrl()
        extrarules = self.listRule.extrarules
        if isinstance(data, list) and urlParent:
            for _data in data:
                # Only entries that carry the content-url key become items.
                if urlParent in _data:
                    link = urlparse.urljoin(listurl, _data[urlParent])
                    _item = Item({
                        "type": self.seed_type,
                        "images": []
                    })
                    # Extract the required key data into Field objects.
                    for field_id, _rule, fetch_all in extrarules:
                        field = Field(field_id=field_id, rule=_rule)
                        if _rule in _data:
                            value = _data[_rule]
                            if is_image(value):
                                _item["images"].append(value)
                            field.value = value
                        _item[field["name"]] = field
                    if link is not None:
                        _item['url'] = link
                    # Derive the item GUID; when no guid_rule is configured,
                    # temporarily install one (all field ids, or just 'url').
                    if self.guid_rule:
                        guid = self.getItemGUID(_item)
                    elif self.seed_type in self.dont_craw_content:
                        self.guid_rule = []
                        for f in _item.fields:
                            self.guid_rule.append(_item[f]["id"])
                        guid = self.getItemGUID(_item)
                        self.guid_rule = None
                    else:
                        self.guid_rule = "url"
                        guid = self.getItemGUID(_item)
                        self.guid_rule = None
                    self.items[guid] = _item
    except Exception:
        # BUGFIX: the original `raise "Cant parse json file"` raised a string,
        # which is itself a TypeError on any modern Python. Raise a real
        # exception instead (still signalling the same failure).
        raise ValueError("Cant parse json file")
def parseDocument(self, doc):
    """Parse one article document: extract configured fields into
    ``self.data`` and assemble the (possibly paginated) content.

    :param doc: raw HTML of the article page (parsed with pyquery below).

    Side effects: populates ``self.data[<field name>]`` with Field objects,
    sets ``self.data['content'].value`` and extends ``self.data['images']``.
    """
    doc = pq(doc);
    wrapparent = self.articleRule.wrapparent
    pageparent = self.articleRule.pageparent
    content_re = "";
    # sub-page urls (pagination)
    urls = []
    # extracted text/content data
    content = ""
    article = doc.find(wrapparent);
    # pages: collect follow-up page urls when a pagination rule exists
    if pageparent:
        urls = self.parsePage(article, pageparent)
    # extra rules cover title, tags, content, etc.
    extrarules = self.articleRule.extrarules
    # only articles have content
    if len(extrarules):
        for key, rule, fetch_all in extrarules:
            field = Field(field_id=key, rule=rule);
            value = getElementData(doc, rule, self.data["images"], fetch_all)
            # NOTE: the Field is stored first and (for the non-content
            # branches) its .value assigned afterwards — self.data holds a
            # reference, so the later mutation is visible there.
            self.data[field.get('name')] = field
            if field.is_article_content():
                # article content: remember the rule so pagination below can
                # re-extract with it; value assembled as one string
                content_re = field.get("rule")
                content = value
            elif field.is_gallery_content():
                # gallery content: content becomes a list of values
                content_re = field.get("rule")
                content = []
                if (isinstance(value, list)):
                    content += value
            else:
                field.value = value
    # fetch and append paginated content
    if len(urls) > 0 and content_re:
        for next_url in urls:
            next_page = Fetch(next_url, charset = self.seed["charset"], timeout = self.seed["timeout"]).read()
            if next_page is not None:
                next_page = self._getContent(next_page, wrapparent, content_re);
                if next_page:
                    # list => gallery (append page), str => article (concat)
                    if isinstance(content, list):
                        content.append(next_page)
                    else:
                        content += next_page
    if content and content_re:
        if isinstance(content, list):
            # gallery: the list itself is both content and image set
            self.data['content'].value = content
            self.data['images'] += content
        else:
            # article: clean the HTML through Readability, harvesting images
            content = Readability(content, self.url, self.articleRule.filters)
            images = content.getImages();
            self.data['content'].value = content.getContent();
            self.data['images'] += images
def entry(i, e):
    """Build an Item from one list-entry element and store it in
    ``self.items`` keyed by its GUID.

    Closure over ``self``, ``listurl`` and ``extrarules`` from the
    enclosing scope; ``i`` is the element index, ``e`` the element.
    """
    # Resolve the entry link: anchors carry it on href directly,
    # otherwise extract it via the content-url rule.
    content_url_rule = self.listRule.getContentUrl()
    if e.tag == "a":
        href = e.get("href")
    else:
        href = getElementData(e, content_url_rule)
    if href is not None:
        href = urlparse.urljoin(listurl, href)
    record = Item({
        "type": self.seed_type,
        "images": []
    })
    # Populate every configured extra field from the element.
    for field_id, _rule, fetch_all in extrarules:
        fld = Field(field_id=field_id, rule=_rule)
        # TODO: filter HOOK
        fld.value = getElementData(e, _rule, record["images"])
        record[fld["name"]] = fld
    if href is not None:
        record['url'] = href
    # Derive the item GUID. When no guid_rule is configured, install a
    # temporary one (all field ids, or just 'url') and clear it afterwards.
    if self.guid_rule:
        guid = self.getItemGUID(record)
    else:
        if self.seed_type in self.dont_craw_content:
            self.guid_rule = [record[f]["id"] for f in record.fields]
        else:
            self.guid_rule = "url"
        guid = self.getItemGUID(record)
        self.guid_rule = None
    self.items[guid] = record
def _testField(self):
    """Smoke-test: construct a Field from arbitrary keyword attributes."""
    attrs = {
        'name': 'Title',
        'id': "title",
        'rule': 'h1.text()',
        'type': 'list',
        'other': '222',
    }
    f = Field(**attrs)
def _testItem(self):
    """Exercise basic Item behaviour: construction, membership,
    plain assignment and Field assignment.
    """
    test_item = Item(test="a", ddd="ff")
    # BUGFIX: these membership checks were bare no-op expression
    # statements — their results were computed and discarded, so the
    # test could never fail. Assert them instead.
    assert "ddd" in test_item
    assert "adads" not in test_item
    test_item["aaa"] = "c"
    test_item['f1'] = Field(name='title', rule='b')