Exemplo n.º 1
0
 def parse(self, source=None, ruleset=None):
     """
     Parse JSON (or JSONP-wrapped JSON) content according to the rules.

     Falsy arguments fall back to ``self.source`` / ``self.ruleset``.

     :param source: text or bytes to parse.
     :param ruleset: rules handed to ``self._jsonparse`` together with the
         JSON text.
     :return: the result of ``self._jsonparse`` when both source and
         ruleset are truthy, otherwise the (possibly decoded) source.
     :raises CDSpiderParserJsonLoadFaild: when the text is neither a JSON
         object/array nor wrapped in a ``( ... )`` pair.
     """
     if not source:
         source = self.source
     if not ruleset:
         ruleset = self.ruleset
     if isinstance(source, bytes):
         try:
             source = utils.decode(source)
         except Exception:
             # str(bytes) would produce the "b'...'" repr, which can never be
             # parsed as JSON; decode leniently instead.
             source = source.decode('utf-8', errors='replace')
     if not (source and ruleset):
         return source

     source = source.strip()
     first, last = source[:1], source[-1:]
     if (first, last) in (('{', '}'), ('[', ']')):
         return self._jsonparse(source, ruleset)

     # Not bare JSON: expect a JSONP-style wrapper such as ``callback(...)``
     # and parse only what sits between the outermost parentheses.
     haslp = source.find('(')
     hasrp = source.rfind(')')
     # ``hasrp < haslp`` also covers a missing ')' (rfind returns -1) and a
     # ')' that appears before the '(' — neither is a valid wrapper.
     if haslp == -1 or hasrp < haslp:
         raise CDSpiderParserJsonLoadFaild('Invalid json data: %s' % (re.sub(r'\s{2,}', ' ', str(source).replace("\r", "").replace("\n",""))), rule = ruleset)
     return self._jsonparse(source[haslp + 1:hasrp], ruleset)
Exemplo n.º 2
0
 def parser_time(html, now=False):
     """
     Find a publication date/time string in a piece of HTML.

     Patterns are tried in this order and the first one that yields any
     match wins: a date after a "published at"-style label (rule12), a date
     followed by "发布" (rule13), a date that carries a time (rule15), a date
     right after ``>`` (rule11), and a ``-``/``/`` date followed by spaces or
     ``&nbsp;`` and ``<`` (rule14).

     :param html: HTML text, or bytes which are decoded first.
     :param now: when true, return the current local time formatted as
         ``%Y-%m-%d %H:%M:%S`` if nothing usable was found.
     :return: the matched date string; with several matches, the first one
         containing ``:``. ``None`` (or the current time, see ``now``) when
         there is no usable match.
     """
     # Raw strings keep the regex escapes (\d, \s, \>, ...) out of Python's
     # own string-escape processing, which warns about invalid sequences.
     rule1 = (
         r'((?:(?:(?:(?:20)?[012]\d)|(?:19)?[98]\d)年\d{1,2}月\d{1,2}日?(?:\D*\d{2}:\d{2}(?::\d{2})?)?)'
         r'|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)-\d{1,2}-\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?)'
         r'|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)\/\d{1,2}\/\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?))'
     )
     rule11 = r'\>\s*' + rule1 + r'[\s&\<]'
     rule12 = r'[\>\s;](?:(?:本文)?发布于|发表于|发布(?:时间|日期)|日期|时间)\s*(?:\:|:)?\s*' + rule1
     rule13 = rule1 + r'\s*发布'
     # ``\040`` is read by the regex engine as the octal escape for a space.
     rule14 = (
         r'[\s\>;]((?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)-\d{1,2}-\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?)'
         r'|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)\/\d{1,2}\/\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?))'
         r'(?:\040|&nbsp;)+\<'
     )
     rule15 = (
         r'((?:(?:(?:(?:20)?[012]\d)|(?:19)?[98]\d)年\d{1,2}月\d{1,2}日\D?\d{2}:\d{2}(?::\d{2})?)'
         r'|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)-\d{1,2}-\d{1,2} \d{1,2}:\d{2}(?::\d{2})?)'
         r'|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)\/\d{1,2}\/\d{1,2} \d{1,2}:\d{2}(?::\d{2})?))'
     )
     if isinstance(html, bytes):
         try:
             html = decode(html)
         except Exception:
             # str(bytes) would give the "b'...'" repr with escaped non-ASCII
             # bytes, hiding the Chinese labels the patterns look for.
             html = html.decode('utf-8', errors='replace')
     if html:
         matches = (re.findall(rule12, html) or re.findall(rule13, html)
                    or re.findall(rule15, html) or re.findall(rule11, html)
                    or re.findall(rule14, html))
         if len(matches) == 1:
             return matches[0]
         # Several candidates: prefer the first one that includes a time part.
         for item in matches:
             if item.find(':') > 0:
                 return item
     if now:
         return time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time()))
     return None
Exemplo n.º 3
0
    def parse(self, source=None, ruleset=None):
        """
        Extract data from ``source`` via ``JsonParser`` or ``Goose.fetch``.

        Falsy arguments fall back to ``self.source`` / ``self.ruleset``. When
        the top-level ``filter`` starts with ``@json:``, or the ``item.url``
        rule's ``filter`` starts with ``@json``, the work is delegated to
        ``JsonParser``. Otherwise the content goes through ``Goose.fetch``.

        :param source: page content, text or bytes.
        :param ruleset: rule dict; also passed to Goose as ``custom_rule``.
        :return: the ``JsonParser.parse()`` result, or the ``infos`` attribute
            of the object returned by ``Goose.fetch``.
        """
        if not source:
            source = self.source
        ruleset = ruleset or self.ruleset or {}

        use_json = (ruleset.get('filter') or '').startswith('@json:')
        if not use_json:
            item_rule = ruleset.get('item') or {}
            url_rule = item_rule.get("url") or {}
            use_json = (url_rule.get("filter") or '').startswith('@json')
        if use_json:
            parser = JsonParser(source=source,
                                ruleset=ruleset,
                                logger=self.logger,
                                domain=self.domain,
                                subdomain=self.subdomain)
            return parser.parse()

        g = Goose({
            "target_language": "zh",
            'stopwords_class': StopWordsChinese,
            "enable_fewwords_paragraphs": True,
            "logger": self.logger,
            "domain": self.domain,
            "subdomain": self.subdomain,
            "custom_rule": ruleset or {}
        })

        if isinstance(source, bytes):
            try:
                catalogue = g.fetch(raw_html=utils.decode(source),
                                    encoding='UTF8')
            except Exception:
                # Best effort: if decoding (or fetching the decoded text)
                # fails, retry with the raw bytes. ``Exception`` rather than a
                # bare ``except`` so KeyboardInterrupt/SystemExit still
                # propagate.
                catalogue = g.fetch(raw_html=source)
        else:
            catalogue = g.fetch(raw_html=source, encoding='UTF8')
        return catalogue.infos
Exemplo n.º 4
0
    def parse(self, source=None, ruleset=None):
        """
        Extract data from ``source`` via ``JsonParser`` or ``Goose.parse``.

        Falsy arguments fall back to ``self.source`` / ``self.ruleset``.

        * If the ruleset has an ``item`` key, the ruleset's own ``filter``
          decides whether ``JsonParser`` is used; otherwise the full result
          is returned (``onlyOne`` is forced to 0).
        * Without ``item``, the first rule's ``filter`` decides whether
          ``JsonParser`` is used, and ``onlyOne`` (default 1) controls
          whether only the first element of a list result is returned.

        The ruleset passed in (or ``self.ruleset``) is never modified; the
        ``onlyOne`` flag is stripped from a copy before it reaches Goose.

        :param source: page content, text or bytes.
        :param ruleset: rule dict, optionally carrying an ``onlyOne`` flag.
        :return: the ``JsonParser.parse()`` result, or ``infos`` of the
            object returned by ``Goose.parse`` (its first element when
            ``onlyOne`` is truthy and ``infos`` is a non-empty list).
        """
        if not source:
            source = self.source
        ruleset = ruleset or self.ruleset or {}

        def _uses_json(rule):
            # True when ``rule`` is a dict whose filter is an "@json:" filter.
            if not isinstance(rule, dict):
                return False
            rule_filter = rule.get('filter')
            return isinstance(rule_filter, str) and rule_filter.startswith('@json:')

        def _parse_json():
            # Delegate the whole job to the JSON parser.
            return JsonParser(source=source,
                              ruleset=ruleset,
                              logger=self.logger,
                              domain=self.domain,
                              subdomain=self.subdomain).parse()

        if "item" in ruleset:
            if _uses_json(ruleset):
                return _parse_json()
            only_one = 0
        else:
            # The first value may be a non-dict (e.g. the onlyOne flag
            # itself); _uses_json tolerates that.
            if _uses_json(next(iter(ruleset.values()), None)):
                return _parse_json()
            only_one = ruleset.get('onlyOne', 1)

        # Build Goose's rules without the control flag instead of popping it
        # from the shared dict, so repeated calls see the same ruleset.
        custom_rule = {key: rule for key, rule in ruleset.items()
                       if key != 'onlyOne'}
        g = Goose({
            "target_language": "zh",
            'stopwords_class': StopWordsChinese,
            "enable_fewwords_paragraphs": True,
            "logger": self.logger,
            "domain": self.domain,
            "subdomain": self.subdomain,
            "custom_rule": custom_rule,
            "final_url": self.final_url
        })

        if isinstance(source, bytes):
            try:
                catalogue = g.parse(raw_html=utils.decode(source),
                                    encoding='UTF8')
            except Exception:
                # Best effort: retry with the raw bytes. ``Exception`` keeps
                # KeyboardInterrupt/SystemExit from being swallowed.
                catalogue = g.parse(raw_html=source)
        else:
            catalogue = g.parse(raw_html=source, encoding='UTF8')
        data = catalogue.infos
        if only_one and isinstance(data, list) and data:
            return data[0]
        return data
Exemplo n.º 5
0
 def parse(self, source=None, ruleset=None):
     """
     Apply every rule of the rule set to the source via ``self._filter``.

     Falsy arguments fall back to ``self.source`` / ``self.ruleset``.

     :param source: text or bytes to run the rules against.
     :param ruleset: mapping of result key to rule.
     :return: dict mapping each key of ``ruleset`` to
         ``self._filter(source, rule)``, or ``None`` when the source or the
         ruleset is empty.
     """
     if not source:
         source = self.source
     if not ruleset:
         ruleset = self.ruleset
     if isinstance(source, bytes):
         try:
             source = decode(source)
         except Exception:
             # str(bytes) would hand the "b'...'" repr to the filters; decode
             # leniently so they see the actual text.
             source = source.decode('utf-8', errors='replace')
     if not (source and ruleset):
         return None
     return {key: self._filter(source, ruleset[key]) for key in ruleset}
Exemplo n.º 6
0
 def parse(self, source=None, ruleset=None):
     """
     Parse the content according to the rules, using PyQuery.

     Falsy arguments fall back to ``self.source`` / ``self.ruleset``.

     Each key of the ruleset is handled by the shape of its rule:

     * empty rule: the whole source is stored under the key;
     * rule containing ``type`` or ``filter``: one ``self._filter`` call;
     * list of rules: the first rule whose ``self._filter`` result is truthy
       wins (the key is left out when none matches);
     * any other mapping: every sub-rule is filtered and the results are
       stored as a nested dict.

     :param source: markup to parse, text or bytes.
     :param ruleset: mapping of result key to rule(s).
     :return: dict of extracted data, or the source itself when the source
         or the ruleset is empty.
     """
     if not source:
         source = self.source
     if not ruleset:
         ruleset = self.ruleset
     if isinstance(source, bytes):
         try:
             source = decode(source)
         except Exception as exc:
             # Best effort: keep the raw bytes and pass them to PyQuery
             # as-is, but leave a trace instead of failing silently.
             self.info("Pyquery decode failed, parsing raw bytes: %s" % str(exc))
     if not (source and ruleset):
         return source

     pq = PyQuery(source)
     data = {}
     for k in ruleset:
         rule = ruleset[k]
         if not rule:
             data[k] = source
             continue
         if 'type' in rule or 'filter' in rule:
             self.info("Pyquery rule: %s" % str(rule))
             data[k] = self._filter(pq, rule)
             self.info("Pyquery data: %s" % str(data[k]))
         elif isinstance(rule, list):
             # Alternatives: take the first rule that produces something.
             for r in rule:
                 self.info("rule: %s" % str(r))
                 item = self._filter(pq, r)
                 self.info("data: %s" % str(item))
                 if item:
                     data[k] = item
                     break
         else:
             rest = {}
             for idx in rule:
                 self.info("Pyquery rule: %s=>%s" % (idx, str(rule[idx])))
                 rest[idx] = self._filter(pq, rule[idx])
                 self.info("Pyquery data: %s=>%s" % (idx, str(rest[idx])))
             data[k] = rest
     return data
Exemplo n.º 7
0
    def parse(self, source=None, ruleset=None):
        """
        Extract article fields from ``source``.

        Falsy arguments fall back to ``self.source`` / ``self.ruleset``. Only
        rules that are dicts with a truthy ``filter`` are kept as item rules;
        if the first of them uses an ``@json:`` filter the work is delegated
        to ``JsonParser``. Otherwise the article is extracted with Goose.

        :param source: page content, text or bytes.
        :param ruleset: mapping of field name to rule.
        :return: dict with one entry per ruleset key, always including
            ``title``, ``content`` (plus ``raw_content``), ``pubtime`` and
            ``author``; or the ``JsonParser.parse()`` result.
        """
        if not source:
            source = self.source
        ruleset = ruleset or self.ruleset or {}

        item_ruleset = {
            name: rule
            for name, rule in ruleset.items()
            if rule and isinstance(rule, dict) and rule.get('filter')
        }
        if item_ruleset:
            # Every kept rule has a truthy filter, so it can be read directly.
            first_rule = next(iter(item_ruleset.values()))
            if first_rule['filter'].startswith('@json:'):
                json_parser = JsonParser(source=source,
                                         ruleset=item_ruleset,
                                         logger=self.logger,
                                         domain=self.domain,
                                         subdomain=self.subdomain)
                return json_parser.parse()

        local_storage_path = self._settings.get('attach_storage', None)
        goose_config = {
            "target_language": "zh",
            'stopwords_class': StopWordsChinese,
            "enable_fewwords_paragraphs": True,
            "logger": self.logger,
            "domain": self.domain,
            "subdomain": self.subdomain,
            "custom_rule": item_ruleset or {},
            "local_storage_path": local_storage_path,
            "final_url": self.final_url
        }
        g = Goose(goose_config)

        if not isinstance(source, bytes):
            article = g.extract(raw_html=source, encoding='UTF8')
        else:
            try:
                article = g.extract(raw_html=utils.decode(source),
                                    encoding='UTF8')
            except UnicodeDecodeError:
                article = g.extract(raw_html=source)

        data = {}

        def _fill(field):
            # Populate one of the built-in fields from the extracted article.
            if field == 'title':
                data[field] = article.infos['title']['clean_title']
            elif field == 'content':
                data[field] = article.infos['cleaned_text']
                top_html = article.top_node_html
                if isinstance(top_html, (list, tuple)):
                    top_html = '\r\n'.join(top_html)
                # Replace ideographic and non-breaking spaces by plain spaces.
                data["raw_content"] = re.sub('[\u3000\xa0]', ' ',
                                             str(top_html))
            elif field == 'pubtime':
                data[field] = article.infos['publish_date']
            else:
                data[field] = self.get_author(article.infos['authors'])

        builtin_fields = ('title', 'content', 'pubtime', 'author')
        # First the fields the ruleset asks for, in ruleset order ...
        for field in ruleset:
            if field in builtin_fields:
                _fill(field)
            else:
                data[field] = article.infos.get(field, None)
        # ... then any built-in field the ruleset did not mention.
        for field in builtin_fields:
            if field not in data:
                _fill(field)
        return data