def _match(self, pq, onlyOne=True, callback=None, target=None, llimiter=None, rlimiter=None):
    """
    Extract the first capture group of a regular expression from a selection.

    :param pq: selection to inspect; anything that is not a ``PyQuery``
        instance, or an empty selection, yields ``None``.
    :param onlyOne: when truthy, return the result for the first element
        that matches; otherwise return a list with one result per matching
        element.
    :param callback: forwarded unchanged to ``callback_result``.
    :param target: pattern handed to ``pcre2re``; the object it returns is
        used through its ``search`` method.
    :param llimiter: text prepended to the captured group (falsy -> ``''``).
    :param rlimiter: text appended to the captured group (falsy -> ``''``).
    :return: ``callback_result(callback, llimiter + group(1) + rlimiter)``
        for the first hit (or ``None`` when nothing matches) in single
        mode, or a possibly empty list of such values in list mode.
    """
    if not isinstance(pq, PyQuery) or pq.length == 0:
        return None

    prefix = llimiter or ''
    suffix = rlimiter or ''
    pattern = pcre2re(target)

    def wrap(found):
        # Surround the first capture group with the delimiters before the
        # value is handed to callback_result.
        return callback_result(callback, prefix + found.group(1) + suffix)

    if onlyOne:
        # Single mode searches each element's outer HTML.
        # NOTE(review): list mode below searches .text() instead -- confirm
        # that this difference is intentional.
        for index in range(pq.length):
            html = pq.eq(index).outer_html()
            found = pattern.search(html) if html else None
            if found:
                return wrap(found)
        return None

    results = []
    for index in range(pq.length):
        content = pq.eq(index).text()
        found = pattern.search(content) if content else None
        if found:
            results.append(wrap(found))
    return results
def _htmlparse(self, pq, rule): """ �则解析 """ if 'grep' in rule: pq = self._grep(pq, rule['grep']) if 'not_' in rule: pq = pq.not_(rule['not_']) if 'is_' in rule: pq = pq.is_(rule['is_']) if 'eq' in rule: idx = int(rule['eq']) or 0 if idx < 0: idx = pq.length + idx pq = pq.eq(idx) if 'type' in rule: onlyOne = int(rule.get('onlyOne', 1)) target = rule.get('target', None) callback = rule.get('callback', None) llimiter = rule.get('llimiter', None) rlimiter = rule.get('rlimiter', None) proccessFun = getattr(self, '_%s' % str(rule['type'])) content = proccessFun(pq, target=target, onlyOne=onlyOne, callback=callback, llimiter=llimiter, rlimiter=rlimiter) if 'match' in rule and rule['match']: redata = pcre2re(rule['match']).search(content) if not redata: return None if 'mkey' in rule: rst = ((rule['mkey'] in redata.groups()) and redata.group(rule['mkey']) or None) if isinstance(rst, six.string_types): return self.patch_result(extract_result(rst, rule, None), rule, None) return rst rst = redata.group(1) if isinstance(rst, six.string_types): return self.patch_result(extract_result(rst, rule, None), rule, None) return rst if isinstance(content, six.string_types): return self.patch_result(extract_result(content, rule, None), rule, None) return content elif 'item' in rule: onlyOne = int(rule.get('onlyOne', 0)) if onlyOne: parser = PyqueryParser(rule['item'], str(pq)) return parser.parse() else: data = [] for i in range(pq.length): parser = PyqueryParser(rule['item'], str(pq.eq(i))) data.append(parser.parse()) return data else: return pq
def process_fun(el, rule):
    """
    Decide whether ``el`` passes the filter built from ``rule``.

    Uses ``self``, ``match`` and ``params`` from the enclosing scope.  The
    value parsed out of ``el`` by ``self._htmlparse`` is accepted when:

    * ``match`` is truthy and the pattern finds something in the value;
    * else ``params`` is truthy and equals the value;
    * else the value itself is truthy.

    :return: ``True`` if the element is accepted, ``False`` otherwise.
    """
    source = self._htmlparse(el, rule)
    if match:
        # An empty/None source never reaches the regex search.
        return bool(source and pcre2re(match).search(source))
    if params:
        return bool(params == source)
    return bool(source)
def _text(self, elements, **kwargs): for element in elements: text = element.text if 'match' in kwargs and kwargs['match']: pattern = utils.pcre2re(kwargs['text']) if text and pattern.search(text) and self._getable( kwargs, element): return element elif 'partial' in kwargs and kwargs['partial']: if text and text.find(kwargs['text']) != -1 and self._getable( kwargs, element): return element else: if text == kwargs['text'] and self._getable(kwargs, element): return element
def _css(self, elements, **kwargs): if 'val' not in kwargs: raise CDSpiderSettingError('Selenium val must be not none', self._base_url, self.final_url, rule=kwargs) for element in elements: css = element.value_of_css_property(kwargs['css']) if 'match' in kwargs and kwargs['match']: pattern = utils.pcre2re(kwargs['val']) if css and pattern.search(css) and self._getable( kwargs, element): return element elif 'partial' in kwargs and kwargs['partial']: if css and css.find(kwargs['val']) != -1 and self._getable( kwargs, element): return element else: if css == kwargs['val'] and self._getable(kwargs, element): return element
def _attr(self, elements, **kwargs): if 'val' not in kwargs: raise CDSpiderSettingError('Selenium val must be not none', self._base_url, self.final_url, rule=kwargs) for element in elements: attr = element.get_attribute(kwargs['attr']) if 'match' in kwargs and kwargs['match']: pattern = utils.pcre2re(kwargs['val']) if attr and pattern.search(attr) and self._getable( kwargs, element): return element elif 'partial' in kwargs and kwargs['partial']: if attr and attr.find(kwargs['val']) != -1 and self._getable( kwargs, element): return element else: if attr == kwargs['val'] and self._getable(kwargs, element): return element