def _extractor(txt, htmlpage=None): if txt is None: return m = ereg.search(txt) if m: return htmlregion(u"".join([g for g in m.groups() or m.group() if g]))
def _process_css_and_xpath(self, annotations, selector):
    """Fill ``self.fields`` from annotations that select via CSS or XPath.

    For every annotation the selector method named by its
    ``selection_mode`` is run, the result is narrowed with
    ``self._pick_elems`` against the current regions and parent regions,
    and the extracted, stripped values are stored as an ``ItemField``.
    Annotations producing no value have their entry removed from
    ``self.fields`` instead.

    :param annotations: iterable of mapping-like annotations (read via
        ``.get``) carrying ``selection_mode``, the query itself and
        optionally ``id``.
    :param selector: object exposing ``css`` plus whichever method each
        annotation's ``selection_mode`` names.
    """
    schema = self.schema
    modifiers = self.modifiers
    page = self.htmlpage

    # Elements tagged with the ids of this extractor's own regions.
    region_ids = []
    for region in self.regions:
        rid = region_id(region)
        if rid:
            region_ids.append(rid)
    region_query = ','.join(
        ['[data-tagid="%s"]' % rid for rid in region_ids])
    parents = {node._root for node in selector.css(region_query)}

    # Elements tagged with the id(s) of the parent region, if there is one.
    containers = ()
    parent_region = self.parent_region
    if parent_region:
        if isinstance(parent_region, list):
            parent_regions = parent_region
        else:
            parent_regions = [parent_region]
        container_query = ', '.join(
            ['[data-tagid="{}"]'.format(self.get_region_id(r))
             for r in parent_regions])
        containers = {node._root for node in selector.css(container_query)}

    # Numbering continues after the fields already present so that an
    # annotation without an ``id`` gets a fallback key.
    first_index = len(self.fields)
    for index, annotation in enumerate(annotations, start=first_index):
        mode = annotation.get(u'selection_mode')
        # CSS queries are stored under ``selector``; for any other mode the
        # query is stored under the mode's own name.
        query_key = u'selector' if mode == 'css' else mode
        expression = annotation.get(query_key)
        try:
            matched = getattr(selector, mode)(expression)
            elems = self._pick_elems(matched, parents, containers)
        except ValueError:
            # Skip annotations whose lookup raises ValueError.
            continue

        # Remove the tagging attribute from the picked elements before
        # extracting from them.
        for elem in elems:
            elem._root.attrib.pop('data-tagid', None)

        extracted = elems.xpath(self.attribute_query(annotation)).extract()
        stripped = [six.text_type.strip(raw) for raw in extracted]
        field_id = annotation.get(u'id') or index

        if not stripped:
            self.fields.pop(field_id, None)
            continue

        regions = [htmlregion(v) for v in arg_to_iter(stripped)]
        self.fields[field_id] = ItemField(
            regions, annotation, schema, modifiers, page)
def _extractor(txt, htmlpage=None): if txt is None: return m = ereg.search(txt) if m: return htmlregion(u"".join( [g for g in m.groups() or m.group() if g]))
def safe_html(response):
    """Return the processed HTML of the panel ``div`` found in *response*.

    The first node matching the panel XPath is extracted (an empty string
    when nothing matches), stripped, passed through ``htmlregion``,
    ``safehtml`` and ``replace_escape_chars``, and finally has empty
    ``<p></p>`` pairs removed.

    :param response: object exposing ``xpath(...).extract_first(default)``.
    :return: the resulting string with surrounding whitespace stripped.
    """
    # NOTE(review): the "anenities" spelling is part of the runtime XPath and
    # presumably mirrors the target page's class name -- do not "fix" it
    # without checking the page.
    panel_xpath = '//div[contains(@class, "panel panel-default anenities")]'
    raw_markup = response.xpath(panel_xpath).extract_first('').strip()

    processed = safehtml(htmlregion(raw_markup))
    processed = replace_escape_chars(processed)
    return processed.replace('<p></p>', '').strip()
def extract_content(self, selector):
    """
    Extract the text content of an article.

    Every node matched by the ``ARTICLE_CONTENT`` CSS selector from
    ``self.config_selectors`` is extracted, the pieces are joined with a
    single space, and the result is converted with ``text(htmlregion(...))``.
    When nothing matches, the empty string goes through the same conversion.

    @param selector Scrapy.Selector object (https://docs.scrapy.org/en/latest/topics/selectors.html)
    @return Text content of the article as produced by ``text``
    """
    css_query = self.config_selectors.get('ARTICLE_CONTENT')
    fragments = selector.css(css_query).extract()
    joined = u' '.join(fragments)
    return text(htmlregion(joined))
def raw_to_text(txt):
    """Wrap *txt* with ``htmlregion`` and return ``_text`` applied to it.

    :param txt: raw input forwarded unchanged to ``htmlregion``.
    :return: whatever ``_text`` returns for the wrapped value.
    """
    region = htmlregion(txt)
    return _text(region)
def _extractor(txt): m = ereg.search(txt) if m: return htmlregion(u"".join([g for g in m.groups() or m.group() if g]))
def _extractor(txt): m = ereg.search(txt) if m: return htmlregion(u"".join(filter(None, m.groups() or m.group())))
def extract_text(value):
    """Return ``scrapely_extract_text`` applied to *value* wrapped by ``htmlregion``.

    :param value: input forwarded unchanged to ``htmlregion``.
    :return: whatever ``scrapely_extract_text`` returns for the wrapped value.
    """
    return scrapely_extract_text(htmlregion(value))