def listImg(html, baseUrl=None, start=None, end=None, before=None):
    """Collect the ``<img>`` tags found in a section of ``html``.

    ``start``/``end``/``before`` are forwarded to ``getSection`` to narrow the
    region searched.  When ``baseUrl`` is given, relative ``src`` values are
    resolved through ``absoluteUrl``.

    Returns a list of ``Dict`` objects with keys ``url``, ``alt`` and, for
    resolved relative links, ``relativeUrl``.  Duplicate urls are merged,
    keeping the longest alt text.  Always returns a list (empty on no match
    or a bad section) — consistent with ``listA`` — so callers can iterate
    unconditionally.
    """
    if not html or html.find("<img") < 0:
        return []
    s, e = getSection(html, start, end, before)
    if s < 0 or e < s:
        return []  # was None; [] keeps the contract uniform with listA
    html = html[s:e]
    if not html or html.find("<img") < 0:
        return []
    # Raw strings so '\s' reaches the regex engine instead of being a
    # (deprecated) string escape.
    patternLst = _getRegex(r'<img[\s]+[^>]*>')
    patternUrl = _getRegex(r'src[\s]*=[\s\'"]*(?P<url>.*?)[\'"\s>]')
    patternTitle = _getRegex(r'alt[\s]*=[\s\'"]*(?P<title>.*?)[\'"\s>]')
    dic = Dict()  # keyed by url so duplicates collapse
    for tagHtml in patternLst.findall(html):
        url = None
        title = None
        tmp = patternUrl.search(tagHtml)
        if tmp:
            url = tmp.group("url")
        tmp = patternTitle.search(tagHtml)
        if tmp:
            title = tmp.group("title")
        try:
            if url:
                url = url.strip().lower()
                if baseUrl and url[:7] != "http://" and url[:8] != "https://":
                    absUrl = absoluteUrl(baseUrl, url)
                    if absUrl != url:
                        d = dic[absUrl]
                        if d:
                            # keep the longer (more descriptive) alt text
                            if not d.alt or (title and len(d.alt) < len(title)):
                                d['alt'] = title
                        else:
                            dic[absUrl] = Dict({
                                'url': absUrl,
                                'alt': title,
                                'relativeUrl': url
                            })
                else:
                    d = dic[url]
                    if d:
                        if not d.alt or (title and len(d.alt) < len(title)):
                            d['alt'] = title
                    else:
                        dic[url] = Dict({'url': url, 'alt': title})
        except Exception as ex:
            printInfo(ex)
    return list(dic.values())
def listA(html, baseUrl=None, start=None, end=None, before=None):
    """Collect the ``<a>`` tags found in a section of ``html``.

    ``start``/``end``/``before`` are forwarded to ``getSection`` to narrow the
    region searched.  When ``baseUrl`` is given, relative ``href`` values are
    resolved through ``absoluteUrl``.

    Returns a list of ``Dict`` objects with keys ``url``, ``title`` and, for
    resolved relative links, ``relativeUrl``.  Duplicate urls are merged,
    keeping the longest title.  Always returns a list.
    """
    if not html:
        return []
    s, e = getSection(html, start, end, before)
    if s < 0 or e < s:
        return []
    html = html[s:e]
    if not html or html.find("<a") < 0:
        return []
    # Raw strings so '\s' reaches the regex engine instead of being a
    # (deprecated) string escape.
    patternLst = _getRegex(r'<a[\s]+[^>]*>[\s\S]*?</a>')
    patternUrl = _getRegex(r'href[\s]*=[\s\'"]*(?P<url>.*?)[\'"\s>]')
    patternTitle1 = _getRegex(r'title[\s]*=[\s\'"]*(?P<title>.*?)[\'"\s>]')
    patternTitle2 = _getRegex(r'<a[\s]+[^>]*>(?P<title>.*?)</a>')
    dic = Dict()  # keyed by url so duplicates collapse
    for anchor in patternLst.findall(html):
        url = None
        title = None
        tmp = patternUrl.search(anchor)
        if tmp:
            url = tmp.group("url")
        tmp = patternTitle1.search(anchor)
        if tmp:
            title = tmp.group("title")
        if not title:
            # fall back to the anchor's inner text, stripped of inner tags
            tmp = patternTitle2.search(anchor)
            if tmp:
                title = tmp.group("title")
                title = _getRegex('<[^<>]+>').sub('', title)
        try:
            if url:
                url = url.strip().lower()
                if baseUrl and url[:7] != "http://" and url[:8] != "https://":
                    absUrl = absoluteUrl(baseUrl, url)
                    if absUrl != url:
                        d = dic[absUrl]
                        if d:
                            # keep the longer (more descriptive) title
                            if not d.title or (title and len(d.title) < len(title)):
                                d['title'] = title
                        else:
                            dic[absUrl] = Dict({
                                'url': absUrl,
                                'title': title,
                                'relativeUrl': url
                            })
                else:
                    # bare scheme+host (no path slash yet): normalise with '/'
                    if url.rfind('/') < 9:
                        url += '/'
                    d = dic[url]
                    if d:
                        if not d.title or (title and len(d.title) < len(title)):
                            d['title'] = title
                    else:
                        dic[url] = Dict({'url': url, 'title': title})
        except Exception as ex:
            printInfo(ex)
    return list(dic.values())
class MemCookieStore:
    """In-memory cookie store keyed by domain.

    NOTE(review): ``_cookies`` is a *class* attribute, so the store is shared
    by every ``MemCookieStore`` instance — presumably intended as one global
    cookie jar; confirm before relying on per-instance isolation.
    """

    _cookies = Dict()

    # Cookie *attributes* (RFC 6265) that must not be replayed to the server
    # as name=value pairs.  Compared case-insensitively.
    _ATTRIBUTES = ('expires', 'domain', 'path', 'secure', 'httponly',
                   'max-age', 'samesite')

    def __init__(self):
        pass

    def getCookie(self, url):
        """Return the stored cookie string for ``url``'s host.

        Falls back to the parent domain (first label dropped, e.g.
        ``www.example.com`` -> ``example.com``) when the exact host has no
        entry.  Returns None when nothing is stored.
        """
        domain = urlparse(url).netloc
        cookie = self._getCookie(domain)
        if not cookie:
            # no dot -> find() is -1 -> start 0 -> domain unchanged (harmless)
            start = domain.find('.') + 1
            domain = domain[start:]
            cookie = self._getCookie(domain)
        return cookie

    def _getCookie(self, domain):
        # Internal lookup; None when the domain has no entry.
        return self._cookies.get(domain)

    def setCookie(self, url, cookie):
        """Merge ``cookie`` (one Set-Cookie string, or an iterable of them)
        into the entry for ``url``'s host, overwriting duplicate names."""
        if not cookie:
            return
        domain = urlparse(url).netloc
        kvs = {}
        old = self._getCookie(domain)
        if old:
            self._parseCookie(old, kvs)
        if isinstance(cookie, str):
            self._parseCookie(cookie, kvs)
        else:
            for line in cookie:
                self._parseCookie(line, kvs)
        self._cookies[domain] = self._dic2str(kvs)

    def _dic2str(self, dic):
        """Serialize {name: value} back to ``a=1;b=2`` form (None if empty)."""
        if not dic:
            return None
        return ';'.join(u'{}={}'.format(k, v) for k, v in dic.items())

    def _parseCookie(self, cookie, kvs):
        """Parse one cookie header string into ``kvs``, skipping attributes.

        Splits only on the *first* '=' so values containing '=' (base64,
        signed tokens, ...) are kept intact — the old full split truncated
        them.  Empty segments are ignored.
        """
        for part in cookie.split(';'):
            part = part.strip()
            if not part:
                continue
            kv = part.split('=', 1)
            name = kv[0].strip()
            if name.lower() in self._ATTRIBUTES:
                continue
            value = kv[1].strip() if len(kv) > 1 else ''
            kvs[name] = value
def getElementByReg(self, regex, tag=None, start=None, end=None, before=None):
    """Search this node's html with ``regex`` and wrap the first match as a
    child node; an empty Dict when this node has no html."""
    html = self['html']
    if not html:
        return Dict()
    # module-level helper of the same name does the actual matching
    found = getElementByReg(regex, tag, html, start, end, before)
    return RegexDictNew(found, root=self._rootNode, parent=self)
def getElementByText(self, text, tag=None, start=None, end=None, before=None):
    """Find the element containing ``text`` inside this node's html and wrap
    it as a child node; an empty Dict when this node has no html."""
    html = self.html
    if not html:
        return Dict()
    # delegate to the module-level search helper
    found = getElementByText(text, tag, html, start, end, before)
    return RegexDictNew(found, root=self._rootNode, parent=self)
def getElement(self, tag, attr='class', value=None, start=None, end=None, before=None):
    """Find a ``tag`` element (optionally filtered by ``attr``/``value``)
    inside this node's html and wrap it as a child node."""
    html = self['html']
    if not html:
        return Dict()
    # delegate to the module-level search helper
    found = getElement(tag, attr, value, html, start, end, before)
    return RegexDictNew(found, root=self._rootNode, parent=self)
def RegexDictNew(dic, root, parent=None, s=None):
    """Wrap a raw element ``dic`` in a ``RegexDict`` node linked into the tree.

    Offsets on ``dic`` are relative to the parent's inner html; ``s`` (or,
    failing that, the position just past the parent's opening '>' inside
    ``root.html``) rebases them to absolute positions in the root document.
    Returns an empty Dict when ``dic`` is falsy.
    """
    if not dic:
        return Dict()
    ele = RegexDict(dic)
    _s = 0
    if s is not None:  # was 's != None'; identity test is the Python idiom
        _s = s
    elif parent:
        # the parent's content starts just after its opening tag's '>'
        _s = root.html.find('>', parent._start) + 1
    ele._start = dic._start + _s
    ele._end = dic._end + _s
    ele._rootNode = root
    ele._parentNode = parent
    return ele
def __init__(self, parent_element):
    """Build the dict view of an XML element: its attributes first, then one
    list per child tag (recursing into grandchildren via ``ele2arr``); a
    leaf element just records its own tag."""
    attrs = parent_element.items()
    if attrs:
        self.update(Dict(attrs))
    has_children = False
    for child in parent_element:
        has_children = True
        tag = child.tag
        if not self.get(tag):
            self.update({tag: []})
        child_dic = self.getDic(child)
        self[tag].append(child_dic)
        if len(child) > 0:
            # child has its own children: expand them into child_dic
            self.ele2arr(child_dic, child)
    if not has_children:
        self.update({'tag': parent_element.tag})
def _getParentStart(html, end, tag):
    """Scan backwards from ``end`` to locate the opening tag of the element
    enclosing that position.

    Returns ``(index, tag)`` of the enclosing opening tag, or None when no
    enclosing element can be found.  When ``tag`` is given, only that tag
    name is considered; otherwise the tag is discovered while scanning.
    """
    start = end
    s = end
    # counts of close tags seen but not yet matched to an opener, by name
    dicTag = Dict()
    if not tag:
        while True:
            if s < 0:
                return None
            # previous '<' and '>' before position s
            l = html.rfind('<', 0, s)
            r = html.rfind('>', 0, s)
            if (r < l or l < 0 or r < 0):
                return None
            tagHtml = html[l:r + 1]
            if tagHtml[1] == '/':
                # closing tag: remember it so its opener is skipped later
                s = l
                tmpTag = tagHtml[2:-1]
                if not dicTag[tmpTag]:
                    dicTag[tmpTag] = 1
                else:
                    dicTag[tmpTag] = dicTag[tmpTag] + 1
                continue
            tag = _getTag(tagHtml, r - l + 1, None, 0)
            if len(dicTag) > 0:
                if dicTag[tag]:
                    # this opener matches a previously-seen closer: cancel it
                    dicTag[tag] = dicTag[tag] - 1
                    if not dicTag[tag]:
                        del dicTag[tag]
                    s = l
                    continue
            if not tag:
                return None
            if _checkSingle(tagHtml, 0):
                # self-closing/void tag cannot be the parent; keep scanning
                start = html.rfind('<' + tag, 0, start)
                continue
            # nearest preceding opener vs closer of this tag name
            l = html.rfind('<' + tag, 0, start)
            r = html.rfind('</' + tag, 0, start)
            if l > r:
                # opener is closer than its close tag -> it encloses us
                return (l, tag)
            if r > l:
                # a complete sibling element; jump past it and keep scanning
                start = l
                s = l
            else:
                return None
    else:
        while True:
            if start < 0:
                return None
            l = html.rfind('<' + tag, 0, start)
            r = html.rfind('</' + tag, 0, start)
            if l > r:
                # unmatched opener found -> it is the enclosing parent
                return (l, tag)
            if r > l:
                # skip over a complete sibling of the same tag
                start = l
            else:
                return None
def extract(self, url, html, ssp):
    """Resolve the model names attached to ``url`` (falling back to
    ``ssp.models``) into ExtractModel entries and delegate extraction to
    ``ssp.extract``."""
    names = url.get("model") or ssp.models
    models = []
    if names:
        for name in names:
            model = ExtractModel.get(name)
            if model:
                models.append(model)
            else:
                printInfo('no model ' + name)
    return ssp.extract(Dict(url), html, models, names)
def RegexDictNew(dic, root, parent=None, s=None):
    """Wrap ``dic`` in a ``RegexDict`` node, rebasing its offsets into the
    root document and registering the node on the root in edit mode.

    When ``root`` is falsy the ``parent`` argument is promoted to root and
    the parent link is dropped.  Returns an empty Dict when ``dic`` is falsy.
    """
    if not dic:
        return Dict()
    ele = RegexDict(dic)
    _s = 0
    if not root:
        # caller passed the parent in the root slot; treat it as the root
        root = parent
        parent = None
    if s is not None:  # was 's != None'; identity test is the Python idiom
        _s = s
    elif parent:
        # the parent's content starts just after its opening tag's '>'
        _s = root['html'].find('>', parent._start) + 1
    ele._start = dic._start + _s
    ele._end = dic._end + _s
    ele._rootNode = root
    ele._parentNode = parent
    if root and root._editFlag:
        # index nodes by absolute start so later edits can locate them
        root._elements[ele._start] = ele
    return ele
def getElementByClass(self, className, start=None, end=None, before=None):
    """Find the element with CSS class ``className`` inside this node's html
    and wrap it as a child node; an empty Dict when this node has no html."""
    html = self['html']
    if not html:
        return Dict()
    # delegate to the module-level search helper
    found = getElementByClass(className, html, start, end, before)
    return RegexDictNew(found, root=self._rootNode, parent=self)
def getNext(self, tag=None):
    """Return the sibling element following this one in the root document,
    optionally restricted to ``tag``; an empty Dict when detached."""
    root = self._rootNode
    if not root:
        return Dict()
    # offsets from getNext4Ele are already absolute, hence base offset 0
    found = getNext4Ele(root['html'], self, tag)
    return RegexDictNew(found, root=root, s=0)
def getParent(self, tag=None):
    """Return the element enclosing this one in the root document, optionally
    restricted to ``tag``; an empty Dict when detached."""
    root = self._rootNode
    if not root:
        return Dict()
    found = getParent4Ele(root['html'], self, tag)
    return RegexDictNew(found, root=root)
def getElementByAttr(self, attr, value, start=None, end=None, before=None):
    """Find the element whose attribute ``attr`` equals ``value`` inside this
    node's html and wrap it as a child node."""
    html = self.html
    if not html:
        return Dict()
    # delegate to the module-level search helper
    found = getElementByAttr(attr, value, html, start, end, before)
    return RegexDictNew(found, root=self._rootNode, parent=self)
def convert2Dic(html):
    """Parse the first tag of ``html`` into a Dict of its attributes plus a
    'tag' key.

    Primary path: normalise quoting character-by-character, then split on
    '"' to pair attribute names with values.  On any failure it falls back
    to ElementTree parsing (closing the tag if needed), and finally None.
    """
    try:
        # isolate the content of the first tag, e.g. "img src=... alt=..."
        start = html.find('<')
        end = html.find('>')
        html = html[start + 1:end].strip('/').strip()
        # collapse whitespace, unify quotes, and tidy '=  "' runs
        html = re.sub('(\\s| )+', ' ', html, 0)
        html = re.sub('(\')+', '"', html, 0)
        html = re.sub('(=\s*")+', '="', html, 0)
        lstC = []  #list(html)
        N = len(html)
        i = 0
        first = False   # True while inside a quoted value
        flag = False    # True while inside an unquoted value being quoted
        while i < N:
            if html[i] == '"':
                lstC.append(html[i])
                first = not first
            elif not first and html[i] == '=' and html[i + 1] != '"':
                # unquoted value starts: open a synthetic quote
                lstC.append(html[i])
                lstC.append('"')
                flag = True
            elif not first and flag and html[i] == ' ':
                # unquoted value ends at the space: close the synthetic quote
                flag = False
                lstC.append('"')
                lstC.append(html[i])
            else:
                lstC.append(html[i])
            i += 1
        html = ''.join(lstC)
        # now every value is quoted, so '"' alternates name= / value segments
        paras = html.split('"')
        dic = Dict()
        lastP = None    # attribute name awaiting its value
        first = True
        for para in paras:
            if (first):
                # first segment holds the tag name (and maybe the first attr)
                first = False
                tmp = para.split()
                dic['tag'] = tmp[0]
                if (len(tmp) > 1):
                    lastP = tmp[1].strip().strip('=').strip()
                continue
            if (lastP):
                # this segment is the value for the pending attribute name
                if (not dic[lastP]):
                    dic[lastP] = para
                else:
                    dic[lastP] += ' '
                    dic[lastP] += para
                lastP = None
            elif para:
                if (para.find('=') > 0):
                    lastP = para.strip().strip('=').strip()
                else:
                    # bare attribute without a value
                    dic[para] = ''
        return dic
    except Exception as err:
        printInfo(err)
    try:
        # fallback: make the fragment well-formed XML and let ET parse it
        tag = ''
        if (html.find('</') < 0 and html.find('/>') < 0):
            start = html.find('<')
            end = html.find(' ', start + 1)
            tag = '</' + html[start + 1:end] + '>'
        tree = ET.XML(html + tag)
        return XmlDictConfig(tree)
    except Exception as err:
        printInfo(err)
    return None
import json, re, importlib, os, sys from simplified_scrapy.core.request_helper import extractHtml from simplified_scrapy.core.utils import printInfo from simplified_scrapy.core.dictex import Dict ExtractModel = Dict({ 'auto_all': { "Type": 2, "UrlDomains": "all_domain" }, 'auto_lst_obj': { "Type": 5, "MergeUrl": False }, 'auto_lst_url': { "Type": 4, "MergeUrl": False }, 'auto_main_2': { "Type": 2 }, 'auto_main': { "Type": 2, "UrlDomains": "main_domain" }, 'auto_obj': { "Type": 3 } }) class Extracter: # _models={