示例#1
0
def listImg(html, baseUrl=None, start=None, end=None, before=None):
    """Collect the ``<img>`` tags found in a section of ``html``.

    The section to scan is chosen by ``getSection(html, start, end, before)``.
    Images are de-duplicated by URL; when the same URL occurs more than once
    the longest ``alt`` text seen is kept.

    Args:
        html: Markup to scan.
        baseUrl: When given, URLs that do not start with ``http://`` or
            ``https://`` are resolved with ``absoluteUrl(baseUrl, url)``.
        start, end, before: Passed through to ``getSection``.

    Returns:
        A list of ``Dict`` objects with the keys ``url`` and ``alt``, plus
        ``relativeUrl`` for entries that were resolved against ``baseUrl``.
        An empty list is returned whenever nothing can be extracted.
    """
    if (not html or html.find("<img") < 0): return []
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]

    # Always hand back a list: the first guard above and the sibling listA
    # return [], so callers can iterate the result without a None check.
    if (s < 0 or e < s): return []
    html = html[s:e]
    if (not html or html.find("<img") < 0): return []

    # Backslashes are doubled so the literals contain no invalid escape
    # sequence; the resulting pattern text is unchanged.
    patternLst = _getRegex(u'<img[\\s]+[^>]*>')
    patternUrl = _getRegex(u'src[\\s]*=[\\s\'"]*(?P<url>.*?)[\'"\\s>]')
    patternTitle = _getRegex(u'alt[\\s]*=[\\s\'"]*(?P<title>.*?)[\'"\\s>]')

    dic = Dict()
    for imgTag in patternLst.findall(html):
        url = None
        title = None
        tmp = patternUrl.search(imgTag)
        if tmp:
            url = tmp.group("url")

        tmp = patternTitle.search(imgTag)
        if tmp:
            title = tmp.group("title")
        try:
            if not url:
                continue
            # NOTE(review): lower-casing the whole URL also alters the path
            # and query, which may be case-sensitive -- kept as-is because the
            # de-duplication keys depend on it; confirm this is intended.
            url = url.strip().lower()
            relativeUrl = None
            if (baseUrl and url[:7] != "http://"
                    and url[:8] != "https://"):
                absUrl = absoluteUrl(baseUrl, url)
                # A URL that could not be resolved to something different
                # is skipped.
                if absUrl == url:
                    continue
                key = absUrl
                relativeUrl = url
            else:
                key = url

            d = dic[key]
            if d:
                # Same image seen again: keep the more descriptive alt text.
                if not d.alt or (title and len(d.alt) < len(title)):
                    d['alt'] = title
            else:
                entry = Dict({'url': key, 'alt': title})
                if relativeUrl is not None:
                    entry['relativeUrl'] = relativeUrl
                dic[key] = entry
        except Exception as ex:
            # Best effort: one malformed tag must not abort the whole scan.
            printInfo(ex)

    return list(dic.values())
示例#2
0
def listA(html, baseUrl=None, start=None, end=None, before=None):
    """Collect the ``<a>...</a>`` links found in a section of ``html``.

    The section to scan is chosen by ``getSection(html, start, end, before)``.
    Links are de-duplicated by URL; when the same URL occurs more than once
    the longest title seen is kept. The title comes from the ``title``
    attribute or, failing that, from the link's inner text with nested tags
    removed.

    Args:
        html: Markup to scan.
        baseUrl: When given, URLs that do not start with ``http://`` or
            ``https://`` are resolved with ``absoluteUrl(baseUrl, url)``.
        start, end, before: Passed through to ``getSection``.

    Returns:
        A list of ``Dict`` objects with the keys ``url`` and ``title``, plus
        ``relativeUrl`` for entries that were resolved against ``baseUrl``.
        An empty list is returned whenever nothing can be extracted.
    """
    if (not html): return []
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]
    if (s < 0 or e < s): return []
    html = html[s:e]
    if (not html or html.find("<a") < 0): return []

    # Backslashes are doubled so the literals contain no invalid escape
    # sequence; the resulting pattern text is unchanged.
    patternLst = _getRegex(u'<a[\\s]+[^>]*>[\\s\\S]*?</a>')
    patternUrl = _getRegex(u'href[\\s]*=[\\s\'"]*(?P<url>.*?)[\'"\\s>]')
    patternTitle1 = _getRegex(u'title[\\s]*=[\\s\'"]*(?P<title>.*?)[\'"\\s>]')
    patternTitle2 = _getRegex(u'<a[\\s]+[^>]*>(?P<title>.*?)</a>')
    # Loop-invariant: used to strip nested markup from the link text.
    patternTag = _getRegex('<[^<>]+>')

    dic = Dict()
    for anchor in patternLst.findall(html):
        url = None
        title = None
        tmp = patternUrl.search(anchor)
        if tmp: url = tmp.group("url")

        tmp = patternTitle1.search(anchor)
        if tmp: title = tmp.group("title")
        if (not title):
            # No usable title attribute: fall back to the link text.
            tmp = patternTitle2.search(anchor)
            if tmp:
                title = patternTag.sub('', tmp.group("title"))

        try:
            if not url:
                continue
            # NOTE(review): lower-casing the whole URL also alters the path
            # and query, which may be case-sensitive -- kept as-is because the
            # de-duplication keys depend on it; confirm this is intended.
            url = url.strip().lower()
            relativeUrl = None
            if (baseUrl and url[:7] != "http://" and url[:8] != "https://"):
                absUrl = absoluteUrl(baseUrl, url)
                # A URL that could not be resolved to something different
                # is skipped.
                if absUrl == url:
                    continue
                key = absUrl
                relativeUrl = url
            else:
                # No '/' after the scheme separator: append a trailing slash.
                if url.rfind('/') < 9:
                    url += '/'
                key = url

            d = dic[key]
            if d:
                # Same link seen again: keep the more descriptive title.
                if not d.title or (title and len(d.title) < len(title)):
                    d['title'] = title
            else:
                entry = Dict({'url': key, 'title': title})
                if relativeUrl is not None:
                    entry['relativeUrl'] = relativeUrl
                dic[key] = entry
        except Exception as ex:
            # Best effort: one malformed link must not abort the whole scan.
            printInfo(ex)

    return list(dic.values())
class MemCookieStore:
    """In-memory cookie store keyed by the network location of a URL.

    Cookies are kept as a single ``name=value;name=value`` string per
    network location (``urlparse(url).netloc``).
    """

    # Class attribute: the mapping is shared by every instance.
    _cookies = Dict()

    # Set-Cookie attributes; these describe a cookie and are not cookie
    # pairs themselves, so they are never stored.
    _ATTRIBUTES = frozenset(('expires', 'domain', 'path', 'secure',
                             'httponly', 'max-age', 'samesite'))

    def __init__(self):
        pass

    def getCookie(self, url):
        """Return the cookie string stored for ``url``, or None.

        When nothing is stored for the exact network location, the lookup
        is retried once with the first dot-separated label removed
        (``www.example.com`` -> ``example.com``).
        """
        domain = urlparse(url).netloc
        cookie = self._getCookie(domain)
        if not cookie:
            parent = domain[domain.find('.') + 1:]
            cookie = self._getCookie(parent)
        return cookie

    def _getCookie(self, domain):
        """Return the raw cookie string stored under ``domain``."""
        return self._cookies.get(domain)

    def setCookie(self, url, cookie):
        """Merge ``cookie`` into the cookies stored for ``url``.

        Args:
            url: URL whose network location selects the store entry.
            cookie: Either one cookie string or an iterable of cookie
                strings (for example several Set-Cookie header values).
                Falsy values are ignored.
        """
        if not cookie:
            return
        domain = urlparse(url).netloc
        kvs = {}
        old = self._getCookie(domain)
        if old:
            # Start from what is already stored so new values override it.
            self._parseCookie(old, kvs)

        lines = [cookie] if isinstance(cookie, str) else cookie
        for line in lines:
            self._parseCookie(line, kvs)
        self._cookies[domain] = self._dic2str(kvs)

    def _dic2str(self, dic):
        """Serialise ``dic`` as ``k=v;k=v``; return None when it is empty."""
        pairs = [u'{}={}'.format(k, v) for k, v in dic.items()]
        return u';'.join(pairs) if pairs else None

    def _parseCookie(self, cookie, kvs):
        """Parse one cookie string and add its name/value pairs to ``kvs``.

        Attribute names listed in ``_ATTRIBUTES`` are matched
        case-insensitively and skipped.
        """
        for part in cookie.split(';'):
            part = part.strip()
            # Strip before testing so "a=1; " does not yield an empty name.
            if not part:
                continue
            # Split on the first '=' only: values may contain '=' themselves
            # (e.g. base64 padding).
            name, _, value = part.partition('=')
            name = name.strip()
            if name.lower() in self._ATTRIBUTES:
                continue
            kvs[name] = value.strip()
示例#4
0
 def getElementByReg(self, regex, tag=None, start=None, end=None, before=None):
     """Find a descendant element by regular expression.

     Delegates to the module-level ``getElementByReg`` on this node's
     ``html`` and wraps the result with ``RegexDictNew`` so it is linked to
     this node as parent. Returns an empty ``Dict`` when this node has no
     html.
     """
     source = self['html']
     if not source:
         return Dict()
     found = getElementByReg(regex, tag, source, start, end, before)
     return RegexDictNew(found, root=self._rootNode, parent=self)
示例#5
0
 def getElementByText(self, text, tag=None, start=None, end=None, before=None):
     """Find a descendant element by its text.

     Delegates to the module-level ``getElementByText`` on this node's
     ``html`` and wraps the result with ``RegexDictNew`` so it is linked to
     this node as parent. Returns an empty ``Dict`` when this node has no
     html.
     """
     source = self.html
     if not source:
         return Dict()
     found = getElementByText(text, tag, source, start, end, before)
     return RegexDictNew(found, root=self._rootNode, parent=self)
示例#6
0
 def getElement(self, tag, attr='class', value=None, start=None, end=None, before=None):
     """Find a descendant element by tag and attribute value.

     Delegates to the module-level ``getElement`` on this node's ``html``
     and wraps the result with ``RegexDictNew`` so it is linked to this node
     as parent. Returns an empty ``Dict`` when this node has no html.
     """
     source = self['html']
     if not source:
         return Dict()
     found = getElement(tag, attr, value, source, start, end, before)
     return RegexDictNew(found, root=self._rootNode, parent=self)
示例#7
0
def RegexDictNew(dic, root, parent=None, s=None):
    """Wrap ``dic`` in a ``RegexDict`` and shift its offsets.

    Args:
        dic: Parsed element carrying ``_start`` and ``_end`` offsets. A falsy
            value yields an empty ``Dict``.
        root: Root node; stored on the result as ``_rootNode``.
        parent: Optional parent node; stored on the result as ``_parentNode``.
        s: Explicit offset to add to ``dic._start`` / ``dic._end``. When it is
            None and ``parent`` is given, the offset is the index just past
            the first ``'>'`` found in ``root.html`` at or after
            ``parent._start``; otherwise it is 0.

    Returns:
        The new ``RegexDict``, or an empty ``Dict`` when ``dic`` is falsy.
    """
    if not dic: return Dict()
    ele = RegexDict(dic)

    # Identity test: an explicit offset of 0 must still count as "given".
    if s is not None:
        offset = s
    elif parent:
        offset = root.html.find('>', parent._start) + 1
    else:
        offset = 0

    ele._start = dic._start + offset
    ele._end = dic._end + offset
    ele._rootNode = root
    ele._parentNode = parent
    return ele
示例#8
0
    def __init__(self, parent_element):
        """Populate this mapping from ``parent_element``.

        The element's own items are copied first. Every child is then
        converted with ``self.getDic`` and appended to a list stored under
        the child's tag; children that have children of their own are
        expanded further with ``self.ele2arr``. An element without children
        records its own tag under the key ``'tag'``.
        """
        attributes = parent_element.items()
        if attributes:
            self.update(Dict(attributes))

        hasChildren = False
        for child in parent_element:
            hasChildren = True
            name = child.tag
            if not self.get(name):
                self.update({name: []})

            childDic = self.getDic(child)
            self[name].append(childDic)
            if len(child) > 0:
                self.ele2arr(childDic, child)

        if not hasChildren:
            self.update({'tag': parent_element.tag})
示例#9
0
def _getParentStart(html, end, tag):
    """Search backwards from ``end`` for the opening tag of an enclosing element.

    Args:
        html: Markup to search.
        end: Index in ``html``; only text before this index is examined.
        tag: Tag name to look for. When falsy, the tag name is discovered
            from the markup preceding ``end``.

    Returns:
        A tuple ``(index, tag)`` where ``index`` is the position of the
        ``'<' + tag`` text that was found, or None when no candidate exists.
    """
    start = end
    s = end
    # Counts closing tags met while walking backwards, keyed by tag name.
    # Lookups of absent keys are used as a falsy test below -- assumes Dict
    # returns a falsy value instead of raising for a missing key.
    dicTag = Dict()
    if not tag:
        while True:
            if s < 0: return None
            # Locate the nearest complete '<...>' before position s.
            l = html.rfind('<', 0, s)
            r = html.rfind('>', 0, s)
            if (r < l or l < 0 or r < 0): return None
            tagHtml = html[l:r + 1]
            if tagHtml[1] == '/':
                # Closing tag: remember it and keep walking backwards.
                s = l
                tmpTag = tagHtml[2:-1]
                if not dicTag[tmpTag]:
                    dicTag[tmpTag] = 1
                else:
                    dicTag[tmpTag] = dicTag[tmpTag] + 1
                continue
            # Note: this rebinds the ``tag`` parameter for later iterations.
            tag = _getTag(tagHtml, r - l + 1, None, 0)
            if len(dicTag) > 0:
                # Closing tags are still pending: this opening tag cancels one
                # of them if the names match, and is skipped either way.
                if dicTag[tag]:
                    dicTag[tag] = dicTag[tag] - 1
                    if not dicTag[tag]:
                        del dicTag[tag]
                s = l
                continue
            if not tag: return None
            if _checkSingle(tagHtml, 0):
                # Presumably a self-contained tag that cannot be a parent.
                # NOTE(review): ``s`` is not moved here, so the next pass
                # re-reads the same tag -- confirm this loop terminates.
                start = html.rfind('<' + tag, 0, start)
                continue
            # Compare the last opening and closing occurrence before ``start``.
            l = html.rfind('<' + tag, 0, start)
            r = html.rfind('</' + tag, 0, start)
            if l > r: return (l, tag)
            if r > l:
                # The nearest one is already closed: continue before it.
                start = l
                s = l
            else:
                # l == r can only mean both searches failed (-1).
                return None
    else:
        while True:
            if start < 0: return None
            # NOTE(review): '<' + tag is a plain prefix search, so it looks
            # like '<a' would also match '<abbr' -- confirm with callers.
            l = html.rfind('<' + tag, 0, start)
            r = html.rfind('</' + tag, 0, start)
            # An opening tag after the last closing tag is the one we want.
            if l > r: return (l, tag)
            if r > l: start = l
            else: return None
示例#10
0
    def extract(self, url, html, ssp):
        """Resolve the model names for ``url`` and run ``ssp.extract``.

        Model names come from ``url['model']`` or, when that is missing or
        empty, from ``ssp.models``. Each name is looked up in
        ``ExtractModel``; unknown names are reported through ``printInfo``
        and left out.

        Returns:
            Whatever ``ssp.extract(Dict(url), html, models, mds)`` returns.
        """
        mds = url.get("model") or ssp.models
        models = []
        for modelName in (mds or ()):
            m = ExtractModel.get(modelName)
            if not m:
                printInfo('no model ' + modelName)
                continue
            models.append(m)

        return ssp.extract(Dict(url), html, models, mds)


# print (ExtractModel.auto_all)
示例#11
0
def RegexDictNew(dic, root, parent=None, s=None):
    """Wrap ``dic`` in a ``RegexDict``, shift its offsets and register it.

    Args:
        dic: Parsed element carrying ``_start`` and ``_end`` offsets. A falsy
            value yields an empty ``Dict``.
        root: Root node. When falsy, ``parent`` is used as the root and the
            result gets no parent.
        parent: Optional parent node.
        s: Explicit offset to add to ``dic._start`` / ``dic._end``. When it is
            None and a parent remains, the offset is the index just past the
            first ``'>'`` found in ``root['html']`` at or after
            ``parent._start``; otherwise it is 0.

    Returns:
        The new ``RegexDict``, or an empty ``Dict`` when ``dic`` is falsy.
    """
    if not dic: return Dict()
    ele = RegexDict(dic)

    if not root:
        # No separate root supplied: the parent takes the root's place.
        root = parent
        parent = None

    # Identity test: an explicit offset of 0 must still count as "given".
    if s is not None:
        offset = s
    elif parent:
        offset = root['html'].find('>', parent._start) + 1
    else:
        offset = 0

    ele._start = dic._start + offset
    ele._end = dic._end + offset
    ele._rootNode = root
    ele._parentNode = parent
    if root and root._editFlag:
        # The root keeps an index of created elements by start offset while
        # its edit flag is set.
        root._elements[ele._start] = ele
    return ele
示例#12
0
 def getElementByClass(self, className, start=None, end=None, before=None):
     """Find a descendant element by class name.

     Delegates to the module-level ``getElementByClass`` on this node's
     ``html`` and wraps the result with ``RegexDictNew`` so it is linked to
     this node as parent. Returns an empty ``Dict`` when this node has no
     html.
     """
     source = self['html']
     if not source:
         return Dict()
     found = getElementByClass(className, source, start, end, before)
     return RegexDictNew(found, root=self._rootNode, parent=self)
示例#13
0
 def getNext(self, tag=None):
     """Return the element that ``getNext4Ele`` finds after this one.

     The search runs over the root node's html and the result is wrapped
     with ``RegexDictNew`` using an offset of 0. Returns an empty ``Dict``
     when this node has no root node.
     """
     root = self._rootNode
     if not root:
         return Dict()
     following = getNext4Ele(root['html'], self, tag)
     return RegexDictNew(following, root=root, s=0)
示例#14
0
 def getParent(self, tag=None):
     """Return the element that ``getParent4Ele`` finds for this one.

     The search runs over the root node's html and the result is wrapped
     with ``RegexDictNew``. Returns an empty ``Dict`` when this node has no
     root node.
     """
     root = self._rootNode
     if not root:
         return Dict()
     enclosing = getParent4Ele(root['html'], self, tag)
     return RegexDictNew(enclosing, root=root)
示例#15
0
 def getElementByAttr(self, attr, value, start=None, end=None, before=None):
     """Find a descendant element by attribute value.

     Delegates to the module-level ``getElementByAttr`` on this node's
     ``html`` and wraps the result with ``RegexDictNew`` so it is linked to
     this node as parent. Returns an empty ``Dict`` when this node has no
     html.
     """
     source = self.html
     if not source:
         return Dict()
     found = getElementByAttr(attr, value, source, start, end, before)
     return RegexDictNew(found, root=self._rootNode, parent=self)
示例#16
0
def convert2Dic(html):
    """Parse the first tag in ``html`` into a ``Dict`` of its attributes.

    The text between the first ``'<'`` and the first ``'>'`` is normalised
    (whitespace runs and ``&nbsp;`` collapsed, single quotes turned into
    double quotes, unquoted values quoted) and then split on ``'"'``.

    Args:
        html: Markup whose first tag should be parsed.

    Returns:
        A ``Dict`` holding the tag name under ``'tag'`` and one entry per
        attribute (valueless attributes map to ``''``). If that parsing
        raises, the original markup is parsed with ``ET.XML`` and returned
        as an ``XmlDictConfig``. None when both attempts fail.
    """
    # Keep the untouched input: ``html`` is overwritten below, and the XML
    # fallback needs the markup including its angle brackets.
    rawHtml = html
    try:
        start = html.find('<')
        end = html.find('>')
        html = html[start + 1:end].strip('/').strip()
        html = re.sub('(\\s|&nbsp;)+', ' ', html)
        html = re.sub('(\')+', '"', html)
        html = re.sub('(=\\s*")+', '="', html)

        # Re-emit the text, wrapping unquoted attribute values in quotes.
        lstC = []
        inQuote = False  # currently between a pair of double quotes
        openValue = False  # an opening quote was inserted and awaits closing
        for i, ch in enumerate(html):
            if ch == '"':
                lstC.append(ch)
                inQuote = not inQuote
            # A trailing '=' makes html[i + 1] raise IndexError, which sends
            # the input to the fallback below.
            elif not inQuote and ch == '=' and html[i + 1] != '"':
                lstC.append(ch)
                lstC.append('"')
                openValue = True
            elif not inQuote and openValue and ch == ' ':
                openValue = False
                lstC.append('"')
                lstC.append(ch)
            else:
                lstC.append(ch)

        paras = ''.join(lstC).split('"')
        dic = Dict()

        # The first chunk holds the tag name and possibly the first
        # attribute name.
        head = paras[0].split()
        dic['tag'] = head[0]
        lastP = None
        if len(head) > 1:
            lastP = head[1].strip().strip('=').strip()

        # Remaining chunks alternate between attribute values and the
        # attribute names that precede the next value.
        for para in paras[1:]:
            if lastP:
                if not dic[lastP]:
                    dic[lastP] = para
                else:
                    dic[lastP] += ' '
                    dic[lastP] += para
                lastP = None
            elif para:
                if para.find('=') > 0:
                    lastP = para.strip().strip('=').strip()
                else:
                    dic[para] = ''
        return dic
    except Exception as err:
        printInfo(err)
        try:
            tag = ''
            if (rawHtml.find('</') < 0 and rawHtml.find('/>') < 0):
                # The element is not closed: append a closing tag so the XML
                # parser sees a complete element.
                start = rawHtml.find('<')
                end = rawHtml.find(' ', start + 1)
                tag = '</' + rawHtml[start + 1:end] + '>'
            tree = ET.XML(rawHtml + tag)
            return XmlDictConfig(tree)
        except Exception as xmlErr:
            printInfo(xmlErr)
    return None
示例#17
0
import json, re, importlib, os, sys
from simplified_scrapy.core.request_helper import extractHtml
from simplified_scrapy.core.utils import printInfo
from simplified_scrapy.core.dictex import Dict
# Named extraction model presets. Each entry maps a model name to its
# settings: a numeric "Type" plus optional "UrlDomains" / "MergeUrl" options.
# NOTE(review): the meaning of the individual "Type" codes is not visible
# here -- see the code that consumes these settings.
ExtractModel = Dict({
    'auto_all': {
        "Type": 2,
        "UrlDomains": "all_domain"
    },
    'auto_lst_obj': {
        "Type": 5,
        "MergeUrl": False
    },
    'auto_lst_url': {
        "Type": 4,
        "MergeUrl": False
    },
    'auto_main_2': {
        "Type": 2
    },
    'auto_main': {
        "Type": 2,
        "UrlDomains": "main_domain"
    },
    'auto_obj': {
        "Type": 3
    }
})


class Extracter:
    # _models={