def __init__(self, name, setting=None):
    """Initialize the statistics store.

    Remembers the configured port (if any), makes sure the ``db/``
    directory exists, formats the database path with *name*, and creates
    the request_tm table (named by ``self._tbName``) if it is missing.

    Args:
        name: logical store name, substituted into ``self._dbPath``.
        setting: optional dict of options; only ``'port'`` is read here.

    Fix: the default for ``setting`` was a shared mutable dict (``{}``);
    it is now ``None`` with an in-body fallback — same observable
    behavior, no mutable-default pitfall.
    """
    if setting is None:
        setting = {}
    if setting:
        if setting.get('port'):
            self._port = setting.get('port')
    if (not os.path.exists('db/')):
        os.mkdir('db/')
    self._name = name
    # _dbPath is a template attribute (defined on the class) until formatted
    self._dbPath = self._dbPath.format(name)
    conn = None
    try:
        conn = sqlite3.connect(self._dbPath)
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS {} (id INTEGER PRIMARY KEY AUTOINCREMENT, name VARCHAR(50) NOT NULL, method VARCHAR(10) NOT NULL, url TEXT NOT NULL, tmSpan REAL NOT NULL, concurrency INT NOT NULL DEFAULT 0, countPer10s INT NOT NULL DEFAULT 0, state INT NOT NULL DEFAULT 0, size INT NOT NULL DEFAULT 0, tm CHAR(20));'''.format(self._tbName))
        conn.commit()
    except Exception as err:
        printInfo('error', err)
    finally:
        # close unconditionally (the original closed after the except block,
        # which it also always reached; finally makes that explicit)
        if (conn):
            conn.close()
def downloadThread2(self, *args):
    # Worker entry point: download one url dict (args[0]) on behalf of a
    # spider instance (args[1]), hand the html to the spider, queue any
    # extracted data, and record per-request statistics.
    url = args[0]
    ssp = args[1]
    try:
        # NOTE(review): the concurrency counter is DECREMENTED on entry —
        # presumably the dispatcher incremented it before starting this
        # thread; confirm against the caller.
        self._concurrency -= 1
        printInfo(url['url'])
        self._downloadPageNum += 1
        startTm = time.time()
        url['_startTm'] = startTm
        html = execDownload(url, ssp)
        url['_endTm'] = time.time()
        tmSpan = url['_endTm'] - startTm
        state = 1
        if (html == "_error_"):
            # sentinel string signalling a failed download
            state = 0
            ssp.downloadError(url)
        else:
            data = ssp.saveHtml(url, html)
            # a dict/list result from saveHtml is handed to the extract stage
            if data and (isinstance(data, dict) or isinstance(data, list)):
                self._extractQueue.put((ssp, data))
        if self.statistics:
            # 10-second buckets alternate between index 0 and 1; report the
            # count from the bucket that just finished (1 - curIndex)
            curIndex = int(time.time() / 10) % 2
            self.statistics.addRecode(
                ssp, url, tmSpan, state, self._concurrency,
                self._downloadPagePer10s[1 - curIndex],
                len(html) if html and state else 0)
    except Exception as err:
        self.log(err, logging.ERROR)
def __init__(self, name):
    # File-backed URL store. Three append-mode files persist state across
    # runs: the url queue (one JSON object per line), the set of seen url
    # digests, and the index of the last consumed queue position.
    try:
        self._urlFilename=self._urlFilename.format(name)
        self._dicFilename=self._dicFilename.format(name)
        self._indexFilename=self._indexFilename.format(name)
        if(not os.path.exists('db/')):
            os.mkdir('db/')
        # "a+" keeps existing content and allows appends; seek(0) rewinds
        # each file so the previous run's data can be read back
        self._urlfile = io.open(self._urlFilename, "a+",encoding="utf-8")
        self._dicfile = io.open(self._dicFilename, "a+",encoding="utf-8")
        self._indexfile = io.open(self._indexFilename, "a+",encoding="utf-8")
        self._urlfile.seek(0)
        self._dicfile.seek(0)
        self._indexfile.seek(0)
        index = self._indexfile.read()
        if(index):
            # resume position persisted by a previous run
            self._i = int(index)
        line = 'start'
        while(line):
            line = self._dicfile.readline()
            if(line):
                # drop the trailing newline before recording the digest
                self._dic.add(line[:-1])
        i = 0
        line = 'start'
        while(line):
            line = self._urlfile.readline()
            # skip entries consumed in a previous run (positions < self._i)
            if(i<self._i):
                i=i+1
                continue
            if(line):
                self._urls.append(json.loads(line[:-1]))
    except Exception as err:
        printInfo(err)
def log(self, msg, level=logging.DEBUG):
    """Print *msg*, and additionally forward it to the root logger when
    *level* is ERROR.

    UnicodeEncodeError instances are printed with a marker and never
    forwarded to the logger.
    """
    if isinstance(msg, UnicodeEncodeError):
        printInfo('UnicodeEncodeError', msg)
        return
    printInfo(msg)
    if level != logging.ERROR:
        return
    logging.LoggerAdapter(logging.getLogger(), None).log(level, msg)
def clearRecode(self):
    """Delete every row from the request_tm table in ``self._dbPath``.

    Errors are reported via printInfo rather than raised.

    Fix: ``conn.close()`` moved into a ``finally`` block so a failure
    inside the except handler can no longer leak the connection.
    """
    conn = sqlite3.connect(self._dbPath)
    try:
        cursor = conn.cursor()
        cursor.execute("delete from request_tm")
        conn.commit()
    except Exception as err:
        printInfo('error', err)
    finally:
        conn.close()
def listImg(html, baseUrl=None, start=None, end=None, before=None):
    """Collect <img> tags from *html*.

    Args:
        html: page source to scan.
        baseUrl: when given, relative ``src`` urls are resolved against it.
        start/end/before: markers passed to getSection() to restrict the
            scanned region of the page.

    Returns:
        A list of Dict entries with ``url``, ``alt`` (and ``relativeUrl``
        when a relative src was resolved), de-duplicated by url, keeping
        the longest alt text seen for each.

    Fix: the mid-function failure paths previously returned None while the
    first guard (and the sibling listA()) return [] — every path now
    returns a list, so callers can iterate unconditionally.
    """
    if (not html or html.find("<img") < 0):
        return []
    # limit scanning to the region selected by start/end/before
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]
    if (s < 0 or e < s):
        return []
    html = html[s:e]
    if (not html or html.find("<img") < 0):
        return []
    patternLst = _getRegex(u'<img[\s]+[^>]*>')
    patternUrl = _getRegex(u'src[\s]*=[\s\'"]*(?P<url>.*?)[\'"\s>]')
    patternTitle = _getRegex(u'alt[\s]*=[\s\'"]*(?P<title>.*?)[\'"\s>]')
    lstStr = patternLst.findall(html)
    dic = Dict()
    for i in lstStr:
        url = None
        title = None
        tmp = patternUrl.search(i)
        if tmp:
            url = tmp.group("url")
        tmp = patternTitle.search(i)
        if tmp:
            title = tmp.group("title")
        try:
            if (url):
                url = url.strip().lower()
                if (baseUrl and url[:7] != "http://" and url[:8] != "https://"):
                    # relative src: resolve against the page url
                    absUrl = absoluteUrl(baseUrl, url)
                    if (absUrl != url):
                        d = dic[absUrl]
                        if d:
                            # keep the longer alt text for duplicates
                            if not d.alt or (title and len(d.alt) < len(title)):
                                d['alt'] = title
                        else:
                            dic[absUrl] = Dict({
                                'url': absUrl,
                                'alt': title,
                                'relativeUrl': url
                            })
                else:
                    d = dic[url]
                    if d:
                        if not d.alt or (title and len(d.alt) < len(title)):
                            d['alt'] = title
                    else:
                        dic[url] = Dict({'url': url, 'alt': title})
        except Exception as ex:
            printInfo(ex)
    return list(dic.values())
def listA(html,baseUrl=None,start=None,end=None,before=None):
    # Extract <a> anchors from html, returning de-duplicated Dict entries
    # with 'url', 'title' (and 'relativeUrl' for resolved relative links).
    if(not html):
        return []
    # limit scanning to the region selected by start/end/before
    section = getSection(html,start,end,before)
    s = section[0]
    e = section[1]
    if(s < 0 or e < s):
        return []
    html = html[s:e]
    if(not html or html.find("<a")<0):
        return []
    patternLst = _getRegex(u'<a[\s]+[^>]*>[\s\S]*?</a>')
    patternUrl = _getRegex(u'href[\s]*=[\s\'"]*(?P<url>.*?)[\'"\s>]')
    patternTitle1 = _getRegex(u'title[\s]*=[\s\'"]*(?P<title>.*?)[\'"\s>]')
    patternTitle2 = _getRegex(u'<a[\s]+[^>]*>(?P<title>.*?)</a>')
    strA = patternLst.findall(html)
    dic = Dict()
    for i in strA:
        url = None
        title = None
        tmp = patternUrl.search(i)
        if tmp:
            url = tmp.group("url")
        # prefer the title= attribute; fall back to the anchor's inner text
        tmp = patternTitle1.search(i)
        if tmp:
            title = tmp.group("title")
        if(not title):
            tmp = patternTitle2.search(i)
            if tmp:
                title = tmp.group("title")
                # strip any nested tags from the inner text
                title = _getRegex('<[^<>]+>').sub('',title)
        try:
            if(url):
                url = url.strip().lower()
                if(baseUrl and url[:7]!="http://" and url[:8]!="https://"):
                    # relative link: resolve against baseUrl; on duplicates
                    # keep the longer title
                    absUrl = absoluteUrl(baseUrl,url)
                    if(absUrl!=url):
                        d = dic[absUrl]
                        if d:
                            if not d.title or (title and len(d.title)<len(title)):
                                d['title'] = title
                        else:
                            dic[absUrl] = Dict({'url':absUrl,'title':title,'relativeUrl':url})
                else:
                    # absolute url with no path slash after the scheme:
                    # normalize with a trailing '/'
                    if url.rfind('/')<9:
                        url += '/'
                    d = dic[url]
                    if d:
                        if not d.title or (title and len(d.title)<len(title)):
                            d['title'] = title
                    else:
                        dic[url] = Dict({'url':url,'title':title})
        except Exception as ex:
            printInfo(ex)
    return list(dic.values())
def _insertRT(self, datas):
    """Bulk-insert request-timing rows into request_tm.

    Each element of *datas* is a 9-tuple matching the column list
    (name, method, url, tmSpan, concurrency, countPer10s, state, size, tm).
    Errors are printed, not raised.

    Fix: ``conn.close()`` moved into ``finally`` so a failure inside the
    except handler cannot leak the connection.
    """
    conn = sqlite3.connect(self._dbPath)
    try:
        cursor = conn.cursor()
        cursor.executemany(
            "insert into request_tm(name,method,url,tmSpan,concurrency,countPer10s,state,size,tm) values(?,?,?,?,?,?,?,?,?)",
            tuple(datas))
        conn.commit()
    except Exception as err:
        printInfo('error', err)
    finally:
        conn.close()
def _select(self, sql):
    """Run a read-only query against ``self._dbPath``.

    Returns:
        ``(rows, header)`` — the fetched row tuples and the list of column
        names from the cursor description.  On error a partial/empty
        result is returned and the error is printed.

    Fix: ``conn.close()`` moved into ``finally`` so the connection is
    released even if the except handler itself fails.
    """
    conn = sqlite3.connect(self._dbPath)
    rows = []
    header = []
    try:
        cursor = conn.cursor().execute(sql)
        for i in cursor.description:
            header.append(i[0])
        for row in cursor:
            rows.append(row)
    except Exception as err:
        printInfo('error', err)
    finally:
        conn.close()
    return (rows, header)
def _dealRequestTmThread(self):
    """Background loop: once per second, drain up to ~1000 queued timing
    records and bulk-insert them via ``_insertRT``.

    Fix: ``Queue.empty()`` is only advisory — ``get_nowait()`` can still
    raise ``Empty`` if another consumer drains the queue between the check
    and the get.  The original let that exception escape to the outer
    handler and silently discard the batch collected so far; the batch is
    now always flushed.
    """
    while (self._runflag):
        datas = []
        try:
            i = 0
            while (True):
                i += 1
                if self._requestTmq.empty():
                    break
                try:
                    data = self._requestTmq.get_nowait()
                except Exception:
                    # queue drained concurrently; flush what we have
                    break
                datas.append(data)
                # cap the batch so a busy queue cannot starve the insert
                if i > 1000:
                    break
            if datas:
                self._insertRT(datas)
        except Exception as err:
            printInfo('error', err)
        time.sleep(1)
def extract(self, url, html, ssp):
    """Resolve the model names attached to *url* (falling back to the
    spider's own ``models``) into loaded model objects and delegate the
    actual extraction to the spider."""
    modelNames = url.get("model") or ssp.models
    loaded = []
    if modelNames:
        for name in modelNames:
            model = ExtractModel.get(name)
            if model:
                loaded.append(model)
            else:
                printInfo('no model ' + name)
    return ssp.extract(Dict(url), html, loaded, modelNames)
def _iniModels(self, rootdir):
    """Load every file in *rootdir* into the ExtractModel registry.

    Each regular file is parsed as JSON and stored under its file name
    minus the last five characters (i.e. without a ``.json`` suffix).
    Missing directory or parse errors are printed, not raised.

    Fixes: no longer shadows the builtin ``list``; iterates directory
    entries directly instead of indexing by ``range``; opens files with a
    context manager so handles are closed even when JSON parsing fails.
    """
    try:
        if (not os.path.isdir(rootdir)):
            return
        for name in os.listdir(rootdir):
            path = os.path.join(rootdir, name)
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as f:
                if sys.version_info.major == 2:
                    ExtractModel[name[:-5]] = json.loads(f.read().decode('utf-8'))
                else:
                    ExtractModel[name[:-5]] = json.loads(f.read())
    except Exception as err:
        printInfo(err)
def resetUrls(self, urls):
    """Queue every given url unconditionally (no seen-set check): append it
    to the in-memory list, record its digest, persist it via _writeFile,
    and flush once at the end if anything was written.

    Each entry may be a plain url string or a dict containing 'url'.
    """
    self._lock.acquire()
    try:
        wrote = False
        for entry in urls:
            item = entry if isinstance(entry, dict) else {'url': entry}
            digest = md5(item['url'])
            self._urls.append(item)
            self._dic.add(digest)
            self._writeFile(item, digest)
            wrote = True
        if wrote:
            self._flushFile()
    except Exception as err:
        printInfo(err)
    finally:
        self._lock.release()
def __init__(self):
    """Load persisted cookies from the cookie file into ``self._cookies``
    (mapping the text before the first comma to the text after it), then
    rewrite the file de-duplicated via ``_refreshCookieFile``.

    Fix: a line without a comma previously raised ValueError from
    ``str.index()``, which the outer except caught — aborting the whole
    load.  ``str.find()`` returns -1 instead, so malformed lines are now
    simply skipped and the remaining cookies still load.
    """
    try:
        if (not os.path.exists('db/')):
            os.mkdir('db/')
        # "a+" keeps existing content; seek(0) rewinds so it can be read back
        self._cookiefile = io.open(self._cookieFilename, "a+", encoding="utf-8")
        self._cookiefile.seek(0)
        line = 'start'
        while (line):
            line = self._cookiefile.readline()
            if (line):
                line = line[:-1]  # drop trailing newline
                start = line.find(',')  # key,value separator
                if (start > 0):
                    self._cookies[line[0:start]] = line[start + 1:]
        self._refreshCookieFile()
    except Exception as err:
        printInfo(err)
def saveUrl(self, urls, i=None):
    """Queue urls that have not been seen before.

    Each entry may be a plain url string or a dict containing 'url'.  New
    urls are appended to the in-memory queue, their digest recorded in the
    seen-set, and persisted via ``_writeFile``; the files are flushed once
    at the end if anything was written.  (*i* is accepted for interface
    compatibility and unused here, as in the original.)

    Bug fix: the original called ``self._dic.add(md5)`` and
    ``self._writeFile(url, md5)`` — adding/persisting the ``md5`` function
    object itself instead of the computed digest.  The seen-set therefore
    never contained real digests (so every url was re-queued on later
    calls) and the digest file was corrupted.  The digest is now computed
    once and used consistently, matching ``resetUrls``.
    """
    self._lock.acquire()
    try:
        flag = False
        for url in urls:
            if (not isinstance(url, dict)):
                url = {'url': url}
            digest = md5(url['url'])
            if (digest not in self._dic):
                self._urls.append(url)
                self._dic.add(digest)
                self._writeFile(url, digest)
                flag = True
        if (flag):
            self._flushFile()
    except Exception as err:
        printInfo(err)
    finally:
        self._lock.release()
def popUrl(self):
    """Probe a randomly-chosen shard (collections ``tbName``,
    ``tbName1`` .. ``tbName9``) for one unprocessed url (state 0).

    If a document is found its state is set to 1 (claimed) and the shard's
    remaining-count is decremented; if the shard is empty it is remembered
    in ``self._tbCache`` so it is skipped next time.  Returns the url
    document, or None when the probed shard was empty / all ten shards
    have been attempted this round.

    Bug fix: the exhaustion check compared the bound method ``lst.count``
    to 10 — always False, so a fully-cached shard set made the loop spin
    forever.  It now checks ``len(tried)``.
    """
    db = self._connect()
    tried = []
    while (True):
        if (len(tried) == 10):
            # every shard attempted this round
            return None
        tbName = self._tbName
        i = random.randint(0, 9)
        printInfo('popUrl', i)
        if (i in tried):
            continue
        tried.append(i)
        if (i):
            # shard 0 uses the bare table name; others get a numeric suffix
            tbName = tbName + str(i)
        if (self._tbCache.get(tbName)):
            # shard previously observed empty
            continue
        url = db[tbName].find_one({"state": 0})
        if (url):
            # NOTE(review): collection.update() was removed in pymongo 4;
            # update_one() is the modern equivalent — confirm the pymongo
            # version before changing this call.
            db[tbName].update({"_id": url["_id"]}, {"$set": {"state": 1}})
            if (i in self._totalCount):
                self._totalCount[i] -= 1
        else:
            self._tbCache[tbName] = True
        return url
def __init__(self, name):
    """Prepare on-disk storage for downloaded pages.

    Creates the ``db/`` and ``htmls/`` directories plus the per-name html
    directory, formats the database path, and creates the ``htmls`` table
    if missing.  Errors are printed, not raised.

    Fix: the sqlite connection is now closed in a ``finally`` block, so it
    no longer leaks when CREATE TABLE or commit raises.
    """
    try:
        self._htmlPath = self._htmlPath.format(name)
        if (not os.path.exists('db/')):
            os.mkdir('db/')
        if (not os.path.exists('htmls/')):
            os.mkdir('htmls/')
        if (not os.path.exists(self._htmlPath)):
            os.mkdir(self._htmlPath)
        self._dbPath = self._dbPath.format(name)
        conn = sqlite3.connect(self._dbPath)
        try:
            c = conn.cursor()
            c.execute('''CREATE TABLE IF NOT EXISTS htmls (id INTEGER PRIMARY KEY autoincrement, json TEXT NOT NULL, state INT NOT NULL DEFAULT 0, tm TEXT);''')
            conn.commit()
        finally:
            conn.close()
    except Exception as err:
        printInfo(err)
def __init__(self):
    # Populate the extract-model registry from the models/ directory;
    # any failure is printed rather than raised.
    try:
        self._iniModels('models/')
    except Exception as err:
        printInfo(err)
def convert2Dic(html):
    # Parse the FIRST tag found in html into a Dict of attribute -> value
    # pairs plus a 'tag' key; fall back to ElementTree on failure, and
    # return None if both strategies fail.
    try:
        start = html.find('<')
        end = html.find('>')
        # keep only the tag's interior, dropping a trailing '/'
        html = html[start + 1:end].strip('/').strip()
        # normalize whitespace/&nbsp;, single quotes, and '= "' spacing
        html = re.sub('(\\s| )+', ' ', html, 0)
        html = re.sub('(\')+', '"', html, 0)
        html = re.sub('(=\s*")+', '="', html, 0)
        lstC = []  #list(html)
        N = len(html)
        i = 0
        first = False
        flag = False
        # char-level pass: wrap unquoted attribute values in double quotes
        while i < N:
            if html[i] == '"':
                lstC.append(html[i])
                first = not first  # toggles "inside a quoted value"
            elif not first and html[i] == '=' and html[i + 1] != '"':
                # unquoted value begins: open a quote right after '='
                lstC.append(html[i])
                lstC.append('"')
                flag = True
            elif not first and flag and html[i] == ' ':
                # unquoted value ends at the next space: close the quote
                flag = False
                lstC.append('"')
                lstC.append(html[i])
            else:
                lstC.append(html[i])
            i += 1
        html = ''.join(lstC)
        # after quoting, '"' splits the string into alternating
        # name-chunks and value-chunks
        paras = html.split('"')
        dic = Dict()
        lastP = None
        first = True
        for para in paras:
            if (first):
                # the first chunk carries the tag name (and possibly the
                # first attribute name)
                first = False
                tmp = para.split()
                dic['tag'] = tmp[0]
                if (len(tmp) > 1):
                    lastP = tmp[1].strip().strip('=').strip()
                continue
            if (lastP):
                # value chunk for the pending attribute name; repeated
                # values for the same name are space-joined
                if (not dic[lastP]):
                    dic[lastP] = para
                else:
                    dic[lastP] += ' '
                    dic[lastP] += para
                lastP = None
            elif para:
                if (para.find('=') > 0):
                    lastP = para.strip().strip('=').strip()
                else:
                    # bare attribute with no value
                    dic[para] = ''
        return dic
    except Exception as err:
        printInfo(err)
    # fallback: synthesize a closing tag if none exists and let
    # ElementTree parse the fragment
    try:
        tag = ''
        if (html.find('</') < 0 and html.find('/>') < 0):
            start = html.find('<')
            end = html.find(' ', start + 1)
            tag = '</' + html[start + 1:end] + '>'
        tree = ET.XML(html + tag)
        return XmlDictConfig(tree)
    except Exception as err:
        printInfo(err)
    return None
def __del__(self):
    """Close the three backing files when the store is garbage-collected."""
    for backing in (self._urlfile, self._dicfile, self._indexfile):
        backing.close()
    printInfo('__del__')
def log(err, data):
    """Print *err* and *data*, then mirror both to the root logger at
    ERROR level."""
    printInfo(err, data)
    adapter = logging.LoggerAdapter(logging.getLogger(), None)
    adapter.log(logging.ERROR, err)
    adapter.log(logging.ERROR, data)
def downloadError(self, url, err=None):
    # Report a failed download and mark the url record as failed (state 2)
    # in the url store.
    printInfo('error url:', url, err)
    self.url_store.updateState(url, 2)
def renderUrl(self, url, callback):
    # Stub: subclasses that need JS-rendered pages must override this and
    # invoke callback with the rendered html.
    printInfo('Need to implement method "renderUrl"')
def customDown(self, url):
    # Stub: subclasses that need a custom download strategy must override
    # this method.
    printInfo('Need to implement method "customDown"')
def log(self, msg, level=logging.ERROR):
    """Print *msg* and forward it to the root logger at *level*
    (ERROR by default)."""
    printInfo(msg)
    logging.LoggerAdapter(logging.getLogger(), None).log(level, msg)
def downloadRender(self, url, ssp):
    # Delegate download of a render-required page to the spider's
    # renderUrl, which hands the result back through self.down_callback.
    printInfo(url['url'])
    self._downloadPageNum += 1
    ssp.renderUrl(url, self.down_callback)