def __init__(self, name, setting=None):
     """Prepare the sqlite database file and make sure the stats table exists.

     Args:
         name: value substituted into the ``self._dbPath`` template.
         setting: optional mapping; a truthy ``'port'`` entry overrides
             ``self._port``.
     """
     # The default is None instead of ``{}`` so that no mutable object is
     # shared between calls.
     port = setting.get('port') if setting else None
     if port:
         self._port = port
     if not os.path.exists('db/'):
         os.mkdir('db/')
     self._name = name
     self._dbPath = self._dbPath.format(name)
     conn = None
     try:
         conn = sqlite3.connect(self._dbPath)
         c = conn.cursor()
         # The table name comes from self._tbName; SQL identifiers cannot be
         # bound as parameters, hence the string formatting.
         c.execute('''CREATE TABLE IF NOT EXISTS {}
         (id INTEGER PRIMARY KEY AUTOINCREMENT,
         name VARCHAR(50) NOT NULL,
         method VARCHAR(10) NOT NULL,
         url TEXT NOT NULL,
         tmSpan REAL NOT NULL,
         concurrency INT NOT NULL DEFAULT 0,
         countPer10s INT NOT NULL DEFAULT 0,
         state INT NOT NULL DEFAULT 0,
         size INT NOT NULL DEFAULT 0,
         tm CHAR(20));'''.format(self._tbName))
         conn.commit()
     except Exception as err:
         printInfo('error', err)
     finally:
         # Release the connection on every exit path.
         if conn:
             conn.close()
예제 #2
0
 def downloadThread2(self, *args):
     """Download one url, hand the result on, and record request statistics.

     ``args[0]`` is the url dict (it must contain ``'url'``) and ``args[1]``
     is the object whose ``saveHtml`` / ``downloadError`` hooks are called.
     Any exception raised while processing is passed to ``self.log``.
     """
     url, ssp = args[0], args[1]
     try:
         self._concurrency -= 1
         printInfo(url['url'])
         self._downloadPageNum += 1
         began = time.time()
         url['_startTm'] = began
         html = execDownload(url, ssp)
         url['_endTm'] = time.time()
         elapsed = url['_endTm'] - began
         # execDownload signals a failure with the "_error_" marker string.
         if html == "_error_":
             state = 0
             ssp.downloadError(url)
         else:
             state = 1
             saved = ssp.saveHtml(url, html)
             if saved and isinstance(saved, (dict, list)):
                 self._extractQueue.put((ssp, saved))
         if self.statistics:
             # Two 10-second slots alternate; report the one not currently active.
             slot = int(time.time() / 10) % 2
             self.statistics.addRecode(
                 ssp, url, elapsed, state, self._concurrency,
                 self._downloadPagePer10s[1 - slot],
                 len(html) if html and state else 0)
     except Exception as err:
         self.log(err, logging.ERROR)
예제 #3
0
 def __init__(self, name):
   """Open the url, dictionary and index files and reload their contents.

   The three filename templates on ``self`` are formatted with ``name``.
   Every line of the dictionary file is added to ``self._dic``; url lines
   beyond the first ``self._i`` are JSON-decoded into ``self._urls``.
   Any exception is reported through ``printInfo``.
   """
   try:
     self._urlFilename = self._urlFilename.format(name)
     self._dicFilename = self._dicFilename.format(name)
     self._indexFilename = self._indexFilename.format(name)
     if not os.path.exists('db/'):
       os.mkdir('db/')
     self._urlfile = io.open(self._urlFilename, "a+", encoding="utf-8")
     self._dicfile = io.open(self._dicFilename, "a+", encoding="utf-8")
     self._indexfile = io.open(self._indexFilename, "a+", encoding="utf-8")
     # "a+" may position at the end of the file; rewind before reading.
     for handle in (self._urlfile, self._dicfile, self._indexfile):
       handle.seek(0)
     saved = self._indexfile.read()
     if saved:
       self._i = int(saved)
     # readline() returns '' at end of file, which ends each loop.
     for entry in iter(self._dicfile.readline, ''):
       self._dic.add(entry[:-1])
     skipped = 0
     for entry in iter(self._urlfile.readline, ''):
       if skipped < self._i:
         skipped += 1
         continue
       self._urls.append(json.loads(entry[:-1]))
   except Exception as err:
     printInfo(err)
예제 #4
0
 def log(self, msg, level=logging.DEBUG):
     """Print ``msg`` and, for ERROR level only, send it to the root logger.

     A ``UnicodeEncodeError`` instance is only printed (with a label) and
     is never forwarded to ``logging``.
     """
     if isinstance(msg, UnicodeEncodeError):
         printInfo('UnicodeEncodeError', msg)
         return
     printInfo(msg)
     if level != logging.ERROR:
         return
     adapter = logging.LoggerAdapter(logging.getLogger(), None)
     adapter.log(level, msg)
 def clearRecode(self):
     """Delete every row of the ``request_tm`` table in ``self._dbPath``."""
     db = sqlite3.connect(self._dbPath)
     try:
         db.cursor().execute("delete from request_tm")
         db.commit()
     except Exception as exc:
         printInfo('error', exc)
     db.close()
예제 #6
0
def listImg(html, baseUrl=None, start=None, end=None, before=None):
    """Collect the ``<img>`` tags found in (a section of) ``html``.

    Args:
        html: markup to scan.
        baseUrl: when given, urls not starting with ``http://`` or
            ``https://`` are resolved with ``absoluteUrl(baseUrl, url)``.
        start, end, before: forwarded to ``getSection`` to narrow the
            region that is scanned.

    Returns:
        A list with one Dict per distinct url, holding ``url`` and ``alt``
        (plus ``relativeUrl`` for resolved urls). When a url occurs more
        than once the longer ``alt`` text is kept. The list is empty when
        nothing is found; every exit path returns a list, matching
        ``listA``.
    """
    if (not html or html.find("<img") < 0): return []
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]

    # Previously these two exits returned None while the first returned [].
    if (s < 0 or e < s): return []
    html = html[s:e]
    if (not html or html.find("<img") < 0): return []

    # Backslashes are doubled so the literals contain no invalid escape
    # sequences; the resulting pattern text is unchanged.
    patternLst = _getRegex(u'<img[\\s]+[^>]*>')
    patternUrl = _getRegex(u'src[\\s]*=[\\s\'"]*(?P<url>.*?)[\'"\\s>]')
    patternTitle = _getRegex(u'alt[\\s]*=[\\s\'"]*(?P<title>.*?)[\'"\\s>]')
    dic = Dict()
    for tag in patternLst.findall(html):
        tmp = patternUrl.search(tag)
        url = tmp.group("url") if tmp else None
        tmp = patternTitle.search(tag)
        title = tmp.group("title") if tmp else None
        if not url:
            continue
        try:
            url = url.strip().lower()
            if (baseUrl and url[:7] != "http://"
                    and url[:8] != "https://"):
                absUrl = absoluteUrl(baseUrl, url)
                # A url that could not be resolved is dropped.
                if absUrl == url:
                    continue
                key = absUrl
                entry = Dict({
                    'url': absUrl,
                    'alt': title,
                    'relativeUrl': url
                })
            else:
                key = url
                entry = Dict({'url': url, 'alt': title})
            d = dic[key]
            if d:
                # Same image seen again: keep the more descriptive alt text.
                if not d.alt or (title and len(d.alt) < len(title)):
                    d['alt'] = title
            else:
                dic[key] = entry
        except Exception as ex:
            printInfo(ex)

    return list(dic.values())
예제 #7
0
def listA(html,baseUrl=None,start=None,end=None,before=None):
  """Collect the ``<a>...</a>`` links found in (a section of) ``html``.

  Args:
    html: markup to scan.
    baseUrl: when given, urls not starting with ``http://`` or
      ``https://`` are resolved with ``absoluteUrl(baseUrl, url)``.
    start, end, before: forwarded to ``getSection`` to narrow the region
      that is scanned.

  Returns:
    A list with one Dict per distinct url, holding ``url`` and ``title``
    (plus ``relativeUrl`` for resolved urls). The title comes from the
    ``title`` attribute, or else from the link text with tags removed.
    When a url occurs more than once the longer title is kept.
  """
  if(not html): return []
  section = getSection(html,start,end,before)
  s = section[0]
  e = section[1]
  if(s < 0 or e < s): return []
  html = html[s:e]
  if(not html or html.find("<a")<0): return []

  # Backslashes are doubled so the literals contain no invalid escape
  # sequences; the resulting pattern text is unchanged.
  patternLst = _getRegex(u'<a[\\s]+[^>]*>[\\s\\S]*?</a>')
  patternUrl = _getRegex(u'href[\\s]*=[\\s\'"]*(?P<url>.*?)[\'"\\s>]')
  patternTitle1 = _getRegex(u'title[\\s]*=[\\s\'"]*(?P<title>.*?)[\'"\\s>]')
  patternTitle2 = _getRegex(u'<a[\\s]+[^>]*>(?P<title>.*?)</a>')
  # Fetched once here instead of on every loop iteration.
  patternTag = _getRegex('<[^<>]+>')

  dic = Dict()
  for anchor in patternLst.findall(html):
    tmp = patternUrl.search(anchor)
    url = tmp.group("url") if tmp else None

    tmp = patternTitle1.search(anchor)
    title = tmp.group("title") if tmp else None
    if(not title):
      # No title attribute: use the link text with nested tags removed.
      tmp = patternTitle2.search(anchor)
      if tmp:
        title = patternTag.sub('',tmp.group("title"))

    if(not url):
      continue
    try:
      url = url.strip().lower()
      if(baseUrl and url[:7]!="http://" and url[:8]!="https://"):
        absUrl = absoluteUrl(baseUrl,url)
        # A url that could not be resolved is dropped.
        if(absUrl==url):
          continue
        key = absUrl
        entry = Dict({'url':absUrl,'title':title,'relativeUrl':url})
      else:
        # No '/' beyond the first 9 characters: append a trailing '/'.
        if url.rfind('/')<9:
          url += '/'
        key = url
        entry = Dict({'url':url,'title':title})
      d = dic[key]
      if d:
        # Same link seen again: keep the more descriptive title.
        if not d.title or (title and len(d.title)<len(title)):
          d['title'] = title
      else:
        dic[key] = entry
    except Exception as ex:
      printInfo(ex)

  return list(dic.values())
 def _insertRT(self, datas):
     """Bulk-insert request records into the ``request_tm`` table.

     ``datas`` is an iterable of 9-item rows in the column order
     name, method, url, tmSpan, concurrency, countPer10s, state, size, tm.
     Errors are reported through ``printInfo``.
     """
     sql = ("insert into request_tm(name,method,url,tmSpan,concurrency,"
            "countPer10s,state,size,tm) values(?,?,?,?,?,?,?,?,?)")
     db = sqlite3.connect(self._dbPath)
     try:
         db.cursor().executemany(sql, tuple(datas))
         db.commit()
     except Exception as exc:
         printInfo('error', exc)
     db.close()
 def _select(self, sql):
     """Run ``sql`` against ``self._dbPath`` and return ``(rows, header)``.

     ``header`` lists the column names taken from the cursor description
     and ``rows`` holds the fetched tuples. On error the failure is
     printed and whatever was collected so far is returned.
     """
     db = sqlite3.connect(self._dbPath)
     rows, header = [], []
     try:
         result = db.cursor().execute(sql)
         header.extend(column[0] for column in result.description)
         for record in result:
             rows.append(record)
     except Exception as exc:
         printInfo('error', exc)
     db.close()
     return (rows, header)
예제 #10
0
 def _dealRequestTmThread(self):
     """Drain ``self._requestTmq`` once per second while ``_runflag`` is set.

     Each pass takes at most 1001 queued records and hands them to
     ``self._insertRT`` in one call. Errors are printed and the loop
     carries on.
     """
     while self._runflag:
         try:
             batch = []
             for _ in range(1001):
                 if self._requestTmq.empty():
                     break
                 batch.append(self._requestTmq.get_nowait())
             if batch:
                 self._insertRT(batch)
         except Exception as exc:
             printInfo('error', exc)
         time.sleep(1)
예제 #11
0
    def extract(self, url, html, ssp):
        """Resolve the model names for ``url`` and delegate to ``ssp.extract``.

        Model names come from ``url["model"]``, or from ``ssp.models`` when
        that is missing or empty. Names unknown to ``ExtractModel`` are
        reported and skipped. Returns whatever ``ssp.extract`` returns.
        """
        mds = url.get("model") or ssp.models
        models = []
        for modelName in (mds or ()):
            found = ExtractModel.get(modelName)
            if not found:
                printInfo('no model ' + modelName)
                continue
            models.append(found)

        return ssp.extract(Dict(url), html, models, mds)


# print (ExtractModel.auto_all)
예제 #12
0
 def _iniModels(self, rootdir):
     """Load every regular file in ``rootdir`` as a JSON model definition.

     Each file is parsed as JSON and stored in ``ExtractModel`` under the
     file name minus its last five characters (presumably the ``.json``
     suffix). Does nothing when ``rootdir`` is not a directory. Errors are
     reported through ``printInfo``.
     """
     try:
         if not os.path.isdir(rootdir):
             return
         for name in os.listdir(rootdir):
             path = os.path.join(rootdir, name)
             if not os.path.isfile(path):
                 continue
             # The context manager closes the file even if reading fails.
             with open(path, 'r') as f:
                 content = f.read()
             if sys.version_info.major == 2:
                 # Python 2 reads bytes; decode them before parsing.
                 content = content.decode('utf-8')
             ExtractModel[name[:-5]] = json.loads(content)
     except Exception as err:
         printInfo(err)
예제 #13
0
 def resetUrls(self, urls):
   """Append every entry of ``urls`` to the store, holding ``self._lock``.

   Plain values are wrapped as ``{'url': value}``. Each entry is appended
   to ``self._urls``, its ``md5`` hash is added to ``self._dic`` and the
   pair is written through ``self._writeFile``. No check is made for
   entries that are already known. The file is flushed once when at least
   one entry was processed.
   """
   self._lock.acquire()
   try:
     written=0
     for item in urls:
       entry=item if isinstance(item,dict) else {'url':item}
       digest=md5(entry['url'])
       self._urls.append(entry)
       self._dic.add(digest)
       self._writeFile(entry,digest)
       written+=1
     if written:
       self._flushFile()
   except Exception as err:
     printInfo(err)
   finally:
     self._lock.release()
예제 #14
0
 def __init__(self):
     """Load persisted cookies from ``self._cookieFilename``.

     Each line has the form ``key,value``; the text before the first comma
     becomes the key in ``self._cookies`` and the remainder the value.
     Lines with no comma, or with an empty key, are skipped. Afterwards
     ``self._refreshCookieFile()`` is called. Errors are reported through
     ``printInfo``.
     """
     try:
         if (not os.path.exists('db/')):
             os.mkdir('db/')
         self._cookiefile = io.open(self._cookieFilename,
                                    "a+",
                                    encoding="utf-8")
         # "a+" may position at the end of the file; rewind before reading.
         self._cookiefile.seek(0)
         for line in iter(self._cookiefile.readline, ''):
             # Drop only the line terminator, so a final line without a
             # newline keeps its last character.
             line = line.rstrip('\n')
             # find() returns -1 for a missing comma; index() would raise
             # and abort loading of all remaining cookies.
             sep = line.find(',')
             if sep > 0:
                 self._cookies[line[:sep]] = line[sep + 1:]
         self._refreshCookieFile()
     except Exception as err:
         printInfo(err)
예제 #15
0
 def saveUrl(self, urls,i=None):
   """Store the entries of ``urls`` that have not been seen before.

   Plain values are wrapped as ``{'url': value}``. An entry is new when
   the ``md5`` hash of its url is not yet in ``self._dic``; new entries
   are appended to ``self._urls``, recorded in ``self._dic`` and written
   through ``self._writeFile``. The file is flushed once when anything
   was added. ``i`` is accepted but unused.
   """
   self._lock.acquire()
   try:
     added=False
     for url in urls:
       if(not isinstance(url,dict)):
         url={'url':url}
       # Compute the hash once and use that value below. The previous code
       # passed the md5 function object itself to _dic.add and _writeFile,
       # so the duplicate check never matched.
       key=md5(url['url'])
       if(key not in self._dic):
         self._urls.append(url)
         self._dic.add(key)
         self._writeFile(url,key)
         added=True
     if(added):
       self._flushFile()
   except Exception as err:
     printInfo(err)
   finally:
     self._lock.release()
예제 #16
0
    def popUrl(self):
        """Take one pending url (``state == 0``) from a randomly chosen table.

        Table names are ``self._tbName`` followed by an index 1-9, or no
        suffix for index 0. Tables flagged in ``self._tbCache`` are skipped.
        The first table that is not flagged is queried: a hit is marked
        ``state = 1`` and returned, a miss flags the table and ``None`` is
        returned.

        Returns:
            The url document, or ``None`` when the queried table has no
            pending url or all ten tables are flagged.
        """
        db = self._connect()
        tried = set()
        while True:
            # len() is required here: the old ``lst.count == 10`` compared a
            # bound method with an int, was never true, and looped forever
            # once every index had been tried.
            if len(tried) == 10:
                return None
            tbName = self._tbName
            i = random.randint(0, 9)
            printInfo('popUrl', i)
            if i in tried:
                continue
            tried.add(i)
            if i:
                tbName = tbName + str(i)
            if self._tbCache.get(tbName):
                continue

            url = db[tbName].find_one({"state": 0})
            if url:
                # NOTE(review): ``update`` looks like the legacy pymongo
                # collection API; confirm against the installed driver.
                db[tbName].update({"_id": url["_id"]}, {"$set": {"state": 1}})
                if i in self._totalCount:
                    self._totalCount[i] -= 1
            else:
                self._tbCache[tbName] = True
            return url
예제 #17
0
 def __init__(self, name):
     """Create the storage directories and the ``htmls`` sqlite table.

     ``self._htmlPath`` and ``self._dbPath`` are templates formatted with
     ``name``. The directories ``db/``, ``htmls/`` and the formatted html
     path are created when missing. Errors are reported through
     ``printInfo``.
     """
     conn = None
     try:
         self._htmlPath = self._htmlPath.format(name)
         for folder in ('db/', 'htmls/', self._htmlPath):
             if not os.path.exists(folder):
                 os.mkdir(folder)
         self._dbPath = self._dbPath.format(name)
         conn = sqlite3.connect(self._dbPath)
         c = conn.cursor()
         c.execute('''CREATE TABLE IF NOT EXISTS htmls
         (id INTEGER PRIMARY KEY autoincrement,
         json  TEXT  NOT NULL,
         state INT NOT NULL DEFAULT 0,
         tm  TEXT);''')
         conn.commit()
     except Exception as err:
         printInfo(err)
     finally:
         # Close on failure as well; previously an error in execute/commit
         # skipped the close and leaked the connection.
         if conn:
             conn.close()
예제 #18
0
 def __init__(self):
     """Load the model definitions from the ``models/`` directory."""
     models_dir = 'models/'
     try:
         self._iniModels(models_dir)
     except Exception as exc:
         printInfo(exc)
예제 #19
0
def convert2Dic(html):
    """Parse the first tag in ``html`` into a Dict of its attributes.

    The tag name is stored under the ``'tag'`` key and each attribute
    under its own name. If the hand-written parser raises, the function
    falls back to parsing with ``ET.XML`` and returns an ``XmlDictConfig``.

    Returns:
        A Dict, an ``XmlDictConfig`` (fallback path), or ``None`` when
        both attempts fail.
    """
    try:
        # Keep only the text between the first '<' and the first '>',
        # without surrounding '/' and whitespace.
        start = html.find('<')
        end = html.find('>')
        html = html[start + 1:end].strip('/').strip()
        # Normalise: collapse whitespace/&nbsp; runs to one space, turn
        # single quotes into double quotes, and drop spaces between '='
        # and an opening quote.
        html = re.sub('(\\s|&nbsp;)+', ' ', html, 0)
        html = re.sub('(\')+', '"', html, 0)
        html = re.sub('(=\s*")+', '="', html, 0)
        # Rebuild the text character by character so that unquoted
        # attribute values end up wrapped in double quotes.
        lstC = []  # output characters
        N = len(html)
        i = 0
        first = False  # True while inside a double-quoted value
        flag = False  # True while inside an unquoted value that got an opening quote
        while i < N:
            if html[i] == '"':
                lstC.append(html[i])
                first = not first
            # html[i + 1] raises IndexError when '=' is the last character;
            # the outer except then takes the XML fallback.
            elif not first and html[i] == '=' and html[i + 1] != '"':
                lstC.append(html[i])
                lstC.append('"')
                flag = True
            elif not first and flag and html[i] == ' ':
                # The space ends the unquoted value: emit the closing quote.
                flag = False
                lstC.append('"')
                lstC.append(html[i])
            else:
                lstC.append(html[i])
            i += 1
        html = ''.join(lstC)
        # Splitting on '"' yields fragments that alternate between
        # ' name=' pieces and attribute values.
        paras = html.split('"')
        dic = Dict()
        lastP = None  # attribute name waiting for its value
        first = True
        for para in paras:
            if (first):
                # The first fragment holds the tag name and, optionally,
                # the first attribute name.
                first = False
                tmp = para.split()
                dic['tag'] = tmp[0]
                if (len(tmp) > 1):
                    lastP = tmp[1].strip().strip('=').strip()
                continue
            if (lastP):
                # This fragment is the value of the pending attribute;
                # a repeated attribute is joined with a space.
                # NOTE(review): assumes Dict yields a falsy value for a
                # missing key - confirm against the Dict implementation.
                if (not dic[lastP]):
                    dic[lastP] = para
                else:
                    dic[lastP] += ' '
                    dic[lastP] += para
                lastP = None
            elif para:
                if (para.find('=') > 0):
                    lastP = para.strip().strip('=').strip()
                else:
                    # No '=': the fragment itself becomes a key with an
                    # empty value.
                    dic[para] = ''
        return dic
    except Exception as err:
        printInfo(err)
        try:
            # Fallback: parse as XML. ``html`` here is whatever value the
            # try block last assigned, not necessarily the original
            # argument. A closing tag is appended when the text has
            # neither '</' nor '/>'.
            tag = ''
            if (html.find('</') < 0 and html.find('/>') < 0):
                start = html.find('<')
                end = html.find(' ', start + 1)
                tag = '</' + html[start + 1:end] + '>'
            tree = ET.XML(html + tag)
            return XmlDictConfig(tree)
        except Exception as err:
            printInfo(err)
    return None
예제 #20
0
 def __del__(self):
   """Close the url, dictionary and index files if they were opened.

   The handles may be missing or unset when construction failed before
   the files were opened, so each one is looked up defensively. A failure
   to close one file is printed and does not stop the others from being
   closed.
   """
   for attr in ('_urlfile', '_dicfile', '_indexfile'):
     handle = getattr(self, attr, None)
     if handle is None:
       continue
     try:
       handle.close()
     except Exception as err:
       printInfo(err)
   printInfo('__del__')
예제 #21
0
def log(err, data):
    """Print ``err`` and ``data``, then log each at ERROR level on the root logger."""
    printInfo(err, data)
    adapter = logging.LoggerAdapter(logging.getLogger(), None)
    for item in (err, data):
        adapter.log(logging.ERROR, item)
예제 #22
0
 def downloadError(self, url, err=None):
     """Report a failed download and set the url's state to 2 in ``self.url_store``."""
     printInfo('error url:', url, err)
     store = self.url_store
     store.updateState(url, 2)
예제 #23
0
 def renderUrl(self, url, callback):
     """Placeholder: only prints a reminder that the method must be implemented."""
     notice = 'Need to implement method "renderUrl"'
     printInfo(notice)
예제 #24
0
 def customDown(self, url):
     """Placeholder: only prints a reminder that the method must be implemented."""
     notice = 'Need to implement method "customDown"'
     printInfo(notice)
예제 #25
0
 def log(self, msg, level=logging.ERROR):
     """Print ``msg`` and log it on the root logger at ``level``."""
     printInfo(msg)
     adapter = logging.LoggerAdapter(logging.getLogger(), None)
     adapter.log(level, msg)
예제 #26
0
 def downloadRender(self, url, ssp):
     """Count the page and ask ``ssp`` to render ``url``.

     ``self.down_callback`` is passed to ``ssp.renderUrl`` as the callback.
     """
     printInfo(url['url'])
     self._downloadPageNum += 1
     on_done = self.down_callback
     ssp.renderUrl(url, on_done)