Example #1
 def setHeaders(self, headers):
     if not isinstance(headers, dict):
         Logger.error(
             'PhantomJsResponse setHeaders() error: headers is not a dict...'
         )
         return
     self.headers = headers
Example #2
 def saveData(self):
     if configs.parse.file:
         try:
             try:
                 # merge in items previously saved to the content file
                 if os.path.exists(configs.parse.contentpath + configs.parse.contentfile):
                     for line in readLinesFile(configs.parse.contentpath + configs.parse.contentfile):
                         try:
                             self.addDataItem(json.loads(line), self.primaryKey)
                         except Exception:
                             pass
             except Exception:
                 pass
             dataList = [json.dumps(item, ensure_ascii=False) for item in self.dataList]
             writeLinesFile(configs.parse.contentpath + configs.parse.contentfile, dataList, method=configs.parse.savemethod)
         except Exception as e:
             Logger.error('an error occurred while saving data to file...', e)
     if configs.parse.mysql and len(self.dataList) > 0:
         from espider.mysql import Mysql
         keyList = list(self.dataList[0])
         mySql = Mysql(configs.mysql.table, keyList, primaryKey=self.primaryKey)
         mySql.insertWithUpdate(self.dataList)
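
For reference, saveData() persists each item as one JSON object per line and re-reads the file with json.loads() on the next run. A minimal standalone sketch of that JSON-lines round trip, with hypothetical items:

import json

# each item becomes one JSON line; json.loads() restores it on re-read
items = [{'id': '1', 'title': 'foo'}, {'id': '2', 'title': 'bar'}]
lines = [json.dumps(item, ensure_ascii=False) for item in items]
restored = [json.loads(line) for line in lines]
assert restored == items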
Example #3
 def setParser(self, parser):
     if configs.spider.mode == 'override':
         Logger.warning('Spider mode is override in configs. setParser() will be ignored. If you want to use update mode, change it in config_override')
         return
     if not isinstance(parser, BaseParser):
         Logger.error('setParser() should receive a BaseParser-like object. Spider will run in override mode instead of update mode')
         return
     self.parser = parser
Example #4
 def getReqWithSel(self, request):
     if not isinstance(request, urllib.request.Request):
         Logger.error('SelePhan request error: please make sure request is a urllib.request.Request object...')
         return None
     url = request.full_url
     self.driver.get(url)
     response = PhantomJsResponse(self.driver.page_source, {'Content-Type': 'text/html'})
     return response
Example #5
 def loadProxy(self):
     data = readLinesFile(configs.proxy.srcname)
     if data is None:
         Logger.critical('cannot load proxy list, espider is shutting down...')
         exit(1)
     proxyList = []
     for line in data:
         proxyList.append(dict(zip(('type', 'ip', 'port', 'available', 'ping'), line.split('\t'))))
     return proxyList
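
loadProxy() expects one tab-separated record per line. A minimal sketch of the parsing step, using a hypothetical record:

# fields: type, ip, port, available flag, ping time
line = 'http\t127.0.0.1\t8080\t1\t0.42'
proxy = dict(zip(('type', 'ip', 'port', 'available', 'ping'), line.split('\t')))
print(proxy['ip'], proxy['port'])  # 127.0.0.1 8080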
Example #6
 def catalogueUrlRecursion(self, url):
     if configs.spider.catalogueLimit != 'inf':
         if self.catalogueCount >= configs.spider.catalogueLimit:
             return
     url = urljoin(self.host, url)
     urllistContent = []
     urllistCatalogue = []
     for i in range(configs.spider.retry):
         response = self.httpHandler.getResponseByUrl(url)
         if response is None:
             Logger.warning('cannot get url %s. please check httphandler...' % url)
             return
         response = EsResponse(response)
         try:
             urllistCatalogue, urllistContent = self.getUrlList(response)
             break
         except ValueError:
             Logger.critical('please verify that your getUrlList() returns 2 lists. espider is shutting down...')
             exit(1)
         except Exception:
             Logger.error('an error occurred in getUrlList(). If this happens often, please check your code')
             self.httpHandler.nextHandler()
             if i == configs.spider.retry - 1:
                 self.uncatchableUrlList.append(url)
                 self.saveUncatchableUrl()
     if len(urllistContent) != 0:
         for item in urllistContent:
             self.contentCount += 1
             if configs.spider.contentLimit != 'inf':
                 if self.contentCount > configs.spider.contentLimit:
                     break
             if not keyValueInDictList('contentUrl', item, self.contentDictList):
                 Logger.debug('discover content url %s' % item)
                 self.contentDictList.append({'contentUrl': item})
     if len(urllistCatalogue) == 0:
         return
     for item in urllistCatalogue:
         if item not in self.catalogueUrl:
             if configs.spider.catalogueLimit != 'inf':
                 if self.catalogueCount >= configs.spider.catalogueLimit:
                     return
             Logger.info('get catalogue url %s' % item)
             self.catalogueUrl.add(item)
             self.catalogueCount += 1
             time.sleep(random.random() * configs.http.sleeptime)
             self.catalogueUrlRecursion(item)
Example #7
 def backupUpdate(self):
     if not os.path.exists(configs.spider.contentupdatefilename):
         return
     if not os.path.exists(configs.spider.contentbackuppath):
         os.makedirs(configs.spider.contentbackuppath)
     now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')
     dstfilename = os.path.join(configs.spider.contentbackuppath, now + os.path.split(configs.spider.contentupdatefilename)[1])
     try:
         shutil.copy2(configs.spider.contentupdatefilename, dstfilename)
     except IOError:
         Logger.error('Cannot copy file to backup path...')
Example #8
 def checkKeyList(self, keyList):
     if len(self.keyList) != len(keyList):
         Logger.error('keyList lengths do not match...')
         return False
     for item in keyList:
         if item not in self.keyList:
             Logger.error('keyList elements do not match')
             return False
     return True
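
The check is order-insensitive: the lengths must match and every incoming key must appear in self.keyList. A standalone restatement of the same rule:

def check_key_list(expected, candidate):
    # same logic as checkKeyList(): equal length, every key known
    if len(expected) != len(candidate):
        return False
    return all(k in expected for k in candidate)

assert check_key_list(['id', 'title', 'url'], ['url', 'id', 'title'])
assert not check_key_list(['id', 'title', 'url'], ['id', 'title'])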
Example #9
 def __init__(self, contentType, primaryKey=None, contentPath=configs.spider.contentdatapath, openMethod='rb', openEncoding=None):
     if self.parserName == '':
         Logger.critical('You should define parserName for your parser! Espider is shutting down...')
         exit(1)
     self.contentType = contentType
     self.contentPath = contentPath
     self.openMethod = openMethod
     self.openEncoding = openEncoding
     self.dataList = []
     self.primaryValue = []
     self.primaryKey = primaryKey
Example #10
 def __init__(self, response):
     # defaults so the attributes exist even on an early return
     self.data = b''
     self.headers = []
     self.code = ''
     self.url = ''
     if response is None:
         return
     if not isinstance(response, HTTPResponse):
         Logger.error('EsResponse error: wrong type of response')
         return
     self.data = response.read()
     self.headers = response.getheaders()
     self.code = response.getcode()
     self.url = response.geturl()
Example #11
 def loadContentUpdateFileList(self):
     if not os.path.exists(configs.spider.contentupdatefilename):
         return []
     dataList = readLinesFile(configs.spider.contentupdatefilename)
     if dataList is None:
         return []
     fileList = []
     try:
         for item in dataList:
             if item.startswith('#'):
                 continue
             data = item.split('\t')
             fileList.append(data[3])
     except IndexError:
         Logger.error('Loading contentupdatefile error!')
     return fileList
Example #12
    def startEspider(self):
        if configs.spider.mode != 'override' and configs.spider.mode != 'update':
            Logger.error('Please verify spider.mode is override or update in configs. Spider will run in default mode (override)')
        if configs.spider.mode == 'update' and self.parser is None:
            Logger.error('Spider cannot run in update mode without a correct setParser() call.')
        Logger.info('Espider running in %s mode' % ('override' if self.parser is None else 'update'))
        if self.parser is not None:
            # update mode
            self.backupUpdate()
            self.contentDictList = self.loadContentDictList()

        Logger.info('start to get catalogue urls...')

        if configs.spider.loadurllistfromfile:
            self.loadCatalogueList()
            self.contentDictList = self.loadContentDictList()
        else:
            self.catalogueUrlRecursion(self.startUrl)
            writeLinesFile(configs.spider.cataloguefilename, self.catalogueUrl, method='w+')
        count = 0

        for item in self.contentDictList:
            count += 1
            MD5, filepath = self.contentHandler(item['contentUrl'], count)
            item['filepath'] = filepath
            if 'MD5' in item:
                if self.parser is None:
                    item['update'] = 'disabled'
                elif item['MD5'] == MD5:
                    item['update'] = 'false'
                else:
                    item['update'] = 'true'
            else:
                if self.parser is None:
                    item['update'] = 'disabled'
                else:
                    item['update'] = 'true'
            item['MD5'] = MD5
        self.saveContentUrlDictList()
        self.saveContentUrlUpdate()
        Logger.info('espider completed the task!')
Example #13
def readLinesFile(filename, method='r'):
    """
        Read a file and return a list with one stripped line per element.
    """
    dataInLine = []
    try:
        with open(filename, method, encoding='utf8') as f:
            dataInLine = [line.strip() for line in f.readlines()]
    except OSError:
        Logger.error('an error occurred when opening %s' % filename)
        return None
    return dataInLine
Example #14
 def __sqlExecute(self, sql):
     data = None
     try:
         connection = pymysql.connect(host=configs.mysql.host, port=configs.mysql.port, user=configs.mysql.user, password=configs.mysql.password, db=configs.mysql.db, charset='utf8')
         try:
             with connection.cursor() as cur:
                 cur.execute(sql)
                 data = cur.fetchall()
                 connection.commit()
         except Exception:
             Logger.error('sql statement execution error: %s' % sql)
         finally:
             connection.close()
     except Exception:
         Logger.error('error opening mysql database...')
     return data
Example #15
def writeLinesFile(filename, dataInLine, method='w'):
    """
        Write a list to a file, one element per line.
    """
    if not isinstance(dataInLine, Iterable):
        Logger.error('writeLinesFile() input is not iterable')
        return
    dirname = os.path.split(filename)[0]
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    dataInLine = [str(line) + '\n' for line in dataInLine]
    try:
        with open(filename, method, encoding='utf8') as f:
            f.writelines(dataInLine)
    except OSError:
        Logger.error('an error occurred when opening %s' % filename)
    return
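
Together with readLinesFile() above, this gives a simple line-oriented round trip. A usage sketch, assuming the two helpers and their Logger dependency are importable (the file path is hypothetical):

lines = ['first', 'second', 'third']
writeLinesFile('data/demo.txt', lines)          # one element per line
assert readLinesFile('data/demo.txt') == lines  # read back, stripped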
Example #16
 def createTable(self):
     if not isinstance(self.keyList, list):
         Logger.error('key list error when creating table %s' % self.table)
         return
     if len(self.keyList) == 0:
         Logger.error('key list is empty, cannot create table %s' % self.table)
         return
     keyList = [key + ' VARCHAR(255)' for key in self.keyList]
     if self.primaryKey is not None and self.primaryKey == self.keyList[0]:
         keyList[0] = keyList[0] + ' PRIMARY KEY'
     sql = "CREATE TABLE IF NOT EXISTS %s(%s)" % (self.table, ','.join(keyList))
     self.__sqlExecute(sql)
     return
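
For illustration, the statement createTable() builds for a hypothetical table 'article' with keyList ['id', 'title'] and primaryKey 'id' looks like this:

keyList = ['id VARCHAR(255) PRIMARY KEY', 'title VARCHAR(255)']
sql = "CREATE TABLE IF NOT EXISTS %s(%s)" % ('article', ','.join(keyList))
print(sql)
# CREATE TABLE IF NOT EXISTS article(id VARCHAR(255) PRIMARY KEY,title VARCHAR(255))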
Example #17
 def loadContentDictList(self):
     if not os.path.exists(configs.spider.contentfilename):
         return []
     dataList = readLinesFile(configs.spider.contentfilename)
     if dataList is None:
         return []
     dataDictList = []
     try:
         for item in dataList:
             if item.startswith('#'):
                 continue
             data = item.split('\t')
             dataDictList.append({
                 'contentUrl': data[0],
                 'MD5': data[1],
                 'update': data[2],
                 'filepath': data[3],
             })
     except IndexError:
         Logger.error('Loading contentfile error!')
     return dataDictList
Example #18
 def select(self, selectKeyList=None):
     if selectKeyList is None:
         temp = '*'
     else:
         if not isinstance(selectKeyList, list):
             Logger.error('selectKeyList error: it is not a list...')
             return None
         temp = ','.join(selectKeyList)
     sql = 'SELECT %s FROM %s' % (temp, self.table)
     result = list(self.__sqlExecute(sql))
     if selectKeyList is None:
         sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = '%s' and table_schema = '%s'" % (self.table, configs.mysql.db)
         selectKeyList = [row[0] for row in self.__sqlExecute(sql)]
     ret = []
     for item in result:
         ret.append(dict(zip(selectKeyList, item)))
     return ret
Example #19
 def formProxy(self, count):
     if len(self.proxyList) == 0:
         self.proxy = None
         return
     if count >= len(self.proxyList):
         Logger.error('SelePhan proxy form error: out of range in proxyList...')
         self.proxy = None
         return
     proxy = self.proxyList[count]
     ipport = proxy['ip'] + ':' + proxy['port']
     proxyDict = {'proxyType': ProxyType.MANUAL}
     if proxy['type'] == 'http':
         proxyDict['httpProxy'] = ipport
     elif proxy['type'] == 'socks':
         proxyDict['socksProxy'] = ipport
     else:
         self.proxy = None
         return
     self.proxy = seleProxy(proxyDict)
     return
Example #20
 def addDataItem(self, item, primaryKey):
     itemtemp = OrderedDict()
     for k, v in item.items():
         if isinstance(v, list):
             # collapse list values to their first element, '' when empty
             itemtemp[k] = v[0] if len(v) > 0 else ''
         else:
             itemtemp[k] = v
     if primaryKey is not None and primaryKey in itemtemp:
         if itemtemp[primaryKey] not in self.primaryValue:
             if self.primaryKey is None:
                 self.primaryKey = primaryKey
             elif self.primaryKey != primaryKey:
                 Logger.critical('different primary key found in returned data. espider is shutting down...')
                 exit(1)
             self.primaryValue.append(itemtemp[primaryKey])
             self.dataList.append(itemtemp)
         return
     self.dataList.append(itemtemp)
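
The normalization step collapses list values to their first element before deduplication by primary key. A standalone sketch with a hypothetical item:

from collections import OrderedDict

item = {'title': ['first match', 'second match'], 'tags': [], 'views': 10}
normalized = OrderedDict(
    (k, (v[0] if len(v) > 0 else '') if isinstance(v, list) else v)
    for k, v in item.items()
)
print(normalized)  # title -> 'first match', tags -> '', views -> 10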
Example #21
 def __init__(self, startUrl=None):
     self.startUrl = startUrl
     path = os.path.split(configs.proxy.srcname)[0]
     if not os.path.exists(path):
         os.makedirs(path)
     if startUrl is None:
         self.startUrl = 'http://www.baidu.com'
     if configs.proxy.rescrab:
         Logger.info('rescraping proxy list...')
         self.getFreeProxy()
     if configs.proxy.retest:
         Logger.info('retesting proxy list...')
         self.testProxy()
     if not os.path.exists(configs.proxy.srcname):
         self.loadDefaultProxy()
     else:
         self.proxyList = self.loadProxy()
     self.proxyList = list(filter(lambda x: abs(int(x['available'])) == 1, self.proxyList))
     self.proxyList = list(filter(lambda x: float(x['ping']) < 2, self.proxyList))
     if len(self.proxyList) == 0:
         Logger.critical('There is no available proxy! espider is shutting down...')
         exit(1)
     self.proxyList.sort(key=lambda x: 1000 if float(x['ping']) == -1 else float(x['ping']))
     self.proxyCount = 0
Example #22
 def testProxy(self):
     """
         Test the proxy connection performance with self.startUrl.
     """
     req = urllib.request.Request(self.startUrl)
     for k, v in configs.urlrequest.items():
         if isinstance(v, list):
             v = v[randint(0, len(v) - 1)]
         req.add_header(k, v)
     Logger.info('test proxy list in %s' % configs.proxy.srcname)
     data = readLinesFile(configs.proxy.srcname)
     if data is None:
         return
     proxyList = []
     for i in range(len(data)):
         proxyList.append(dict(zip(('type', 'ip', 'port'), data[i].split('\t'))))
         opener = urllib.request.build_opener(urllib.request.ProxyHandler({proxyList[i]['type']: proxyList[i]['ip'] + ':' + proxyList[i]['port']}), urllib.request.ProxyBasicAuthHandler())
         try:
             begin = time.perf_counter()
             opener.open(req, timeout=configs.proxy.timeout)
             ping = time.perf_counter() - begin
             available = 1
             Logger.info('proxy %s is good...' % proxyList[i]['ip'])
         except Exception:
             Logger.info('proxy %s is not available...' % proxyList[i]['ip'])
             ping = -1
             available = 0
         proxyList[i]['available'] = available
         proxyList[i]['ping'] = ping
     dataset = []
     for item in proxyList:
         dataset.append('%s\t%s\t%s\t%s\t%s' % (item['type'], item['ip'], item['port'], item['available'], item['ping']))
     writeLinesFile(configs.proxy.srcname, dataset)
     return
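
testProxy() measures latency with time.perf_counter() (time.clock() was removed in Python 3.8). The timing pattern in isolation:

import time

begin = time.perf_counter()
sum(range(100000))                 # stand-in for opener.open(req, ...)
ping = time.perf_counter() - begin
print('%.6fs' % ping)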
Example #23
 def getFreeProxy(self):
     """
         Two different ways of getting free proxies, configurable in configs.
         You can also define your own way of getting them.
     """
     Logger.info('get free proxy from the Internet...')
     proxyList = []
     if configs.proxy.proxysrc == 1:
         for i in range(configs.proxy.srcpage):
             Logger.info('get page %s...' % (i + 1))
             req = urllib.request.Request('http://www.kuaidaili.com/free/inha/%s/' % (i + 1))
             req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0')
             data = urllib.request.urlopen(req).read().decode('utf-8')
             proxy = re.findall(r'<tr>\s*?<td data-title="IP">(.*?)</td>\s*?<td data-title="PORT">(.*?)</td>\s*?<td data-title="匿名度">.*?</td>\s*?<td data-title="类型">(.*?)</td>', data)
             for item in proxy:
                 if [item[2].lower(), item[0], item[1]] not in proxyList:
                     proxyList.append([item[2].lower(), item[0], item[1]])
     if configs.proxy.proxysrc == 2:
         for i in range(configs.proxy.srcpage):
             Logger.info('get page %s...' % (i + 1))
             req = urllib.request.Request('http://www.xicidaili.com/nn/%s' % (i + 1))
             req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0')
             data = urllib.request.urlopen(req).read().decode('utf-8')
             proxy = re.findall(r'<td class="country"><img [\s\S]*?<td>(.*?)</td>\s*?<td>(.*?)</td>\s*?<td>[\s\S]*?</td>\s*?<td class="country">.*?</td>\s*?<td>(.*)</td>', data)
             for item in proxy:
                 if [item[2].lower(), item[0], item[1]] not in proxyList:
                     proxyList.append([item[2].lower(), item[0], item[1]])
     dataset = []
     for item in proxyList:
         dataset.append('%s\t%s\t%s\t-1\t-1' % (item[0], item[1], item[2]))
     writeLinesFile(configs.proxy.srcname, dataset)
Example #24
 def checkUrlQuery(self):
     if not isinstance(self.queryList, list) or len(self.queryList) == 0:
         Logger.critical('Please define queryList as a non-empty list! Espider is shutting down...')
         exit(1)
     if not isinstance(self.parameterList, list) or len(self.parameterList) == 0:
         Logger.critical('Please define parameterList as a non-empty list! Espider is shutting down...')
         exit(1)
     if not isinstance(self.extraParameter, OrderedDict):
         Logger.critical('extraParameter should be an OrderedDict! Espider is shutting down...')
         exit(1)
     if len(self.queryList) != len(self.parameterList):
         Logger.critical('queryList and parameterList have different lengths, please make sure they match each other. Espider is shutting down...')
         exit(1)
     self.level = len(self.queryList)
Example #25
 def catalogueUrlRecursion(self, param, path, level):
     if not os.path.exists(path):
         os.makedirs(path)
     Logger.info('(level %s)start to scrape param:%s' % (level, param))
     if not isinstance(self.queryList[level - 1], list):
         self.queryList[level - 1] = [self.queryList[level - 1]]
     for query in self.queryList[level - 1]:
         url = self.buildUrl(query, param)
         url, headers = self.buildExtraHeaders(url)
         response = self.httpHandler.getResponseByUrl(url, headers=headers)
         data, filetype = self.contentResponseHandle(response)
         with open(path + 'data_query=' + query + '.' + filetype, 'w+', encoding='utf8') as f:
             f.write(data)
         if level == self.level:
             return
         try:
             nextParamList = self.contentHandler(data)
         except Exception:
             Logger.error('an error occurred in contentHandler(). If this happens often, please shut espider down...')
             nextParamList = None
         if nextParamList is None or nextParamList == []:
             return
         if not isinstance(nextParamList, list):
             Logger.critical('contentHandler() should return a list. Espider is shutting down...')
             exit(1)
         if not isinstance(nextParamList[0], dict):
             Logger.critical('contentHandler() should return a list of dicts. Espider is shutting down...')
             exit(1)
         writeLinesFile(path + 'param_query=' + query + '.txt', nextParamList)
         for nextParam in nextParamList:
             for k, v in nextParam.items():
                 if k in self.parameterList[level]:
                     nextParamDict = dict(param)
                     nextParamDict[k] = v
                     nextPath = path + k + '=' + v + '/'
                     time.sleep(random.random() * configs.http.sleeptime)
                     self.catalogueUrlRecursion(nextParamDict, nextPath, level + 1)
Example #26
 def insertWithUpdate(self, insertList):
     if not isinstance(insertList, list):
         Logger.error('insert list error because it is not a list...')
         return
     if len(insertList) == 0:
         return
     keyList = list(insertList[0])
     if not self.checkKeyList(keyList):
         return
     temp1 = ','.join(self.keyList)
     # the update clause is the same for every row, so build it once
     temp3 = ','.join('%s=VALUES(%s)' % (k, k) for k in self.keyList[1:])
     for item in insertList:
         temp2 = ','.join("'" + item[k] + "'" for k in self.keyList)
         sql = "INSERT INTO %s(%s) VALUES(%s) ON DUPLICATE KEY UPDATE %s" % (self.table, temp1, temp2, temp3)
         self.__sqlExecute(sql)
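
The generated upsert keeps the first key (the primary key) fixed and refreshes every other column from the incoming row. Shape of the statement for a hypothetical table 'article' with keys ['id', 'title']:

table, keys, values = 'article', ['id', 'title'], ["'1'", "'hello'"]
updates = ','.join('%s=VALUES(%s)' % (k, k) for k in keys[1:])
sql = "INSERT INTO %s(%s) VALUES(%s) ON DUPLICATE KEY UPDATE %s" % (
    table, ','.join(keys), ','.join(values), updates)
print(sql)
# INSERT INTO article(id,title) VALUES('1','hello') ON DUPLICATE KEY UPDATE title=VALUES(title)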
Example #27
 def __init__(self):
     Logger.info('Espider %s initiating...' % self.espiderName)
     if self.startUrl == '':
         Logger.critical('Your espider should have a startUrl! Espider is shutting down...')
         exit(1)
     self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
     if urlparse(self.startUrl).hostname is None:
         Logger.critical('Illegal url! Please make sure the url looks like "http://www.baidu.com". Espider will be closed...')
         exit(1)
     self.host = urlparse(self.startUrl).scheme + '://' + urlparse(self.startUrl).hostname
     self.checkUrlQuery()
     self.httpHandler = HttpHandler(self.host)
Example #28
 def getResponseByUrl(self, url, headers=None):
     """
         url is the website you want.
         headers is a dict of extra headers added dynamically on top of those in configs
     """
     if headers is None:
         headers = {}
     begin = time.perf_counter()
     if urlparse(url).hostname is None:
         Logger.error('url of request illegal! which is %s' % url)
         return None
     req = urllib.request.Request(url)
     for k, v in configs.urlrequest.items():
         if isinstance(v, list):
             v = v[randint(0, len(v) - 1)]
         req.add_header(k, v)
     for k, v in headers.items():
         req.add_header(k, v)
     flag = False
     for i in range(configs.http.retry):
         Logger.debug('%s attempt' % (i + 1))
         try:
             if self.selephan is not None:
                 response = self.selephan.getReqWithSel(req)
                 if response is None:
                     continue
                 flag = True
                 break
             if self.proxy is not None:
                 response = self.proxy.getReqWithProxy(req, timeout=configs.proxy.timeout)
                 if response is None:
                     continue
                 flag = True
                 break
             response = urllib.request.urlopen(req, timeout=configs.http.timeout)
             flag = True
             break
         except Exception:
             continue
     end = time.perf_counter()
     Logger.debug('HTTP request time: %ss' % (end - begin))
     if flag:
         return response
     return None
Example #29
 def startEspider(self):
     Logger.info('starting espider...')
     paramList = readLinesFile(configs.spider.contentdatapath + 'param.txt')
     if paramList is None:
         Logger.critical('You should create starting parameters in %s' % (configs.spider.contentdatapath + 'param.txt'))
         exit(1)
     for i in range(len(paramList)):
         paramList[i] = json.loads(paramList[i])
         for k, v in paramList[i].items():
             if k in self.parameterList[0]:
                 param = {k: v}
                 path = configs.spider.contentdatapath + k + '=' + v + '/'
                 self.catalogueUrlRecursion(param, path, 1)
             else:
                 Logger.error('param.txt gives an incorrect key compared to self.parameterList...')
Example #30
 def __init__(self):
     Logger.info('espider %s initiating...' % self.espiderName)
     if self.startUrl == '' or self.espiderName == '':
         Logger.critical('Your espider should have an espiderName and a startUrl! Espider is shutting down...')
         exit(1)
     self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
     if urlparse(self.startUrl).hostname is None:
         Logger.critical('Illegal url! Please make sure the url looks like "http://www.baidu.com". Espider will be closed...')
         exit(1)
     self.host = urlparse(self.startUrl).scheme + '://' + urlparse(self.startUrl).hostname
     self.httpHandler = HttpHandler(self.host)
     if not os.path.exists(configs.spider.pipelinepath):
         os.makedirs(configs.spider.pipelinepath)
     self.catalogueUrl = set()
     self.catalogueCount = 0
     self.contentCount = 0
     self.contentDictList = []
     self.uncatchableUrlList = []
Example #31
 def getUrlList(self, response):
     Logger.critical('getUrlList() without override! espider is shutting down...')
     exit(1)
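
getUrlList() is an abstract hook: subclasses must return two lists, catalogue urls first and content urls second (see catalogueUrlRecursion() above). A minimal sketch of an override; BaseSpider is a hypothetical stand-in for the espider base class:

class BaseSpider:  # stand-in so the sketch runs on its own
    pass

class MySpider(BaseSpider):
    def getUrlList(self, response):
        # parse response.data in a real spider; static lists for illustration
        urllistCatalogue = ['/list?page=2']
        urllistContent = ['/article/1.html']
        return urllistCatalogue, urllistContent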
Example #32
 def setData(self, data):
     if not isinstance(data, str):
         Logger.error('PhantomJsResponse setData() error: data is not a str...')
         return
     self.data = bytes(data, encoding='utf8')
Example #33
 def contentHandler(self, url, count):
     url = urljoin(self.host, url)
     Logger.info('(%s%%)get content data from %s' % (round(100 * count / len(self.contentDictList), 2), url))
     data = None
     filetype = ''
     name = None
     for i in range(configs.spider.retry):
         response = self.httpHandler.getResponseByUrl(url)
         if response is None:
             Logger.warning('cannot get url %s. please check httphandler...' % url)
             return ('disabled', 'disabled')
         response = EsResponse(response)
         try:
             data, filetype = self.contentResponseHandle(response)
             if data is None:
                 Logger.debug('data is None')
                 raise Exception
             name = self.contentFileName(response)
         except Exception:
             Logger.error('an error occurred in contentResponseHandle() or contentFileName(). if this happens very often, please check your code')
             self.httpHandler.nextHandler()
             if i == configs.spider.retry - 1:
                 self.uncatchableUrlList.append(url)
                 self.saveUncatchableUrl()
             continue
         break
     if data is None:
         return ('disabled', 'disabled')
     if name is None:
         name = '%s.' % count + filetype
     if not os.path.exists(configs.spider.contentdatapath):
         os.makedirs(configs.spider.contentdatapath)
     if self.parser is None:
         MD5 = buildMD5String(data)
     else:
         try:
             parsedData = '%s' % self.parser.parseContent(data)
             MD5 = buildMD5String(parsedData)
         except Exception:
             Logger.error('An error occurred in parseContent()! Please check your code. Espider will use the whole file as the update md5')
             MD5 = buildMD5String(data)
     filepath = configs.spider.contentdatapath + name
     try:
         # text types are written as utf8 text, everything else as raw bytes
         if filetype in ('html', 'xml', 'json', 'js', 'css'):
             with open(filepath, 'w+', encoding='utf8') as f:
                 f.write(data)
             return (MD5, filepath)
         with open(filepath, 'wb+') as f:
             f.write(data)
     except OSError:
         Logger.error('an error occurred when opening %s' % filepath)
     return (MD5, filepath)
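
The update flag in startEspider() comes from comparing these MD5 digests of the (parsed) content between runs. A sketch of the fingerprint step; buildMD5String is assumed to wrap something like this:

import hashlib

def build_md5_string(data):
    # accept str or bytes, return the hex digest used as the update fingerprint
    if isinstance(data, str):
        data = data.encode('utf8')
    return hashlib.md5(data).hexdigest()

old, new = build_md5_string('version 1'), build_md5_string('version 2')
print('update' if old != new else 'unchanged')  # update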
Example #34
 def contentHandler(self, data):
     Logger.critical('contentHandler() without override! espider is shutting down...')
     exit(1)