def __init__(self):
    """Open the crawler's SQLite database and keep a cursor on it.

    The database file is ``data/db_crawler`` under ``sys.path[0]``. The
    connection is opened with ``check_same_thread=False``.

    Nothing is raised on failure: the error is written through
    ``func.logger`` under this class's name, so ``self.conn`` and
    ``self.cursor`` may be missing afterwards.
    """
    try:
        db_path = sys.path[0] + '/data/db_crawler'
        self.db = db_path
        connection = sqlite3.connect(db_path, check_same_thread=False)
        self.conn = connection
        self.cursor = connection.cursor()
    except Exception as err:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        func.logger(self.__class__.__name__, stamp + ' ' + str(err))
def __init__(self):
    """Connect to the SQLite file ``data/db_crawler`` under ``sys.path[0]``.

    Sets ``self.db`` (the path), ``self.conn`` (a connection opened with
    ``check_same_thread=False``) and ``self.cursor``. Errors are not
    propagated; they are reported via ``func.logger`` using the class name
    as the log channel.
    """
    try:
        location = sys.path[0] + '/data/db_crawler'
        self.db = location
        self.conn = sqlite3.connect(location, check_same_thread=False)
        self.cursor = self.conn.cursor()
    except Exception as exc:
        channel = self.__class__.__name__
        entry = '%s %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), str(exc))
        func.logger(channel, entry)
def createTable(self, t_name, file):
    """Run the SQL statement stored in ``file`` on this object's cursor.

    Args:
        t_name: Not used by this method.
        file: Path of the SQL file, relative to ``sys.path[0]``.

    Any failure (unreadable file, SQL error, ...) is logged through
    ``func.logger`` under this class's name and is not raised.
    """
    try:
        sql_path = sys.path[0] + '/' + file
        with open(sql_path, 'r') as handle:
            statement = handle.read()
        self.cursor.execute(statement)
    except Exception as err:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        func.logger(self.__class__.__name__, stamp + ' ' + str(err))
def createTable(self, t_name, file):
    """Read a SQL script from disk and execute it with ``self.cursor``.

    Args:
        t_name: Accepted but not referenced here.
        file: File name appended to ``sys.path[0] + '/'`` to locate the SQL.

    Exceptions are swallowed after being reported via ``func.logger``.
    """
    try:
        with open(sys.path[0] + '/' + file, 'r') as sql_file:
            query_text = sql_file.read()
        cursor = self.cursor
        cursor.execute(query_text)
    except Exception as exc:
        channel = self.__class__.__name__
        entry = '%s %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), str(exc))
        func.logger(channel, entry)
def fire(self):
    """Request pages 1 through 30 one after another and log a summary.

    The value returned by ``self.singleRequest(page)`` is added to
    ``self.total_new`` for every page. An exception stops the loop, is
    printed and logged to the ``crawler`` channel. The summary line (using
    ``self.t0`` .. ``self.t3`` and ``self.total_new``) is logged in every
    case.
    """
    try:
        for page in range(1, 31):
            self.total_new += self.singleRequest(page)
    except Exception as err:
        reason = str(err)
        print('Error: ' + reason)
        stamp = time.strftime('%Y-%m-%d %H:%M:%S ')
        func.logger('crawler', stamp + '[error] ' + reason)
    finally:
        template = ('%s Time cost(Synchro):%.4f New item:%d '
                    'Request:%.4f Select:%.4f Save:%.4f')
        fields = (
            time.strftime('%Y-%m-%d %H:%M:%S'),
            self.t0,
            self.total_new,
            self.t1,
            self.t2,
            self.t3,
        )
        func.logger('crawler', template % fields)
def fetchPageContent(self, post=None):
    """POST ``post`` to the listing endpoint and return the result list.

    Args:
        post: Mapping of form fields to send. ``None`` (the default) sends
            an empty form body, as the previous ``{}`` default did.

    Returns:
        The value stored at ``['content']['result']`` in the decoded JSON
        reply, or ``[]`` when the request, decoding or lookup fails. The
        failure is logged to the ``crawler`` channel and not raised.
    """
    # Build the mapping per call: a literal ``{}`` default is one shared
    # object that would leak state between calls if it were ever mutated.
    form = {} if post is None else post
    try:
        payload = urllib.parse.urlencode(form).encode('utf-8')
        # The context manager closes the HTTP response even when reading or
        # decoding raises.
        with urllib.request.urlopen(
                url=self.url_base + self.url_params,
                data=payload,
                timeout=2) as reply:
            body = reply.read().decode('utf-8')
        d = json.loads(body)
        return d['content']['result']
    except Exception as e:
        msg = time.strftime(
            '%Y-%m-%d %H:%M:%S') + ' [error][network] ' + str(e)
        func.logger('crawler', msg)
        return []
def singleRequest(self, i):
    '''
    Fetch page ``i`` with aiohttp (fetchPageContent is deliberately not
    used here) and store its results.

    This is a generator that uses ``yield from``, so it has to be driven as
    a coroutine. The number returned by ``self.savePageContent`` is printed
    and added to ``self.total_new``. Any exception is logged to the
    ``crawler`` channel and swallowed.
    '''
    try:
        endpoint = self.url_base + self.url_params
        response = yield from aiohttp.request(
            'post', url=endpoint, data={'pn': i})
        # Give up if the body has not arrived within one second.
        pending_body = response.read_and_close(decode=True)
        payload = yield from asyncio.wait_for(pending_body, timeout=1)
        # save data
        added = self.savePageContent(payload['content']['result'])
        print('Page %2d : %d items were added' % (i, added))
        self.total_new += added
    except Exception as err:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        func.logger('crawler', stamp + " [error][asyncio] " + str(err))
def __init__(self):
    """Set the crawl targets and prepare the insert statement pieces.

    ``self.iq_1`` and ``self.ip_1`` are obtained from
    ``model.dbSqlite().insertQuery`` / ``insertParam`` for ``self.table``.
    If that preparation raises, the error is logged to the ``crawler``
    channel and ``exit()`` is called.
    """
    self.total_new = 0
    self.table, self.table2 = 'lagou_basic', 'lagou_company_label'
    self.url_base = 'http://www.lagou.com'
    self.url_params = '/jobs/positionAjax.json?px=new'
    try:
        # generate insert query
        database = model.dbSqlite()
        self.model = database
        self.iq_1 = database.insertQuery(self.table)
        self.ip_1 = database.insertParam(self.table)
    except Exception as err:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        func.logger('crawler', stamp + '[Error][Init] ' + str(err))
        exit()
def __init__(self):
    """Initialise counters, table names, URLs and the insert query.

    Creates ``self.model`` from ``model.dbSqlite()`` and asks it for the
    insert query (``self.iq_1``) and parameter list (``self.ip_1``) of
    ``self.table``. A failure there is logged to ``crawler`` and followed
    by ``exit()``.
    """
    self.total_new = 0
    self.table = 'lagou_basic'
    self.table2 = 'lagou_company_label'
    self.url_base = 'http://www.lagou.com'
    self.url_params = '/jobs/positionAjax.json?px=new'
    try:
        # generate insert query
        self.model = model.dbSqlite()
        target_table = self.table
        self.iq_1 = self.model.insertQuery(target_table)
        self.ip_1 = self.model.insertParam(target_table)
    except Exception as exc:
        entry = '%s[Error][Init] %s' % (
            time.strftime('%Y-%m-%d %H:%M:%S'), str(exc))
        func.logger('crawler', entry)
        exit()
def fire(self):
    """Fetch pages 1 through 30 concurrently on the event loop and log a summary.

    One ``self.singleRequest(i)`` task is scheduled per page and the loop
    runs until all of them finish or 100 seconds elapse. Afterwards the
    aiohttp session and the loop are closed. Exceptions are printed and
    logged to ``crawler_error``. The elapsed time and ``self.total_new``
    are always logged to ``crawler``.
    """
    # Start the clock before the try so the summary in ``finally`` can
    # always read it.
    s = time.time()
    try:
        loop = asyncio.get_event_loop()
        # asyncio.wait() rejects bare coroutine objects on Python 3.11+, so
        # wrap each coroutine in a Task first.
        tasks = [loop.create_task(self.singleRequest(i))
                 for i in range(1, 31)]
        loop.run_until_complete(asyncio.wait(tasks, timeout=100))
        # Depending on the aiohttp version, close() is either a plain call
        # or a coroutine; a coroutine has to be run to actually close.
        closing = self.session.close()
        if asyncio.iscoroutine(closing):
            loop.run_until_complete(closing)
        loop.close()
    except Exception as e:
        print('Error: ' + str(e))
        func.logger('crawler_error', time.strftime(
            '%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e))
    finally:
        msg = '%s Time cost(Asynchr):%.4f New item:%d' % (
            time.strftime('%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
        func.logger('crawler', msg)
def fire(self):
    """Crawl pages 1-30 sequentially, then log timing and item counts.

    Each ``self.singleRequest(page_no)`` result is accumulated into
    ``self.total_new``. On an exception the loop ends, the error is printed
    and logged to ``crawler``. The final summary built from ``self.t0``,
    ``self.t1``, ``self.t2``, ``self.t3`` and ``self.total_new`` is logged
    regardless.
    """
    try:
        page_no = 1
        while page_no <= 30:
            self.total_new = self.total_new + self.singleRequest(page_no)
            page_no += 1
    except Exception as exc:
        print("Error: " + str(exc))
        prefix = time.strftime("%Y-%m-%d %H:%M:%S ") + "[error] "
        func.logger("crawler", prefix + str(exc))
    finally:
        now = time.strftime("%Y-%m-%d %H:%M:%S")
        summary = (
            "%s Time cost(Synchro):%.4f New item:%d Request:%.4f Select:%.4f Save:%.4f"
            % (now, self.t0, self.total_new, self.t1, self.t2, self.t3)
        )
        func.logger("crawler", summary)
def addRecord(self, data):
    """Insert one position row plus one label row per company label.

    Args:
        data: Mapping for a single position. The values for the basic row
            are looked up with ``data.get`` for each name in ``self.ip_1``;
            ``data['positionId']`` and ``data['companyLabelList']`` feed the
            label table named by ``self.table2``.

    Returns:
        True when both inserts were committed, False when anything raised
        (the error is logged to the ``crawler`` channel). The basic row is
        committed before the labels are written.
    """
    try:
        cursor = self.model.cursor
        connection = self.model.conn

        basic_values = [data.get(column) for column in self.ip_1]
        cursor.execute(self.iq_1, basic_values)
        connection.commit()

        # insert into lagou_company_label
        label_rows = [(data['positionId'], label)
                      for label in data['companyLabelList']]
        label_query = ("insert into " + self.table2 +
                       " (position_id, label) values(?,?)")
        cursor.executemany(label_query, label_rows)
        connection.commit()
        return True
    except Exception as err:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        func.logger('crawler', stamp + ' [error][database] ' + str(err))
        return False
async def singleRequest(self, i):
    '''
    Request page ``i`` through the shared aiohttp session
    (fetchPageContent is deliberately not used here) and save the results.

    The count returned by ``self.savePageContent`` is printed with the
    elapsed time and added to ``self.total_new``. Any exception is logged
    to ``crawler_error`` together with the page number and swallowed.
    '''
    try:
        started = time.time()
        print('start request :' + str(i))
        endpoint = self.url_base + self.url_params
        async with self.session.post(url=endpoint, data={'pn': i}) as response:
            payload = await response.json()
        # save data
        positions = payload['content']['positionResult']['result']
        added = self.savePageContent(positions)
        elapsed = time.time() - started
        print('Page %2d : %d items added, using %.4f secs' % (i, added, elapsed))
        self.total_new += added
    except Exception as err:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        func.logger(
            'crawler_error',
            stamp + " [error][asyncio] " + str(i) + ' ' + str(err))
def singleRequest(self, i):
    '''
    Download page ``i`` via aiohttp (fetchPageContent is deliberately not
    used here) and hand the result list to ``self.savePageContent``.

    Written as a ``yield from`` generator, so it must be run as a
    coroutine. The saved-item count is printed and added to
    ``self.total_new``; errors are logged to ``crawler`` and not raised.
    '''
    try:
        response = yield from aiohttp.request(
            'post',
            url=self.url_base + self.url_params,
            data={'pn': i},
        )
        # The body read is limited to one second.
        body = yield from asyncio.wait_for(
            response.read_and_close(decode=True), timeout=1)
        results = body['content']['result']
        # save data
        saved = self.savePageContent(results)
        print('Page %2d : %d items were added' % (i, saved))
        self.total_new = self.total_new + saved
    except Exception as exc:
        entry = time.strftime('%Y-%m-%d %H:%M:%S') + " [error][asyncio] " + str(exc)
        func.logger('crawler', entry)
def fire(self):
    """Crawl pages 1 through 30 with a pool of worker threads and log a summary.

    Ten daemon threads running ``self.working`` are started, the page
    numbers 1..30 are put on ``self.task_queue``, and the call blocks on
    ``self.task_queue.join()``. Exceptions are logged to ``crawler_error``.
    The elapsed time and ``self.total_new`` are always logged to
    ``crawler``.
    """
    # Start the clock before the try so the summary in ``finally`` can
    # always read it.
    s = time.time()
    try:
        # set 10 threads (range(1, 10) only started nine)
        for _ in range(10):
            # daemon=True replaces the deprecated Thread.setDaemon().
            t = threading.Thread(target=self.working, daemon=True)
            t.start()
        # put task into queue
        for i in range(1, 31):
            self.task_queue.put(i)
        # block threads. continue until all threads finished
        self.task_queue.join()
    except Exception as e:
        func.logger('crawler_error', time.strftime(
            '%Y-%m-%d %H:%M:%S ') + '[error][main] ' + str(e))
    finally:
        msg = '%s Time cost(Threads):%.4f New item:%d' % (
            time.strftime('%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
        func.logger('crawler', msg)
def fire(self):
    """Crawl pages 1 through 30 with a pool of worker threads and log a summary.

    Ten daemon threads running ``self.working`` are started, the page
    numbers 1..30 are put on ``self.task_queue``, and the call blocks on
    ``self.task_queue.join()``. Exceptions and the final summary (elapsed
    time and ``self.total_new``) are both logged to ``crawler``.
    """
    # Start the clock before the try so the summary in ``finally`` can
    # always read it.
    s = time.time()
    try:
        # set 10 threads (range(1, 10) only started nine)
        for _ in range(10):
            # daemon=True replaces the deprecated Thread.setDaemon().
            t = threading.Thread(target=self.working, daemon=True)
            t.start()
        # put task into queue
        for i in range(1, 31):
            self.task_queue.put(i)
        # block threads. continue until all threads finished
        self.task_queue.join()
    except Exception as e:
        func.logger(
            'crawler',
            time.strftime('%Y-%m-%d %H:%M:%S ') + '[error][main] ' + str(e))
    finally:
        msg = '%s Time cost(Threads):%.4f New item:%d' % (time.strftime(
            '%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
        func.logger('crawler', msg)
'%Y-%m-%d %H:%M:%S') + " [error][asyncio] " + str(e) func.logger('crawler', msg) @asyncio.coroutine def bug(): raise Exception("not consumed") # trigger def fire(self): try: s = time.time() loop = asyncio.get_event_loop() tasks = [ asyncio. async (self.singleRequest(i)) for i in range(1, 31) ] loop.run_until_complete(asyncio.wait(tasks)) loop.close() except Exception as e: print('Error: ' + str(e)) func.logger( 'crawler', time.strftime('%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e)) finally: msg = '%s Time cost(Asynchr):%.4f New item:%d' % (time.strftime( '%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new) func.logger('crawler', msg) a = CrawlerAsync() a.fire()
except Exception as e: msg = time.strftime( '%Y-%m-%d %H:%M:%S') + " [error][asyncio] " + str(e) func.logger('crawler', msg) @asyncio.coroutine def bug(): raise Exception("not consumed") # trigger def fire(self): try: s = time.time() loop = asyncio.get_event_loop() tasks = [asyncio.async(self.singleRequest(i)) for i in range(1, 31)] loop.run_until_complete(asyncio.wait(tasks)) loop.close() except Exception as e: print('Error: ' + str(e)) func.logger('crawler', time.strftime( '%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e)) finally: msg = '%s Time cost(Asynchr):%.4f New item:%d' % ( time.strftime('%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new) func.logger('crawler', msg) a = CrawlerAsync() a.fire()