예제 #1
0
 def __init__(self):
     """Open the crawler's SQLite database and keep a cursor on it.

     The database file is ``data/db_crawler`` below ``sys.path[0]``.  Any
     failure is reported through ``func.logger`` under this class's name
     and is not re-raised, so ``conn`` and ``cursor`` may be left unset.
     """
     try:
         self.db = sys.path[0] + '/data/db_crawler'
         # check_same_thread=False is passed straight to sqlite3.connect().
         connection = sqlite3.connect(self.db, check_same_thread=False)
         self.conn = connection
         self.cursor = connection.cursor()
     except Exception as e:
         # Best effort: log with a timestamp instead of propagating.
         stamp = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger(self.__class__.__name__, stamp + ' ' + str(e))
예제 #2
0
 def __init__(self):
     """Connect to ``<sys.path[0]>/data/db_crawler`` and create a cursor.

     Sets ``self.db`` (path), ``self.conn`` (connection) and
     ``self.cursor``.  Exceptions are logged via ``func.logger`` with a
     timestamp and swallowed, so the attributes set after the failing
     step stay undefined.
     """
     try:
         db_path = sys.path[0] + '/data/db_crawler'
         self.db = db_path
         self.conn = sqlite3.connect(db_path, check_same_thread=False)
         self.cursor = self.conn.cursor()
     except Exception as e:
         logged_at = time.strftime('%Y-%m-%d %H:%M:%S')
         owner = self.__class__.__name__
         func.logger(owner, logged_at + ' ' + str(e))
예제 #3
0
 def createTable(self, t_name, file):
     """Run the SQL statement stored in ``file`` on this object's cursor.

     ``file`` is a path relative to ``sys.path[0]``; ``t_name`` is accepted
     but not used by this method.  Errors are logged through
     ``func.logger`` and swallowed.
     """
     try:
         sql_path = sys.path[0] + '/' + file
         with open(sql_path, 'r') as handle:
             statement = handle.read()
             self.cursor.execute(statement)
     except Exception as e:
         stamp = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger(self.__class__.__name__, stamp + ' ' + str(e))
예제 #4
0
 def createTable(self, t_name, file):
     """Execute the contents of an SQL file through ``self.cursor``.

     The file is looked up as ``sys.path[0] + '/' + file`` and its whole
     text is executed as one statement.  ``t_name`` is not used here.
     Any exception is logged via ``func.logger`` and not re-raised.
     """
     try:
         with open(sys.path[0] + '/' + file, 'r') as sql_file:
             self.cursor.execute(sql_file.read())
     except Exception as e:
         owner = self.__class__.__name__
         logged_at = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger(owner, logged_at + ' ' + str(e))
예제 #5
0
 def fire(self):
     """Request pages 1 through 30 one after another and log a summary.

     Each ``singleRequest(page)`` result is added to ``self.total_new``.
     An exception stops the loop, is printed and logged; the summary line
     (built from ``t0``..``t3`` and ``total_new``) is logged in every case.
     """
     try:
         for page in range(1, 31):
             self.total_new += self.singleRequest(page)
     except Exception as e:
         reason = str(e)
         print('Error: ' + reason)
         stamp = time.strftime('%Y-%m-%d %H:%M:%S ')
         func.logger('crawler', stamp + '[error] ' + reason)
     finally:
         fields = (time.strftime('%Y-%m-%d %H:%M:%S'), self.t0,
                   self.total_new, self.t1, self.t2, self.t3)
         msg = '%s Time cost(Synchro):%.4f New item:%d Request:%.4f Select:%.4f Save:%.4f' % fields
         func.logger('crawler', msg)
예제 #6
0
 def fetchPageContent(self, post=None):
     """POST ``post`` to the listing endpoint and return the result list.

     Args:
         post: Mapping of form fields to send.  ``None`` (the default)
             sends an empty form body, as the former ``{}`` default did.

     Returns:
         ``d['content']['result']`` from the decoded JSON reply, or ``[]``
         when anything fails (the error is logged via ``func.logger``).
     """
     # A fresh dict per call avoids sharing one mutable default object.
     if post is None:
         post = {}
     try:
         body = urllib.parse.urlencode(post).encode('utf-8')
         # The context manager closes the HTTP response even when reading
         # or decoding fails; previously the response was never closed.
         with urllib.request.urlopen(
                 url=self.url_base + self.url_params,
                 data=body,
                 timeout=2) as f:
             raw = f.read().decode('utf-8')
         d = json.loads(raw)
         return d['content']['result']
     except Exception as e:
         # Every failure (network, decoding, missing keys) is reported
         # under the same "[network]" tag and turned into an empty list.
         msg = time.strftime(
             '%Y-%m-%d %H:%M:%S') + ' [error][network] ' + str(e)
         func.logger('crawler', msg)
         return []
예제 #7
0
 def fetchPageContent(self, post=None):
     """Fetch one listing page via HTTP POST and return its result rows.

     Args:
         post: Form fields to url-encode into the request body.  When
             omitted (``None``) an empty form is sent, matching the
             behaviour of the previous ``{}`` default.

     Returns:
         The value at ``['content']['result']`` of the JSON response, or
         an empty list if the request or parsing raised; the exception is
         logged through ``func.logger`` and not propagated.
     """
     # Avoid a mutable default argument: build the empty dict per call.
     post = {} if post is None else post
     try:
         form = urllib.parse.urlencode(post).encode('utf-8')
         # ``with`` guarantees the response is closed; it used to leak.
         with urllib.request.urlopen(
                 url=self.url_base + self.url_params,
                 data=form,
                 timeout=2) as response:
             text = response.read().decode('utf-8')
         return json.loads(text)['content']['result']
     except Exception as e:
         # All failures are logged with the "[network]" tag.
         msg = time.strftime(
             '%Y-%m-%d %H:%M:%S') + ' [error][network] ' + str(e)
         func.logger('crawler', msg)
         return []
예제 #8
0
 def singleRequest(self, i):
     '''
     Fetch result page ``i`` with aiohttp (not fetchPageContent), store
     its items and add the stored count to ``self.total_new``.

     Generator-based coroutine; failures are logged, not raised.
     '''
     try:
         target = self.url_base + self.url_params
         response = yield from aiohttp.request('post', url=target, data={'pn': i})
         pending_body = response.read_and_close(decode=True)
         d = yield from asyncio.wait_for(pending_body, timeout=1)
         # save data
         added = self.savePageContent(d['content']['result'])
         print('Page %2d : %d items were added' % (i, added))
         self.total_new += added
     except Exception as e:
         stamp = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger('crawler', stamp + " [error][asyncio] " + str(e))
예제 #9
0
 def __init__(self):
     """Set up crawler state and prepare the insert statement.

     Initialises the new-item counter, the two table names and the URL
     parts, then asks ``model.dbSqlite()`` for the insert query and its
     parameter list for ``self.table``.

     Raises:
         SystemExit: if the model or the insert query cannot be prepared.
             The error is logged first; the exit status is unchanged
             (no status argument is passed).
     """
     import sys  # local import so this block does not rely on file-level imports

     self.total_new = 0
     self.table = 'lagou_basic'
     self.table2 = 'lagou_company_label'
     self.url_base = 'http://www.lagou.com'
     self.url_params = '/jobs/positionAjax.json?px=new'
     try:
         # generate insert query
         self.model = model.dbSqlite()
         self.iq_1 = self.model.insertQuery(self.table)
         self.ip_1 = self.model.insertParam(self.table)
     except Exception as e:
         # The leading space keeps the tag from running into the
         # timestamp, as in the other log messages of this crawler.
         msg = time.strftime('%Y-%m-%d %H:%M:%S') + ' [Error][Init] ' + str(e)
         func.logger('crawler', msg)
         # sys.exit() instead of the site-provided exit() helper, which is
         # intended for interactive sessions and is not always defined.
         sys.exit()
예제 #10
0
 def __init__(self):
     """Initialise counters, table names, URLs and the insert query.

     ``self.iq_1`` / ``self.ip_1`` are obtained from a fresh
     ``model.dbSqlite()`` for the ``lagou_basic`` table.

     Raises:
         SystemExit: when preparing the model or query fails.  The error
             is logged before exiting; no exit status is passed, as
             before.
     """
     import sys  # local import so this block does not rely on file-level imports

     self.total_new = 0
     self.table = 'lagou_basic'
     self.table2 = 'lagou_company_label'
     self.url_base = 'http://www.lagou.com'
     self.url_params = '/jobs/positionAjax.json?px=new'
     try:
         # generate insert query
         self.model = model.dbSqlite()
         self.iq_1 = self.model.insertQuery(self.table)
         self.ip_1 = self.model.insertParam(self.table)
     except Exception as e:
         # A space now separates the timestamp from the "[Error][Init]"
         # tag; the two used to be glued together in the log line.
         msg = time.strftime(
             '%Y-%m-%d %H:%M:%S') + ' [Error][Init] ' + str(e)
         func.logger('crawler', msg)
         # exit() is injected by the site module for interactive use;
         # sys.exit() is the programmatic equivalent.
         sys.exit()
예제 #11
0
 def fire(self):
     """Run ``singleRequest`` for pages 1..30 concurrently and log a summary.

     All requests are scheduled on the current event loop and awaited
     together with an overall timeout of 100 seconds.  Afterwards the
     session and the loop are closed.  Exceptions are printed and logged
     to ``crawler_error``; the timing summary is logged in every case.
     """
     # Taken before the try block so the finally clause can always
     # compute the elapsed time.
     s = time.time()
     try:
         loop = asyncio.get_event_loop()
         # asyncio.wait() rejects bare coroutine objects on Python 3.11+,
         # so each coroutine is scheduled as a future on the loop first.
         tasks = [asyncio.ensure_future(self.singleRequest(i), loop=loop)
                  for i in range(1, 31)]
         loop.run_until_complete(asyncio.wait(tasks, timeout=100))
         self.session.close()
         loop.close()
     except Exception as e:
         print('Error: ' + str(e))
         func.logger('crawler_error', time.strftime(
             '%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e))
     finally:
         msg = '%s Time cost(Asynchr):%.4f New item:%d' % (
             time.strftime('%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
         func.logger('crawler', msg)
예제 #12
0
 def fire(self):
     """Crawl pages 1..30 sequentially, then log timing and counts.

     The value returned by ``singleRequest`` for each page is accumulated
     in ``self.total_new``.  An exception aborts the loop and is printed
     and logged; the summary line is written regardless.
     """
     try:
         for page in range(1, 31):
             self.total_new += self.singleRequest(page)
     except Exception as e:
         detail = str(e)
         print("Error: " + detail)
         logged_at = time.strftime("%Y-%m-%d %H:%M:%S ")
         func.logger("crawler", logged_at + "[error] " + detail)
     finally:
         template = "%s Time cost(Synchro):%.4f New item:%d Request:%.4f Select:%.4f Save:%.4f"
         values = (time.strftime("%Y-%m-%d %H:%M:%S"), self.t0,
                   self.total_new, self.t1, self.t2, self.t3)
         func.logger("crawler", template % values)
예제 #13
0
 def addRecord(self, data):
     """Insert one position and its company labels; return success.

     The main row is built by looking up every name in ``self.ip_1`` in
     ``data`` and is committed first.  One ``(positionId, label)`` row per
     entry of ``data['companyLabelList']`` is then inserted into
     ``self.table2`` and committed.

     Returns:
         ``True`` when both inserts succeed, ``False`` after logging any
         exception.
     """
     try:
         row = [data.get(column) for column in self.ip_1]
         self.model.cursor.execute(self.iq_1, row)
         self.model.conn.commit()
         # insert into lagou_company_label
         label_rows = [(data['positionId'], label)
                       for label in data['companyLabelList']]
         label_sql = "insert into " + self.table2 + " (position_id, label) values(?,?)"
         self.model.cursor.executemany(label_sql, label_rows)
         self.model.conn.commit()
         return True
     except Exception as e:
         stamp = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger('crawler', stamp + ' [error][database] ' + str(e))
         return False
예제 #14
0
 async def singleRequest(self, i):
     '''
     Fetch result page ``i`` through the shared aiohttp session (not
     fetchPageContent), save its rows and add the saved count to
     ``self.total_new``.  Failures are logged to ``crawler_error``.
     '''
     try:
         started = time.time()
         print('start request :' + str(i))
         endpoint = self.url_base + self.url_params
         async with self.session.post(url=endpoint, data={'pn': i}) as response:
             payload = await response.json()
             # save data
             rows = payload['content']['positionResult']['result']
             added = self.savePageContent(rows)
             elapsed = time.time() - started
             print('Page %2d : %d items added, using %.4f secs' % (i, added, elapsed))
             self.total_new += added
     except Exception as e:
         stamp = time.strftime('%Y-%m-%d %H:%M:%S')
         msg = stamp + " [error][asyncio] " + str(i) + ' ' + str(e)
         func.logger('crawler_error', msg)
예제 #15
0
 def singleRequest(self, i):
     '''
     Request page ``i`` via aiohttp rather than fetchPageContent, save the
     returned rows and increase ``self.total_new`` by the number saved.

     Generator-based coroutine; any exception is logged and swallowed.
     '''
     try:
         endpoint = self.url_base + self.url_params
         form = {'pn': i}
         response = yield from aiohttp.request('post', url=endpoint, data=form)
         reader = response.read_and_close(decode=True)
         d = yield from asyncio.wait_for(reader, timeout=1)
         # save data
         saved = self.savePageContent(d['content']['result'])
         print('Page %2d : %d items were added' % (i, saved))
         self.total_new += saved
     except Exception as e:
         logged_at = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger('crawler', logged_at + " [error][asyncio] " + str(e))
예제 #16
0
 def addRecord(self, data):
     """Store a position record plus its company labels.

     First inserts the values of ``data`` named by ``self.ip_1`` using the
     prepared query ``self.iq_1`` and commits.  Then inserts one
     ``(position_id, label)`` pair per item of
     ``data['companyLabelList']`` into ``self.table2`` and commits again.

     Returns:
         ``True`` on success; ``False`` if any step raised (the error is
         logged via ``func.logger``).
     """
     try:
         cursor = self.model.cursor
         values = [data.get(name) for name in self.ip_1]
         cursor.execute(self.iq_1, values)
         self.model.conn.commit()
         # insert into lagou_company_label
         pairs = [(data['positionId'], tag) for tag in data['companyLabelList']]
         q = "insert into " + self.table2 + " (position_id, label) values(?,?)"
         self.model.cursor.executemany(q, pairs)
         self.model.conn.commit()
         return True
     except Exception as e:
         logged_at = time.strftime('%Y-%m-%d %H:%M:%S')
         func.logger('crawler', logged_at + ' [error][database] ' + str(e))
         return False
예제 #17
0
    def fire(self):
        """Crawl pages 1..30 with a pool of daemon worker threads.

        Starts ten threads running ``self.working``, queues the page
        numbers 1 through 30 on ``self.task_queue`` and waits on the
        queue's ``join()``.  Errors are logged to ``crawler_error``; a
        timing summary is logged to ``crawler`` in every case.
        """
        # Taken before the try block so the finally clause can always
        # compute the elapsed time.
        s = time.time()
        try:
            # set 10 threads (range(1, 10) previously started only 9)
            for _ in range(10):
                # daemon=True replaces the deprecated Thread.setDaemon()
                t = threading.Thread(target=self.working, daemon=True)
                t.start()
            # put task into queue
            for i in range(1, 31):
                self.task_queue.put(i)
            # Block until the queue reports completion.
            # NOTE(review): assumes task_queue is a queue.Queue whose
            # workers call task_done() -- confirm in self.working.
            self.task_queue.join()

        except Exception as e:
            func.logger('crawler_error', time.strftime(
                '%Y-%m-%d %H:%M:%S ') + '[error][main] ' + str(e))
        finally:
            msg = '%s Time cost(Threads):%.4f New item:%d' % (
                time.strftime('%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
            func.logger('crawler', msg)
예제 #18
0
    def fire(self):
        """Run the crawl on ten daemon threads and log how long it took.

        Each thread executes ``self.working``; page numbers 1 through 30
        are put on ``self.task_queue`` and the method waits on the queue's
        ``join()``.  Exceptions are logged with the ``[error][main]`` tag;
        the summary line is logged whether or not an error occurred.
        """
        # Bound outside the try block so ``finally`` can always use it.
        s = time.time()
        try:
            # set 10 threads (the former range(1, 10) created only 9)
            for _ in range(10):
                # Thread.setDaemon() is deprecated; pass daemon=True.
                t = threading.Thread(target=self.working, daemon=True)
                t.start()
            # put task into queue
            for i in range(1, 31):
                self.task_queue.put(i)
            # Wait for the queue to report that all work is finished.
            # NOTE(review): presumably a queue.Queue drained by
            # self.working via task_done() -- verify.
            self.task_queue.join()

        except Exception as e:
            func.logger(
                'crawler',
                time.strftime('%Y-%m-%d %H:%M:%S ') + '[error][main] ' +
                str(e))
        finally:
            msg = '%s Time cost(Threads):%.4f New item:%d' % (time.strftime(
                '%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
            func.logger('crawler', msg)
예제 #19
0
                '%Y-%m-%d %H:%M:%S') + " [error][asyncio] " + str(e)
            func.logger('crawler', msg)

    @asyncio.coroutine
    def bug():
        """Raise ``Exception("not consumed")`` unconditionally."""
        # NOTE(review): declared without a ``self`` parameter although it
        # sits at method indentation, and nothing in the visible code
        # calls it -- presumably a leftover debugging aid; confirm before
        # relying on it.
        raise Exception("not consumed")

    # trigger
    def fire(self):
        """Run ``singleRequest`` for pages 1..30 on the event loop.

        Schedules one future per page, waits for all of them, then closes
        the loop.  Exceptions are printed and logged; a timing summary is
        logged in every case.
        """
        # Bound before the try block so ``finally`` can always use it.
        s = time.time()
        try:
            loop = asyncio.get_event_loop()
            # ``asyncio.async`` is a syntax error since ``async`` became a
            # keyword (Python 3.7); ensure_future() is its replacement.
            tasks = [
                asyncio.ensure_future(self.singleRequest(i), loop=loop)
                for i in range(1, 31)
            ]
            loop.run_until_complete(asyncio.wait(tasks))
            loop.close()
        except Exception as e:
            print('Error: ' + str(e))
            func.logger(
                'crawler',
                time.strftime('%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e))
        finally:
            msg = '%s Time cost(Asynchr):%.4f New item:%d' % (time.strftime(
                '%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
            func.logger('crawler', msg)


# Script entry point: build a CrawlerAsync and start crawling right away.
# These statements sit at module level without a __main__ guard, so they
# run whenever this file is executed or imported.
a = CrawlerAsync()
a.fire()
예제 #20
0
        except Exception as e:
            msg = time.strftime(
                '%Y-%m-%d %H:%M:%S') + " [error][asyncio] " + str(e)
            func.logger('crawler', msg)

    @asyncio.coroutine
    def bug():
        """Raise ``Exception("not consumed")`` unconditionally."""
        # NOTE(review): takes no ``self`` even though it is indented like a
        # method, and no visible code calls it -- looks like a debugging
        # leftover; verify before use.
        raise Exception("not consumed")

    # trigger
    def fire(self):
        """Crawl pages 1..30 concurrently on the asyncio event loop.

        One future is scheduled per page via ``singleRequest``; the method
        blocks until all have finished and then closes the loop.  Errors
        are printed and logged, and the elapsed time plus the number of
        new items is logged whether or not an error occurred.
        """
        # Started outside the try block so the summary in ``finally`` can
        # always reference it.
        s = time.time()
        try:
            loop = asyncio.get_event_loop()
            # asyncio.async() cannot be parsed on Python 3.7+ (``async``
            # is a keyword); asyncio.ensure_future() is the replacement.
            tasks = [asyncio.ensure_future(self.singleRequest(i), loop=loop)
                     for i in range(1, 31)]
            loop.run_until_complete(asyncio.wait(tasks))
            loop.close()
        except Exception as e:
            print('Error: ' + str(e))
            func.logger('crawler', time.strftime(
                '%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e))
        finally:
            msg = '%s Time cost(Asynchr):%.4f New item:%d' % (
                time.strftime('%Y-%m-%d %H:%M:%S'), time.time() - s, self.total_new)
            func.logger('crawler', msg)


# Script entry point: create the asynchronous crawler and run it once.
# There is no __main__ guard, so this also runs when the module is imported.
a = CrawlerAsync()
a.fire()