Example No. 2
import json

import requests

# 'headers' is assumed to be defined elsewhere in the module


def getcontent():
    global j
    db = MongoRedisUrlManager()  # reuse one connection instead of reconnecting on every page
    while j < 100000:
        try:
            url = ('http://www.creditbj.gov.cn/xyData/front/creditService/'
                   'getPageList.shtml?pageNo=%s&keyword=&typeId=19' % j)
            data = requests.get(url=url, headers=headers)
            a = json.loads(data.text)
            for i in a['hits']['hits']:
                print('Inserting data for page', j)
                #print(i['_source'])
                db.db.credit.insert(i['_source'])
        except Exception as err:
            # log the failure instead of silently swallowing it
            print('page %s failed: %s' % (j, err))
        j += 1
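
# The hard-coded bound of 100000 keeps issuing requests long after the data
# runs out. A minimal sketch of an alternative stop rule, assuming the endpoint
# returns an empty hits list past the last page (inferred from the response
# shape above, not confirmed against the API):
def getcontent_until_empty(db, headers, start_page=0):
    page = start_page
    while True:
        url = ('http://www.creditbj.gov.cn/xyData/front/creditService/'
               'getPageList.shtml?pageNo=%s&keyword=&typeId=19' % page)
        hits = requests.get(url, headers=headers).json()['hits']['hits']
        if not hits:  # assumed sentinel: an empty page marks the end of the data
            break
        for record in hits:
            db.db.credit.insert(record['_source'])
        page += 1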
Example No. 3
#webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] ='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

import re
import time

from selenium import webdriver

re_compiled_obj = re.compile(r'\d+')  # raw string avoids the invalid '\d' escape warning

constants = {
    'MAX_PAGE_TRIED': 2,
    'HB_PERIOD': 5,
    'MAX_SIZE_THREADPOOL': 5,
    'CRAWL_DELAY': 2
}
# Initialize system variables
#dir_name = 'mfw/'

# db manager
webdrivers = {}
dbmanager = MongoRedisUrlManager()

is_root_page = True
threads = []
options = webdriver.ChromeOptions()
# set the browser language to Chinese
options.add_argument('lang=zh_CN.UTF-8')
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
# use hdfs to save pages
# hdfs_client = InsecureClient('http://54.223.92.169:50070', user='******')

socket_client = SocketClient('localhost', 20012)
client_id = 0

hb_period = 5
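
# Nothing in this excerpt actually starts a browser. A minimal usage sketch of
# the options configured above; fetch_page and the URL are illustrative, and a
# local chromedriver on PATH is assumed:
def fetch_page(url):
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(constants['CRAWL_DELAY'])  # honour the configured crawl delay
        return driver.page_source
    finally:
        driver.quit()

# html = fetch_page('http://www.mafengwo.cn')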
Example No. 4
class crawler:
    def __init__(self, host, hostport, mongo, mongoport, redis, redisport,
                 web):
        self.status = "RUNNING"
        self.client_id = None
        self.enqued_num = 0
        self.start = False
        self.get = False
        self.socket = SocketClient(host, hostport)
        self.deque = deque()
        self.dbmanager = MongoRedisUrlManager(mongo, mongoport)
        self.dbmanager.enqueuUrl(web, 'new', 0)
        self.dir_name = 'web/'
        self.max_num_thread = 5
        self.CRAWL_DELAY = 5
        self.last_heartbeat_time = time.time()
        self.web = web
        self.request_headers = {
            'host': "www.mafengwo.cn",
            'connection': "keep-alive",
            'cache-control': "no-cache",
            'upgrade-insecure-requests': "1",
            'user-agent':
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
            'accept':
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
        }
        # start a daemon thread that sends heartbeats to the master continually
        if not os.path.exists(self.dir_name):
            os.mkdir(self.dir_name)
        try:
            t = threading.Thread(target=self.heartbeat, daemon=True)
            t.start()
        except Exception as err:
            print("failed to start HEARTBEAT thread, error is " + str(err))
        # give the heartbeat thread time to register and receive the first URL batch
        time.sleep(30)
        if self.get:
            self.thread()
        # intended follow-up: start one worker process per CPU core

    # multiprocessing attempt, not working yet; a possible fix is sketched below
    # def multi_process(self,cpu_max):
    #     print("started multi_process")
    #     process=[]

    #     for i in range(cpu_max):
    #         p=multiprocessing.Process(target=self.thread, name=None,daemon=True)
    #         process.append(p)

    #     for p in process:
    #         p.start()
    #         p.join()
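
    # One likely reason the attempt above hangs: p.join() inside the same loop
    # as p.start() runs the workers one at a time, and each self.thread() call
    # loops forever, so the first join() never returns. A minimal sketch of the
    # usual start-all-then-join-all pattern (assumes self.thread is picklable
    # in the child processes, e.g. fork on Linux -- an assumption, not tested):
    #
    # def multi_process(self, cpu_max):
    #     processes = [multiprocessing.Process(target=self.thread, daemon=True)
    #                  for _ in range(cpu_max)]
    #     for p in processes:
    #         p.start()    # launch every worker before blocking on any of them
    #     for p in processes:
    #         p.join()     # then wait for all workers together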

    def thread(self):
        print("starting thread")
        threads = []
        while True:
            if self.status != "RUNNING":
                time.sleep(1)  # paused: avoid spinning in a busy loop
                continue
            # first drop all finished worker threads
            threads = [t for t in threads if t.is_alive()]
            if len(threads) >= self.max_num_thread:
                time.sleep(self.CRAWL_DELAY)
                continue
            try:
                curtask = self.deque.pop()
            except IndexError:
                curtask = None
            if curtask is None:
                time.sleep(1)  # queue empty; wait for the heartbeat thread to fetch more
                continue
            try:
                # daemon thread so the main thread can still exit on ctrl-c
                t = threading.Thread(target=self.get_page_content,
                                     args=(curtask[0], curtask[1]),
                                     daemon=True)
                threads.append(t)
                t.start()
            except Exception as err:
                print("Error: unable to start thread: " + str(err))

    def get_page_content(self, cur_url, depth):
        print("downloading %s at level %d" % (cur_url, depth))
        links = []
        try:
            req = requests.request('GET',
                                   cur_url,
                                   headers=self.request_headers)
            req.encoding = req.apparent_encoding
            html_page = req.text
            filename = cur_url[7:].replace('/', '_')  # strip 'http://' and flatten the path

            # write the page to the local file system
            with open("%s%s.html" % (self.dir_name, filename), 'wb') as fo:
                fo.write(html_page.encode("utf-8"))

            self.dbmanager.finishUrl(cur_url)
        except Exception as err:
            print(err)
            return

        html = etree.HTML(html_page.lower())
        hrefs = html.xpath(u"//a")

        for href in hrefs:
            try:
                if 'href' not in href.attrib:
                    continue
                val = href.attrib['href']
                if 'javascript:' in val:
                    continue
                if not val.startswith('http://'):
                    if val.startswith('/'):
                        val = self.web + val  # make site-relative links absolute
                    else:
                        continue
                val = val.rstrip('/')
                links.append(val)
                while self.status != "RUNNING":
                    time.sleep(5)  # wait here while the crawl is paused
                self.dbmanager.enqueuUrl(val, 'new', depth + 1)
                self.enqued_num += 1
            except ValueError:
                continue

        self.dbmanager.set_url_links(cur_url, links)

    def on_massage(self, server_response):
        # note: "massage"/"MASSAGE" is kept as-is; it is the key the original
        # server protocol uses
        request = server_response
        try:
            massage = request["MASSAGE"]
            if massage == "PAUSE" and self.status == "RUNNING":
                self.status = "PAUSED"
                print("receiving data {} from server".format(request))
            elif massage == "RESUME" and self.status == "PAUSED":
                self.status = "RUNNING"
                print("receiving data {} from server".format(request))
            elif massage == "WAIT" and self.status == "PAUSED":
                self.status = "PAUSED"
            elif massage == "REGISTERED" and self.client_id is None:
                self.client_id = request["CLIENT_ID"]
                print("receiving data {} from server".format(request))
            else:
                print("MASSAGE_TYPE is invalid")
        except KeyError:
            # no "MASSAGE" key: either a URL batch or a bare heartbeat ack
            if "URLS" in request and len(request["URLS"]) > 0:
                for i in request["URLS"]:
                    self.deque.append(i)
                self.get = True
                print("receiving data {} from server".format(request))
            elif request == {}:
                print("normal HEARTBEAT response received")
            else:
                print("invalid request from server")
                return None

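    # Server response shapes handled by on_massage above, inferred from the
    # handler itself (illustrative; the real server may add more fields):
    #   {"MASSAGE": "REGISTERED", "CLIENT_ID": "3"}      registration ack
    #   {"MASSAGE": "PAUSE"} / {"MASSAGE": "RESUME"}     flow control
    #   {"URLS": [["http://www.mafengwo.cn", 0], ...]}   batch of (url, depth) tasks
    #   {}                                               bare heartbeat ack
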
    def heartbeat(self):
        request = {}
        # register this client with the server first
        if self.client_id is None:
            request["MASSAGE_TYPE"] = "REGISTER"
            request["CLIENT_STATUS"] = self.status
            server_response = self.socket.send(json.dumps(request))
            self.on_massage(server_response)
            self.last_heartbeat_time = time.time()
        # infinite loop sending heartbeats and URL requests; when the deque is
        # empty, URL_REQUEST takes priority over HEARTBEAT
        while True:
            if self.enqued_num > 30:
                self.start = False

            if self.start:
                time.sleep(1)  # a URL batch is already pending; avoid a busy spin
                continue

            try:
                # peek: pop and push back to see whether the deque is empty
                a1 = self.deque.pop()
                self.deque.append(a1)
            except IndexError:
                a1 = None
            time1 = time.time()
            if a1 is None and time1 - self.last_heartbeat_time >= 40:
                request["MASSAGE_TYPE"] = "URL_REQUEST"
            elif time1 - self.last_heartbeat_time >= 40:
                request["MASSAGE_TYPE"] = "HEARTBEAT"
            elif a1 is None:
                request["MASSAGE_TYPE"] = "URL_REQUEST"
                self.start = True
            else:
                time.sleep(1)  # nothing to send yet
                continue
            request["CLIENT_STATUS"] = self.status
            request["CLIENT_ID"] = self.client_id
            server_response = self.socket.send(json.dumps(request))
            self.on_massage(server_response)
            self.last_heartbeat_time = time.time()
Example No. 5

                # excerpt from inside the link-extraction loop of a
                # get_page_content-style function
                if 'javascript:' in val:
                    continue
                if not val.startswith('http://'):
                    if val.startswith('/'):
                        val = 'http://www.mafengwo.cn' + val  # make site-relative links absolute
                    else:
                        continue
                val = val.rstrip('/')
                dbmanager.enqueueUrl(val, 'new', depth + 1)
        except ValueError:
            continue

max_num_thread = 5
dbmanager = MongoRedisUrlManager()

dbmanager.enqueueUrl("http://www.mafengwo.cn", 'new', 0)

start_time = time.time()
is_root_page = True
threads = []

CRAWL_DELAY = 0.6

# use hdfs to save pages
# hdfs_client = InsecureClient('http://54.223.92.169:50070', user='******')

while True:
    curtask = dbmanager.dequeueUrl()
    print(curtask)
Example No. 6
    # excerpt: executed inside a loop over custom header (key, value) pairs
    webdriver.DesiredCapabilities.PHANTOMJS[
        'phantomjs.page.customHeaders.{}'.format(key)] = value

# another way to set a custom header
webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = \
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

constants = {
    'MAX_PAGE_TRIED': 2,
    'HB_PERIOD': 5,
    'MAX_SIZE_THREADPOOL': 3,
    'CRAWL_DELAY': 2
}

# db manager
dbmanager = MongoRedisUrlManager()

start_time = time.time()
threads = {}
webdrivers = {}

socket_client = SocketClient('localhost', 20100)

register_request = {}
register_request[pc.MSG_TYPE] = pc.REGISTER
client_id = socket_client.send(json.dumps(register_request))

run_heartbeat = True
server_status = pc.STATUS_RUNNING

re_compiled_obj = re.compile(r'\d{7}')  # raw string avoids the invalid '\d' escape warning
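
# The excerpt above only configures capabilities; a minimal usage sketch,
# assuming a Selenium release older than 4.0 (which still ships the PhantomJS
# driver) and phantomjs on PATH -- the URL is illustrative:
#
# driver = webdriver.PhantomJS()  # picks up DesiredCapabilities.PHANTOMJS set above
# driver.get('http://www.mafengwo.cn')
# match = re_compiled_obj.search(driver.current_url)  # look for a 7-digit id
# driver.quit()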
Example No. 7
class CrawlMaster(object):
    # client registry: {'client_id': {'time': ..., 'status': ...}}
    clients = {}

    server_status = pc.STATUS_RUNNING

    last_reorder_time = time.time()

    dbmanager = MongoRedisUrlManager()

    def __init__(self, mongo_client=None, mongo_host='127.0.0.1'):
        self.server = ServerSocket(self.on_message)
        self.server.start()

    def on_message(self, msg):
        # msg is the heartbeat/control message sent by a client
        request = json.loads(msg)
        msg_type = request[pc.MSG_TYPE]  # msg_type avoids shadowing the builtin 'type'
        client_state = {}
        response = {}
        response[pc.SERVER_STATUS] = self.server_status
        if msg_type == pc.REGISTER:
            client_id = self.get_free_id()
            client_state['status'] = pc.STATUS_RUNNING
            client_state['time'] = time.time()
            self.clients[client_id] = client_state
            return client_id
        elif msg_type == pc.UNREGISTER:
            client_id = request.get(pc.CLIENT_ID)
            del self.clients[client_id]
            return json.dumps(response)
        elif msg_type == pc.LOCATIONS:
            crawl_urls = self.dbmanager.dequeueUrls(size=pc.REQUEST_SIZE)
            print(crawl_urls)
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.CRAWL_DELAY] = pc.CRAWL_DELAY_TIME
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)
        elif msg_type == pc.TRIPLES:
            crawl_urls = self.dbmanager.dequeueUrls(request[pc.REQUEST_SIZE])
            response[pc.MSG_TYPE] = pc.LOCATIONS
            response[pc.DATA] = crawl_urls
            self.flash_hbtime(request)
            return json.dumps(response)
        elif msg_type == pc.FINISHED_ITEMS:
            # new urls found by the client are saved to the db by the master
            save_urls = request.get(pc.FINISHED_ITEMS)
            self.dbmanager.enqueueUrls(save_urls)
            self.flash_hbtime(request)
            return json.dumps(response)

        client_id = request.get(pc.CLIENT_ID)
        if client_id is None:
            response[pc.ERROR] = pc.ERR_NOT_FOUND
            return json.dumps(response)
        if msg_type == pc.HEARTBEAT:
            if self.server_status != self.clients[client_id]['status']:
                if self.server_status == pc.STATUS_RUNNING:
                    response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
                elif self.server_status == pc.STATUS_PAUSED:
                    response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
                elif self.server_status == pc.STATUS_SHUTDOWN:
                    response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
                return json.dumps(response)
            else:
                # a normal heartbeat
                self.flash_hbtime(request)
                return json.dumps(response)
        else:
            if msg_type == pc.PAUSED:
                client_state['status'] = pc.STATUS_PAUSED
            elif msg_type == pc.RESUMED:
                client_state['status'] = pc.STATUS_RUNNING
            client_state['time'] = time.time()  # refresh heartbeat time
            self.clients[client_id] = client_state

            return json.dumps(response)

    def periodical_check(self):
        # check heartbeats and drop clients whose connection was lost
        while True:
            lost_cid = []
            for cid, state in self.clients.items():
                if time.time() - state['time'] > constants['connection_lost_period']:
                    # can't delete here: the dict must not change size during iteration
                    self.clients[cid]['status'] = pc.STATUS_CONNECTION_LOST
                    lost_cid.append(cid)

            for cid in lost_cid:
                if self.clients[cid]['status'] != self.server_status:
                    # remove it from the client registry
                    del self.clients[cid]

            time.sleep(PERIODICAL_CHECK_TIME)

    def get_free_id(self):
        # return the smallest integer id not already assigned to a client
        # (the original scan relied on dict ordering and could hand out a
        # taken id)
        i = 0
        while str(i) in self.clients:
            i += 1
        return str(i)

    def flash_hbtime(self, request):
        # refresh the client's last-heartbeat timestamp
        client_id = request.get(pc.CLIENT_ID)
        self.clients[client_id]['time'] = time.time()
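
# Minimal client-side sketch of the register/heartbeat round trip this master
# expects; field names come from on_message above, while SocketClient and the
# port are assumptions borrowed from the client example:
#
# client = SocketClient('localhost', 20100)
# client_id = client.send(json.dumps({pc.MSG_TYPE: pc.REGISTER}))
# hb = {pc.MSG_TYPE: pc.HEARTBEAT, pc.CLIENT_ID: client_id}
# response = json.loads(client.send(json.dumps(hb)))
# if response.get(pc.ACTION_REQUIRED) == pc.PAUSE_REQUIRED:
#     pass  # pause local crawling until RESUME_REQUIRED arrives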
Example No. 8
        pass
    except Exception as err:
        print("get_page_content()", err)
        pass
    try:
        # match job-detail links like //www.lagou.com/jobs/1234567.html
        items = re.findall(r'//www\.lagou\.com/jobs/\d+\.html',
                           r.data.decode('utf-8'))
        links = []
        for i in items:
            fullurl = 'https:' + i
            #print(fullurl)
            db.enqueueUrl(fullurl, 'new')
            links.append(fullurl)
        print(links)
    except Exception as err:
        print("link extraction failed:", err)


def crawl():
    while True:
        #print(cur_queue)
        #url = dequeuUrl()
        task = db.dequeueUrl()
        if task is None:
            break  # assumes dequeueUrl() returns None once the queue is empty
        get_page_content(task['url'])


if __name__ == "__main__":
    db = MongoRedisUrlManager()
    db.clear()
    db.enqueueUrl('https://www.lagou.com', 'new')
    crawl()
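
# The top of get_page_content() is truncated above. A minimal reconstruction of
# the missing head, assuming urllib3 (the visible body reads r.data and decodes
# it); this is a guess at the original, not the author's confirmed code:
#
# import urllib3
#
# http = urllib3.PoolManager()
#
# def get_page_content(url):
#     try:
#         r = http.request('GET', url)  # r.data holds the raw response bytes
#     except urllib3.exceptions.HTTPError as err:
#         print("request failed:", err)
#         pass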