Example #1
def mtgox_websocket_connect(s, proxytype=None, proxy=None):
	# Python 2: perform the legacy (pre-RFC 6455 draft) WebSocket handshake
	# with the MtGox feed, optionally tunnelling through an HTTP or SOCKS4a
	# proxy. Proxies and recv_line come from the surrounding module.
	host = 'websocket.mtgox.com'
	port = 80

	if proxytype=='http':
		s.connect(proxy)
		Proxies.do_http_connect(s,host,port)
	elif proxytype=='socks4a':
		s.connect(proxy)
		Proxies.do_socks4a(s,host,port)
	else:
		s.connect((host,port))

	querylines = [
		'GET /mtgox HTTP/1.1',
		'Upgrade: WebSocket',
		'Connection: Upgrade',
		'Host: websocket.mtgox.com',
		'Origin: null',
	]
	s.sendall('\r\n'.join(querylines) + '\r\n\r\n')  # sendall: send() may write only part of the buffer

	lines = []
	while True:
		line = recv_line(s)
		if line=='': break
		lines.append(line)

	if lines[0] != 'HTTP/1.1 101 Web Socket Protocol Handshake':
		raise Exception('server did not say "101" (said %s)'%repr(lines[0]))

	return s
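
A minimal usage sketch (the SOCKS proxy address is hypothetical, and Proxies and recv_line must be importable from the snippet's module):

import socket

s = socket.socket()
mtgox_websocket_connect(s, proxytype='socks4a', proxy=('127.0.0.1', 9050))
# After the handshake, pre-RFC 6455 draft frames (a 0x00 byte, a UTF-8
# payload, then a 0xFF byte) are read from and written to the socket directly.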
Example #2
def do_https_query(url, postdata=None, reqheaders=None, digest=None, digesttype='sha256', proxy=None, proxytype=None, timeout=None):
	# Python 2: fetch an HTTPS URL over a raw pyOpenSSL connection, optionally
	# through a proxy, optionally pinning the server certificate to a known
	# digest. socket, signal, urlparse, OpenSSL, Proxies and do_http_query
	# come from the surrounding module.
	scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
	if scheme != 'https':
		raise Exception('not an HTTPS query')

	if len(netloc.split('@'))>2:
		raise Exception('username/password not supported')

	hp = netloc.split(':')
	if len(hp)>2 or len(hp)<1:
		raise Exception('netloc must be hostname:port')

	host = hp[0]
	if len(hp)>=2:
		port = int(hp[1])
	else:
		port = 443

	if timeout is not None:
		signal.alarm(timeout)  # the caller must have installed a SIGALRM handler

	s = socket.socket()

	if proxytype is not None:
		s.connect(proxy)
		Proxies.do_proxy_connect(s,proxytype,host,port)
	else:
		s.connect((host,port))

	ctx = OpenSSL.SSL.Context(OpenSSL.SSL.TLSv1_METHOD)
	c = OpenSSL.SSL.Connection(ctx,s)
	c.set_connect_state()
	c.do_handshake()

	if timeout is not None:
		signal.alarm(0)

	if digest is not None:
		# Certificate pinning: compare the presented certificate's digest
		# against the expected fingerprint instead of validating a CA chain.
		cert = c.get_peer_certificate()
		digestgot = cert.digest(digesttype)
		if digestgot != digest:
			raise Exception('server certificate mismatch (%s)' % digestgot)
	
	result = do_http_query(c, host, path + ('?' + query if query else ''), postdata, headers=reqheaders)

	c.close()
	return result
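
A usage sketch with certificate pinning (the URL and fingerprint are placeholders; the timeout only takes effect if the caller installed a SIGALRM handler):

result = do_https_query(
	'https://example.com/api',  # placeholder URL
	digest='AA:BB:...',         # expected sha256 fingerprint of the server cert (placeholder)
	digesttype='sha256',
	timeout=30)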
Example #3
    def crawl_job(self):
        # Crawl every station URL, rotating the proxy every num_prox requests,
        # and count how many stations returned a full record.
        count = 0
        now = datetime.now()
        current_time = now.strftime("%H:%M %d-%m-%Y")
        print('\n====== Starting Crawling at : ' + current_time +
              ' ==============================\n')

        count_prox = 1
        proxy = None  # remains None when proxy rotation is disabled

        if self.num_prox > 0:
            proxy = prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0]

        for url in self.stations_url:
            # Get name
            name = url.split('stations')[-1].replace('/', '')
            # Get Proxy { ip: xxxx , port: xxxxx}
            if self.num_prox > 0 and count_prox % self.num_prox == 0:
                print("Searching for new proxy")
                proxy = prox.Proxies(
                    number_of_proxies=1).getProxiesAllInOne()[0]

            count_prox += 1
            # create Instance for crawl
            cs = cS.Crawl_Station(url, proxy=proxy)
            # Get Data
            data = cs.getInfo()
            if len(data) >= 6:  # treat a record with at least six fields as a successful crawl
                count += 1
            #print(data)
            data['TimeCrawled'] = now.strftime("%H:%M")
            self.writeCSV(name, data)
            #time.sleep(5)

        now = datetime.now()
        current_time = now.strftime("%H:%M %d-%m-%Y")
        print('\n====== Crawling Ended at ' + current_time +
              ' ==============================\n')
        self.writeLogFile(current_time, count)
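
The rotate-every-N-requests pattern above, isolated into a self-contained sketch (fetch_proxy is a stand-in for prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0]):

def rotating_proxies(fetch_proxy, every_n):
    # Yield the current proxy indefinitely, fetching a fresh one
    # every every_n uses.
    proxy = fetch_proxy()
    count = 1
    while True:
        if count % every_n == 0:
            proxy = fetch_proxy()
        count += 1
        yield proxy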
Example #4
# Fragment: request headers for the Lagou mobile API. The imports (requests,
# json, Proxies) and the timeStamp / time1 values are defined earlier in the
# original script and are not shown in this fragment.
headers = {
    "Cookie":
    "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644"
    .format(timeStamp=timeStamp, time=time1),
    "Referer":
    "https://m.lagou.com/search.html",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
    "X-Requested-With":
    "XMLHttpRequest",
}

city = "广州"  # Guangzhou
positionName = "python"
# pageNo = "1"
pageSize = "15"
proxies = Proxies()


def get_detail_url(pageNo, proxies):
    # Query one page of the mobile search API and yield a job-detail URL for
    # every position in the result list.
    base_url = "https://m.lagou.com/search.json?city={city}&positionName={positionName}&pageNo={pageNo}&" \
               "pageSize={pageSize}".format(city=city, positionName=positionName, pageNo=pageNo, pageSize=pageSize)
    res = requests.get(base_url, headers=headers, proxies=proxies)
    content = res.content.decode()
    dict1 = json.loads(content)
    # print(dict1)
    list1 = dict1['content']['data']['page']['result']
    for i in list1:
        yield "https://m.lagou.com/jobs/{}.html".format(i['positionId'])
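
A usage sketch (requests expects proxies as a dict like {'https': 'http://host:port'}; the project's Proxies() object is assumed to be compatible):

for page in range(1, 4):  # first three result pages
    for detail_url in get_detail_url(page, proxies):
        print(detail_url)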


# Fields to extract: position title, salary, work location, years of experience, education requirement, company name, job description