def login():
    retry_times = 0
    while True:
        try:
            idx = random.randint(0, len(login_users) - 1)
            login_user = login_users[idx]
            logger.info(login_user)
            flag = -1
            while flag != 0:
                s = my_request.get_https_session(new=True, agent=True)
                (flag, r) = my_request.get(logger, "https://www.itjuzi.com/user/login")
                logger.info(r.status_code)
                if flag == 0 and r.status_code != 200:
                    flag = -1
            logger.info(r.headers["Set-Cookie"])
            r = s.post("https://www.itjuzi.com/user/login",
                       data={
                           "identity": login_user["name"],
                           "password": login_user["pwd"]
                       },
                       timeout=10)
            logger.info(r.headers["Refresh"])
            if "0;url=https://www.itjuzi.com/" == r.headers["Refresh"]:
                return True
        except Exception as ex:
            logger.exception(ex)
            time.sleep(10)
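# Note: the spiders in this file all assume my_request.get(logger, url) returns a
# (flag, response) pair: flag == 0 means the HTTP request itself went through and
# response is a requests.Response (the caller still checks response.status_code);
# flag == -1 means the request failed. A minimal sketch of that assumed contract,
# for reference only (example_get is hypothetical, not the project's my_request module):
import requests

def example_get(logger, url, timeout=10):
    try:
        r = requests.get(url, timeout=timeout)
        return (0, r)
    except Exception as ex:
        logger.exception(ex)
        return (-1, None)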
def fetch_alexa(domain):
    alexa = trends_tool.get_alexa(domain)
    url = 'http://www.alexa.cn/index.php?url=' + domain
    proxy = {'type': 'http', 'anonymity': 'high', 'country': 'cn', 'ping': 5}
    while True:
        s = my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            break
    d = pq(r.text)
    data = d('script').text()
    data = ''.join(data)
    (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
    id_arr = ids.split(',')
    domain = id_arr[0]
    timeout = 10
    while True:
        try:
            r = s.post("http://www.alexa.cn/api_150710.php",
                       data={"url": id_arr[0], "sig": id_arr[1], "keyt": id_arr[2]},
                       timeout=timeout)
            break
        except Exception as ex:
            logger.exception(ex)
            # retry with a longer timeout
            timeout = 20
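# util.re_get_result(pattern, text) is used throughout to pull regex capture groups out of
# URLs and page text; judging from the call sites it returns the groups of the first match
# as a tuple. A rough equivalent (hypothetical sketch, not the project's real util module):
import re

def example_re_get_result(pattern, text):
    m = re.search(pattern, text)
    if m is None:
        return None
    return m.groups()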
def query_by_domain(source_company_id, website):
    if website is None or website == "":
        return True
    s = urlsplit(website)
    if s.query != '' or s.fragment != '':
        return True
    if s.path != '' and s.path != '/':
        return True
    s = tldextract.extract(website)
    if s.subdomain != "www" and s.subdomain != "m" and s.subdomain != "":
        return True
    try:
        domain = get_tld(website)
    except:
        return True
    result = conn.get("select count(*) cnt from source_domain where sourceCompanyId=%s and domain=%s",
                      source_company_id, domain)
    if result["cnt"] > 0:
        return True
    url = "http://beian.links.cn/beian.asp?beiantype=domain&keywords=%s" % domain
    (flag, r) = my_request.get(logger, url)
    #logger.info(r.text)
    if flag != 0 or r.status_code != 200:
        return False
    parse_query(source_company_id, r.text)
    return True
def query_by_company_name(source_company_id, name):
    if name is None or name == "":
        return True
    # not a normal Chinese company name
    if name.find(".") != -1:
        return True
    result = conn.get("select count(*) cnt from source_domain where sourceCompanyId=%s",
                      source_company_id)
    if result["cnt"] > 0:
        return True
    name = name.replace("_", "")
    idx = name.rfind(u"公司")
    if idx != -1:
        name = name[:(idx + len(u"公司"))]
    #url = "http://beian.links.cn/beian.asp?beiantype=zbdwmc&keywords=%s" % name
    url = "http://beian.links.cn/zbdwmc_%s.html" % name
    (flag, r) = my_request.get(logger, url)
    #logger.info(r.text)
    if flag != 0 or r.status_code != 200:
        return False
    parse_query(source_company_id, r.text)
    return True
def fetch_finance(company_key):
    url = 'https://rong.36kr.com/api/company/' + str(company_key) + '/finance'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        finance = json.loads(r.text)
        return finance
def get_job(company_key, page_no):
    job_contents = []
    items = ['1']
    while len(items) > 0 and page_no < 10:
        job_url = "http://www.jobtong.com/api/enterprises/%s/jobs?page=%s" % (company_key, page_no)
        logger.info(job_url)
        (flag, r) = my_request.get(logger, job_url)
        if flag == 0:
            job_data = json.loads(r.text)
            job_result = job_data['items']
            if len(job_result) > 0:
                for job in job_result:
                    job_content = {
                        "date": datetime.datetime.now(),
                        "source": source,
                        "company_key": company_key,
                        "job_key": job['id'],
                        "content": job
                    }
                    job_contents.append(job_content)
            items = job_result
        page_no += 1
    return job_contents
def fetch_status(company_key):
    url = 'https://rong.36kr.com/api/company/' + str(company_key)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        status = json.loads(r.text)
        return status
def fetch_qichacha(company_key):
    url = 'https://rong.36kr.com/api/company/' + str(company_key) + '/qichacha'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        qichacha = json.loads(r.text)
        return qichacha
def fetch_rong_header():
    url = 'https://rong.36kr.com/api/p/sm/seo/fragment/header-footer'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        header = json.loads(r.text)
        return header
def fetch_crowdfunding(cf_key):
    url = 'https://rong.36kr.com/api/p/crowd-funding/' + str(cf_key)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        crowdfunding = json.loads(r.text)
        return crowdfunding
def find_wechat(name, full_name):
    url = 'http://weixin.sogou.com/weixin?type=1&query=' + name
    cnt = 1
    result = []
    while cnt < 100:
        result = []
        find = False
        proxy = {
            'type': 'http',
            'anonymity': 'high',
            'country': 'cn',
            'ping': 5
        }
        my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            # "您的访问过于频繁..." is Sogou's anti-crawler verification page
            if '您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证' in r.text:
                find = False
            else:
                d = pq(r.text)
                for rt in d('div.wx-rb'):
                    rt = pq(rt)
                    name = rt('.txt-box > h3').text()
                    wechat_id = rt('.txt-box > h4').text()
                    brief = ''
                    verify = ''
                    if len(rt('.s-p3')) == 3:
                        brief = rt('.s-p3:eq(0) > .sp-txt').text()
                        verify = rt('.s-p3:eq(1) > .sp-txt').text()
                    name_str = ''
                    for n in name:
                        if n is None or n == ' ':
                            pass
                        else:
                            name_str += n
                    wechat_id = wechat_id[4:]
                    wechat = {
                        'name': name_str,
                        'id': wechat_id,
                        'brief': brief,
                        'verify_company_name': verify
                    }
                    result.append(wechat)
                if len(result) > 0:
                    find = True
        if not find:
            cnt += 1
        else:
            break
    wechat = []
    for r in result:
        if r['verify_company_name'] == full_name:
            wechat.append(r)
    return wechat
def fetch_leader(cf_key):
    url = 'http://dj.jd.com/funding/leaderInverstorDetail/' + cf_key + '.html'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        # "东家温馨提示:您查询的内容不存在!" means the requested project does not exist
        if u'东家温馨提示:您查询的内容不存在!' in r.text:
            return None
        return r.text
def fetch_founder(company_key):
    url = 'https://rong.36kr.com/api/company/' + str(company_key) + '/founder?pageSize=1000'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        founder = json.loads(r.text)
        return founder
def fetch_rong_overview(company_key):
    url = 'https://rong.36kr.com/api/p/sm/seo/summary/rong-company-overview/' + str(company_key)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        overview = json.loads(r.text)
        return overview
def query_by_beianhao(source_company_id, beianhao):
    if beianhao is None or beianhao == "":
        return True
    url = "http://beian.links.cn/beianhao_%s.html" % beianhao
    (flag, r) = my_request.get(logger, url)
    #logger.info(r.text)
    if flag != 0 or r.status_code != 200:
        return False
    parse_query(source_company_id, r.text)
    return True
def fetch_news(url):
    news_key = url.split('=')[1]
    logger.info("news_key=%s" % news_key)
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        return r.status_code
    # print url
    # print r.url
    if r.url != url:
        logger.info("Page Redirect <--")
        return 302
    news_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "news_key": news_key,
        "content": r.text
    }
    # save
    if news_collection.find_one({"source": source, "news_key": news_key}) is None:
        news_collection.insert_one(news_content)
    msg = {"type": "direct_news", "source": source, "news_key": news_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("pencil_news", json.dumps(msg))
    return 200
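# fetch_news assumes a kafka_producer exposing send_messages(topic, message), which matches
# the legacy kafka-python SimpleProducer API, and a MongoDB collection handle. A sketch of
# how such objects could be wired up (hypothetical; the project builds them in spider_util,
# and the broker address, Mongo address, and collection names below are assumptions):
from kafka import KafkaClient, SimpleProducer
from pymongo import MongoClient

def example_init():
    kafka_client = KafkaClient("localhost:9092")   # broker address is an assumption
    producer = SimpleProducer(kafka_client)
    mongo = MongoClient("localhost", 27017)        # Mongo address is an assumption
    news_collection = mongo.crawler.news           # db/collection names are assumptions
    return producer, news_collection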
def fetch_dj():
    url = 'http://dj.jd.com/'
    threads = []
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        d = pq(r.text)
        links = d('a')
        for link in links:
            link = pq(link)
            link = link.attr('href')
            if link is not None and '/funding/details/' in link:
                print link
                fetch_project(link)
        divs = d('.show-text')
        for div in divs:
            div = pq(div)
            link = div('a').attr('href')
            desc = div('p').attr('title')
            fetch_desc(link, desc)
def login():
    while True:
        idx = random.randint(0, len(login_users) - 1)
        login_user = login_users[idx]
        logger.info(login_user)
        s = my_request.get_http_session(new=True, agent=False)
        data = {
            "type": "login",
            "bind": False,
            "needCaptcha": False,
            "username": login_user["name"],
            "password": login_user["pwd"],
            "ok_url": "/"
        }
        headers = {
            "Referer": "http://passport.36kr.com"
        }
        try:
            r = s.post("http://passport.36kr.com/passport/sign_in",
                       data=data, headers=headers, timeout=10)
            logger.info(r.text)
        except:
            continue
        if r.status_code != 200:
            continue
        if r.text.strip() != '{"redirect_to":"/"}':
            continue
        (flag, r) = my_request.get(logger, "http://uc.36kr.com/api/user/identity")
        if flag == 0 and r is not None and r.status_code == 200:
            result = r.json()
            logger.info(result)
            if result["code"] == 4031:
                break
def fetch_project(url):
    (cf_key, ) = util.re_get_result("http://dj.jd.com/funding/details/(\d+).html", url)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        html = r.text
        support = fetch_support(cf_key)
        focus = fetch_focus(url, cf_key)
        team = fetch_team(cf_key)
        leader = fetch_leader(cf_key)
        bp = fetch_bp(html, url, cf_key)
        content = {
            'html': html,
            'team': team,
            'support': support,
            'focus': focus
        }
        project = {
            "date": datetime.datetime.now(),
            "source": source,
            "url": url,
            "company_key": cf_key,
            "cf_key": cf_key,
            "content": content,
            'leader': leader,
            'bp': bp
        }
        result = cf_collection.find_one({"source": source, "company_key": cf_key, 'cf_key': cf_key})
        if result != None:
            cf_collection.replace_one({'_id': result['_id']}, project)
        else:
            cf_collection.insert_one(project)
        msg = {"type": "cf", "source": source, "cf_key": cf_key}
        logger.info(json.dumps(msg))
        kafka_producer.send_messages("crawler_cf_jd_v2", json.dumps(msg))
def fetch(url):
    (key, ) = util.re_get_result("https://itjuzi.com/album/(\d+)", url)
    logger.info("key=%s" % key)
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        return r.status_code
    if r.url != url:
        logger.info("Page Redirect <--")
        return 302
    content = {
        "date": datetime.datetime.now(),
        "url": url,
        "key": key,
        "content": r.text
    }
    # save
    if collection.find_one({"key": key}) != None:
        collection.delete_one({"key": key})
    collection.insert_one(content)
    # msg = {"type":"itjuzi_album", "key":key}
    # logger.info(json.dumps(msg))
    # kafka_producer.send_messages("itjuzi_album", json.dumps(msg))
    return 200
def fetch_support(cf_key):
    url = ('http://dj.jd.com/funding/selectSupportCount.action?projectId=' + cf_key +
           '&minimumAmount=100,000&silkmumAmount=10,000')
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        return r.text
def fetch_team(cf_key):
    url = "http://dj.jd.com/funding/findProjectTeam.action?projectId=" + cf_key
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        return r.text
def fetch_company(url):
    (company_key, ) = util.re_get_result("http://www.jobtong.com/e/(\d+)", url)
    if company_collection.find_one({"source": source, "company_key": company_key}) != None:
        return 200
    logger.info("company_key=%s" % company_key)
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        return r.status_code
    if r.url != url:
        logger.info("Page Redirect <--")
        return 302
    company_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "company_key_int": int(company_key),
        "content": r.text
    }
    doc = lxml.html.fromstring(r.text)
    #company invalid
    #job
    job_contents = get_job(company_key, 1)
    if len(job_contents) > 0:
        #save
        if company_collection.find_one({"source": source, "company_key": company_key}) != None:
            company_collection.delete_one({"source": source, "company_key": company_key})
        company_collection.insert_one(company_content)
        for job in job_contents:
            if job_collection.find_one({
                    "source": source,
                    "company_key": company_key,
                    "news_key": job["job_key"]
            }) == None:
                job_collection.insert_one(job)
        msg = {"type": "company", "source": source, "company_key": company_key}
        logger.info(json.dumps(msg))
        kafka_producer.send_messages("crawler_recruit_jobtong", json.dumps(msg))
        return 200
    else:
        return 302
def fetch_cf(data):
    # sleep_time = random.randint(10, 30)
    # time.sleep(sleep_time)
    cf_key = data['id']
    company_key = data['company_id']
    logger.info("cf_key=%s" % cf_key)
    url = 'https://rong.36kr.com/company/' + str(company_key) + '/crowFunding?fundingId=' + str(cf_key)
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    if r.status_code == 404:
        return r.status_code
    if r.status_code != 200:
        return r.status_code
    html = r.text
    finance = fetch_finance(company_key)
    crowdfunding = fetch_crowdfunding(cf_key)
    overview = fetch_rong_overview(company_key)
    # header = fetch_rong_header()
    qichacha = fetch_qichacha(company_key)
    founder = fetch_founder(company_key)
    status = fetch_status(company_key)
    content = {
        'html': html,
        'finance': finance,
        'crowdfunding': crowdfunding,
        'overview': overview,
        # 'header': header,
        'qichacha': qichacha,
        'founder': founder,
        'status': status
    }
    cf_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "cf_key": cf_key,
        "content": content
    }
    result = cf_collection.find_one({
        "source": source,
        "company_key": company_key,
        'cf_key': cf_key
    })
    if result != None:
        cf_collection.replace_one({'_id': result['_id']}, cf_content)
    else:
        cf_collection.insert_one(cf_content)
    msg = {"type": "cf", "source": source, "cf_key": cf_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_cf_36kr_v2", json.dumps(msg))
r"http://www.itjuzi.com/overview/news/(\d*)$", news_url) (news_year, news_month, news_day) = util.re_get_result( r'^(\d*)[^\d]*(\d*)[^\d]*(\d*)[^\d]*', news_date_str) logger.info(news_title) logger.info(news_url) logger.info(news_source_domain) logger.info(news_date_str) logger.info("%s-%s-%s" % (news_year, news_month, news_day)) logger.info(news_key) if news_collection.find_one({ "source": source, "company_key": company_key, "news_key": news_key }) == None: (flag, r) = my_request.get(logger, news_url) if flag == -1: continue if r.status_code != 200: continue #logger.info(r.text) f = pq(r.text) url = f('iframe').attr("src").strip() (flag, r) = my_request.get_no_sesion(logger, url) if flag == -1: continue if r.status_code != 200: continue #logger.info(r.text) news_contents.append({
def fetch_company(url):
    (company_key, ) = util.re_get_result("http://rong.36kr.com/api/company/(\d+)", url)
    logger.info("company_key=%s" % company_key)
    company_content = None
    member_contents = []
    news_contents = []
    investor_contents = []
    member_ids = []

    #company base info
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        logger.info("status_code=%d" % r.status_code)
        return r.status_code
    company_base = r.json()
    logger.info(company_base)
    if company_base["code"] != 0:
        return 404
    logger.info(company_base["data"]["company"]["name"])

    #past-finance (investment events)
    url = "http://rong.36kr.com/api/company/%s/past-finance" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_finance = r.json()

    #past-investor
    url = "http://rong.36kr.com/api/company/%s/past-investor?pageSize=100" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_investor = r.json()

    #funds (not viewable without investor permission)
    url = "http://rong.36kr.com/api/company/%s/funds" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    funds = r.json()

    #product
    url = "http://rong.36kr.com/api/company/%s/product" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    product = r.json()

    #past-investment
    url = "http://rong.36kr.com/api/company/%s/past-investment" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_investment = r.json()

    #company-fa?
    url = "http://rong.36kr.com/api/fa/company-fa?cid=%s" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    company_fa = r.json()

    #founders
    url = "http://rong.36kr.com/api/company/%s/founder?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    founders = r.json()

    #employee
    url = "http://rong.36kr.com/api/company/%s/employee?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    employees = r.json()

    #former-member
    url = "http://rong.36kr.com/api/company/%s/former-member?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    former_members = r.json()

    company_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "company_key_int": int(company_key),
        "company_base": company_base,
        "past_finance": past_finance,
        "past_investor": past_investor,
        "funds": funds,
        "product": product,
        "past_investment": past_investment,
        "company_fa": company_fa,
        "founders": founders,
        "employees": employees,
        "former_members": former_members
    }

    #member
    for m in founders["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for m in employees["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for m in former_members["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for v in past_investor["data"]["data"]:
        if v["entityType"] == "INDIVIDUAL":
            m_id = v["entityId"]
            member_ids.append(m_id)

    for m_id in member_ids:
        member_key = str(m_id)
        if member_collection.find_one({"source": source, "member_key": member_key}):
            continue
        #basic
        url = "http://rong.36kr.com/api/user/%s/basic" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_base = r.json()
        #past-investment
        url = "http://rong.36kr.com/api/user/%s/past-investment" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_past_investment = r.json()
        #company
        url = "http://rong.36kr.com/api/user/%s/company" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_company = r.json()
        #work
        url = "http://rong.36kr.com/api/user/%s/work" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_work = r.json()
        #financing
        url = "http://rong.36kr.com/api/p/lead-investor/%s/financing" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_financing = r.json()
        member_content = {
            "date": datetime.datetime.now(),
            "source": source,
            "url": url,
            "member_key": member_key,
            "member_base": member_base,
            "member_past_investment": member_past_investment,
            "member_company": member_company,
            "member_work": member_work,
            "member_financing": member_financing
        }
        member_contents.append(member_content)

    #investor organization
    for e in past_finance["data"]["data"]:
        for investor in e.get("participants", {}):
            investor_key = str(investor["entityId"])
            if investor_collection.find_one({"source": source, "investor_key": investor_key}):
                continue
            #base info
            url = "http://rong.36kr.com/api/organization/%s/basic" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            investor_base = r.json()
            #staffs
            url = "http://rong.36kr.com/api/organization/%s/user" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            staffs = r.json()
            #former-member
            url = "http://rong.36kr.com/api/organization/%s/former-member" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            former_members = r.json()
            investor_content = {
                "date": datetime.datetime.now(),
                "source": source,
                "url": url,
                "investor_key": investor_key,
                "investor_base": investor_base,
                "staffs": staffs,
                "former_members": former_members
            }
            investor_contents.append(investor_content)

    #logger.info(company_content)
    #logger.info("************")
    #logger.info(member_contents)
    #logger.info("************")
    #logger.info(investor_contents)

    #save
    if company_collection.find_one({"source": source, "company_key": company_key}) != None:
        company_collection.delete_one({"source": source, "company_key": company_key})
    company_collection.insert_one(company_content)
    for member in member_contents:
        if member_collection.find_one({"source": source, "member_key": member["member_key"]}) == None:
            member_collection.insert_one(member)
    for news in news_contents:
        if news_collection.find_one({
                "source": source,
                "company_key": company_key,
                "news_key": news["news_key"]
        }) == None:
            news_collection.insert_one(news)
    for investor in investor_contents:
        if investor_collection.find_one({"source": source, "investor_key": investor["investor_key"]}) == None:
            investor_collection.insert_one(investor)
    msg = {"type": "company", "source": source, "company_key": company_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_kr36_v2", json.dumps(msg))
    return 200
    return False


if __name__ == "__main__":
    (logger, mongo, kafka_producer, company_collection, member_collection, news_collection, cf_collection) \
        = spider_util.spider_cf_init('jd')
    login()
    flag = True
    while flag:
        i = 1
        url = 'https://rong.36kr.com/api/p/crowd-funding?page=' + str(i) + '&per_page=100&status=all'
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            data = json.loads(r.text)
            data = data['data']
            last_page = data['last_page']
            if last_page > 1:
                for i in xrange(0, last_page):
                    url = 'https://rong.36kr.com/api/p/crowd-funding?page=' + str(i) + '&per_page=100&status=all'
                    (flag, r) = my_request.get(logger, url)
                    if flag == 0:
                        data = json.loads(r.text)['data']['data']
                        for d in data:
                            fetch_cf(d)
                flag = False
            else:
def _get(url, cleanup):
    data = my_request.get(url, True)
    if data == None:
        return None
    return cleanup(data)
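# _get simply passes whatever my_request.get returns through a caller-supplied cleanup
# callback. A hypothetical usage sketch (example_parse_json and the URL are illustrative,
# not part of the project):
import json

def example_parse_json(data):
    return json.loads(data)

# parsed = _get("http://example.com/api", example_parse_json)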
def find_weibo(name):
    url = 'http://s.weibo.com/user/&work=' + name
    cnt = 1
    result = []
    while cnt < 100:
        result = []
        find = False
        proxy = {
            'type': 'http',
            'anonymity': 'high',
            'country': 'cn',
            'ping': 5
        }
        my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            d = pq(r.text)
            find = True
            for s in d('script'):
                s = pq(s).text()
                s = ''.join(s)
                if 'STK && STK.pageletM && STK.pageletM.view' in s:
                    s = s.replace('STK && STK.pageletM && STK.pageletM.view(', '')
                    s = s[0:len(s) - 1]
                    data = json.loads(s)
                    html = data['html']
                    # "你的行为有些异常,请输入验证码" is Weibo's CAPTCHA page
                    if '你的行为有些异常,请输入验证码' in html:
                        find = False
                    result.append(html)
        if not find or len(result) == 0:
            cnt += 1
        else:
            break
    data = result[2]
    d = pq(data)
    result = []
    for li in d('div.list_person'):
        li = pq(li)
        verify = li('p.person_name > a:eq(1)').attr('title')
        # "微博机构认证" means the account carries Weibo's organization verification
        if verify == '微博机构认证':
            name = li('p.person_name > a:eq(0)').text()
            uid = li('p.person_name > a:eq(0)').attr('uid')
            location = li('p.person_addr > span:eq(1)').text()
            link = li('p.person_addr > a').text()
            follow = li('p.person_num > span:eq(0) > a').text()
            fans = li('p.person_num > span:eq(1) > a').text()
            publish = li('p.person_num > span:eq(2) > a').text()
            desc = li('div.person_info > p').text().replace('简介:', '').strip()
            tags = []
            for tag in li('p.person_label:eq(0) > a'):
                tag = pq(tag).text()
                tags.append(tag)
            tags = ','.join(tags)
            verify_company_name = li('p.person_label:eq(1) > a').text()
            account = {
                'name': name,
                'uid': uid,
                'location': location,
                'link': link,
                'follow': follow,
                'fans': fans,
                'publish': publish,
                'desc': desc,
                'tags': tags,
                'verify_company_name': verify_company_name
            }
            result.append(account)
    weibo = []
    for r in result:
        if r['verify_company_name'] == name or r['name'] == name:
            weibo.append(r)
    return weibo
def fetch_job(url):
    urlarr = url.split("=")
    job_key = urlarr[len(urlarr) - 1]
    logger.info("job_key=%s" % job_key)
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        #logger.info(r.status_code)
        return r.status_code
    if r.url != url:
        logger.info("Page Redirect <--")
        return 302
    doc = lxml.html.fromstring(r.text)
    company_url = doc.xpath('//div[@class="c_name"]/a/@href')
    if len(company_url) == 0:
        return 200
    company_url = company_url[0]
    (company_key, ) = util.re_get_result("/company/detail/domain=(\S+).html", company_url)
    logger.info(company_key)
    job_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "job_key": int(job_key),
        "content": r.text
    }
    result = job_collection.find_one({
        "source": source,
        "company_key": company_key,
        "job_key": job_key
    })
    if result == None:
        job_collection.insert_one(job_content)
    # else:
    #     job_collection.replace_one({"_id":result["_id"]}, job_content)
    if company_collection.find_one({"source": source, "company_key": company_key}) == None:
        company_url = 'http://www.neitui.me/' + company_url
        (flag, r) = my_request.get(logger, company_url)
        if flag == -1:
            return -1
        company_content = {
            "date": datetime.datetime.now(),
            "source": source,
            "url": company_url,
            "company_key": company_key,
            "content": r.text
        }
        company_collection.insert_one(company_content)
    msg = {"type": "job", "source": source, "job_key": job_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_recruit_neitui", json.dumps(msg))
    return 200
def fetch_company(url):
    (company_key, ) = util.re_get_result("http://www.itjuzi.com/company/(\d+)", url)
    logger.info("company_key=%s" % company_key)
    company_content = None
    member_contents = []
    news_contents = []
    investor_contents = []
    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1
    logger.info("status=%d", r.status_code)
    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code
    if r.status_code != 200:
        #logger.info(r.status_code)
        return r.status_code
    #logger.info(r.text)
    d = pq(r.text)
    product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    if product_name == "":
        return 404
    company_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "company_key_int": int(company_key),
        "content": r.text
    }
    #members
    lis = d('h4.person-name> a.title')
    for li in lis:
        try:
            l = pq(li)
            href = l.attr("href").strip()
            logger.info(href)
            member_name = l('b> span.c').text().strip()
            (member_key, ) = util.re_get_result(r'http://www.itjuzi.com/person/(\d*?)$', href)
            logger.info("member_key=%s, member_name=%s" % (member_key, member_name))
            href = href.replace("http://", "https://")
            #if member_collection.find_one({"source":source, "member_key":member_key}) == None:
            if 1 == 1:
                flag = -1
                while flag != 0:
                    (flag, r) = my_request.get(logger, href)
                    if flag == 0 and r.status_code != 200:
                        flag = -1
                    if flag != 0:
                        my_request.get_https_session(new=True, agent=True)
                #logger.info(r.text)
                member_contents.append({
                    "date": datetime.datetime.now(),
                    "source": source,
                    "url": url,
                    "member_key": member_key,
                    "member_name": member_name,
                    "content": r.text
                })
        except Exception as ex:
            logger.exception(ex)