def run_company(url, tp):
    # Written separately to work around itjuzi.com's crawl limits on company listing pages.
    url2 = url
    page = 10
    flag = True
    keyword_all = []
    l_list = range(2, 2427)
    l_list2 = copy.copy(l_list)
    for i in l_list2:
        store_path1 = os.path.join(common.root_path, 'juzi', str(i) + '.html')
        if os.path.isfile(store_path1):
            l_list.remove(i)
        else:
            print store_path1
    print len(l_list), 999999999
    print l_list
    aa = common.get_request(url, timeout=18)
    headers2 = {
        'Origin': 'http://www.itjuzi.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Accept': '*/*',
        'Referer': 'http://www.itjuzi.com/company?page=2410',
        'Cookie': 'grwng_uid=2e824b08-70ef-41f7-a9c8-62ff25d8f920; AWSELB=258D9D590E00B3DE939BD2301A2166BB8314D5BFDDA88D29F0E3F22E0935E83EF1C408A6B613204775BA26EA9BE8555ABB5A13289EDD9FCE01B44987A799A50A15E49578ED9A0D7D28BE3696012F59FED65EA97193',
        'Connection': 'keep-alive'
    }
    url_p = 'https://api.growingio.com/v2/eee5a46c52000d401f969f4535bdaa78/web/pv?stm=1459928218615'
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        # `headers` is assumed to be a module-level dict shared with run() below.
        headers['refer'] = url
        url = url2 + "?page={}".format(page)
        print url
        headers2['refer'] = url
        aa = common.get_request(url, timeout=18)
        # The original code also replayed a GrowingIO tracking beacon here with an
        # opaque binary payload (payload omitted):
        # bb = common.post_request(url_p, headers=headers2, data=<binary payload>)
        # print bb.content
        # Hit a random company page so the crawl looks less sequential.
        common.get_request('http://www.itjuzi.com/company/{}'.format(
            random.choice(range(1, 500))), timeout=10)
        store_path = os.path.join(common.root_path, 'juzi', str(page) + '.html')
        with open(store_path, 'w+') as f:
            f.write(aa.text.encode('utf8'))
        common.rand_sleep(30, 15)
        if len(l_list) == 0:
            flag = False
        else:
            page = random.choice(l_list)
            l_list.remove(page)
    return keyword_all
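# NOTE: every scraper in this section leans on a shared `common` module
# (get_request, post_request, rand_sleep, root_path, the sql_* helpers) whose
# source is not included here. The sketch below is only a hedged guess at what
# the request/sleep helpers might look like -- the names match the call sites
# above, but the retry/error behaviour is assumed, not taken from the real module.
import random
import time

import requests


def get_request(url, headers=None, timeout=10, **kwargs):
    # Thin wrapper around requests.get that swallows network errors and
    # returns None, which matches callers that test the result with `if r:`.
    try:
        return requests.get(url, headers=headers, timeout=timeout, **kwargs)
    except requests.RequestException:
        return None


def rand_sleep(base, jitter=0):
    # Sleep roughly `base` seconds, randomised by +/- `jitter`, so request
    # timing looks less mechanical to the target site.
    time.sleep(max(0, base + random.uniform(-jitter, jitter)))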
def get_page_one(url):
    r2 = common.get_request(url)
    cc = content(r2.text)
    # print cc
    if morepage(r2.text):
        common.rand_sleep(3, 2)
        r3 = common.get_request(url + '?p=1')  # in case there is a second page
        logger.info('{} has two page, try to get page one'.format(url))
        cc += content(r3.text)
    dd = common.re_email(cc)
    print dd
    ee = list(set(dd))
    ff = ','.join(ee)
    return ff
def get_company(company_id):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    # for i in xrange(1, 120000):
    print company_id
    # if not sql_lg('lagou', company_id):
    if True:
        url = 'http://www.lagou.com/gongsi/{}.html'.format(company_id)
        print url
        r = common.get_request(url, headers=header)
        # print r.url
        if r.status_code == 200:
            print url, '------------------' * 5
            # store_path = os.path.join(cwd, keyword, fname)
            # gs_fp = os.path.join(cwd, 'gongsi', 'lagou')
            # if not os.path.exists(gs_fp):
            #     os.makedirs(gs_fp)
            #
            # fname = str(company_id) + '.html'
            # job_id = str(company_id)
            # job_id = job_id.rjust(8, '0')
            # store_path = os.path.join(gs_fp, job_id[0:3], job_id[3:6], job_id + '.html')
            # father_dir = os.path.dirname(store_path)
            # if not os.path.exists(father_dir):
            #     os.makedirs(father_dir)
            # with open(store_path, 'w+') as f:
            #     f.write(r.text)
            company_dict = company_parse(r.text)
            sql_lg_main('lagou', job_dict=company_dict, url=url,
                        company_id=company_id)
def get_page(url):
    # s = requests.session()
    # r1 = s.get(url)
    r1 = common.get_request(url)
    r1.encoding = 'gb2312'
    soup = BeautifulSoup(r1.text, 'html.parser')
    job_num = soup.find('input', {'name': 'hidTotal'}).get('value')
    job_list = []
    job_1 = soup.find_all('p', {'class': 't1'})
    for i in job_1:
        job_list.append(i.a.get('href'))
    # Build the paging payload once, outside the loop, so the pageno
    # increment below actually advances through the result pages.
    payload = {'pageno': 2, 'hidTotal': job_num}
    while len(job_list) < int(job_num):
        r2 = common.post_request(url, data=payload)
        r2.encoding = 'gb2312'
        soup2 = BeautifulSoup(r2.text, 'html.parser')
        job_2 = soup2.find_all('p', {'class': 't1'})
        for i2 in job_2:
            job_list.append(i2.a.get('href'))
        payload['pageno'] += 1
    print job_list
    print len(job_list), job_num
    return job_list
def get_url_all(url):
    r = common.get_request(url, headers)
    url_first = libzlcompany.get_url_list(r.text)
    url_all = url_first
    flag = 1
    while flag:
        url_next = libzlcompany.find_next(r.text)
        # print url_next, 22222222222222
        if url_next:
            # fetch the next page, then collect its job links
            r = common.get_request(url_next)
            url_next_list = libzlcompany.get_url_list(r.text)
            url_all.extend(url_next_list)
        else:
            flag = 0
    # print url_all, len(url_all), 111111111111
    return url_all
def get_company(company_id):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    # for i in xrange(1, 120000):
    print company_id
    if not sql_lg('lagou', company_id):
        url = 'http://www.lagou.com/gongsi/{}.html'.format(company_id)
        print url
        r = common.get_request(url, headers=header)
        # print r.url
        if r.status_code == 200:
            print url, '------------------' * 5
            # store_path = os.path.join(cwd, keyword, fname)
            gs_fp = os.path.join(cwd, 'gongsi', 'lagou')
            if not os.path.exists(gs_fp):
                os.makedirs(gs_fp)
            # fname = str(company_id) + '.html'
            job_id = str(company_id)
            job_id = job_id.rjust(8, '0')
            store_path = os.path.join(gs_fp, job_id[0:3], job_id[3:6],
                                      job_id + '.html')
            father_dir = os.path.dirname(store_path)
            if not os.path.exists(father_dir):
                os.makedirs(father_dir)
            with open(store_path, 'w+') as f:
                # encode before writing: the page is Chinese, and writing the
                # unicode response directly would raise UnicodeEncodeError
                f.write(r.text.encode('utf8'))
            company_dict = company_parse(r.text)
            sql_lg_main('lagou', job_dict=company_dict, url=url,
                        company_id=company_id)
def kword(blog_id, blog_app, post_id):
    par = {'blogApp': blog_app, 'blogId': blog_id, 'postId': post_id}
    url = 'http://www.cnblogs.com/mvc/blog/CategoriesTags.aspx'
    keyword = ''
    try:
        ab = common.get_request(url, params=par)
        result = ab.json()
        tag = result['Tags']
        tag = tag[tag.find(':') + 2:]
        cate = result['Categories']
        cate = cate[cate.find(':') + 2:]
        so1 = ''
        so2 = ''
        try:
            so1 = BeautifulSoup(tag, 'html.parser').get_text()
        except:
            pass
        try:
            so2 = BeautifulSoup(cate, 'html.parser').get_text()
        except:
            pass
        keyword = so1 + ',' + so2
        # print keyword, 999999999999999999999
    except:
        pass
    return keyword
def get_destination_url_path(request=None):
    """Get the (effective, sans any "traversal namespace notation" components
    and other such "traversal processing instruction" url components) target
    URL path of the (current) request.
    """
    if request is None:
        request = common.get_request()
    # _url = request.URL
    # _url = request.getURL(level=0, path_only=True)
    # NOTE: both URL and getURL() depend on where we are in the traversal
    # process i.e. they return the *currently* traversed URL path and not
    # the full requested path.
    #
    # So, we use the request's PATH_INFO but as this may contain:
    # - (++) any number of Zope "traversal namespace notation" url components
    # - (@@/) to indicate that the URL is for an object that is a resource
    # - (@@) to indicate a view name
    # we need to get rid of them:
    _url = "/".join([
        url_component
        for url_component in request.get("PATH_INFO").split("/")
        if not url_component.startswith("++")
        and not url_component.startswith("@@")
    ])
    log.debug(" [get_destination_url_path] %s " % _url)
    return _url
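# For illustration only: a made-up PATH_INFO showing what the comprehension in
# get_destination_url_path() strips out (the path and names are hypothetical).
path_info = "/site/++skin++custom/folder/@@view/page"
cleaned = "/".join([
    c for c in path_info.split("/")
    if not c.startswith("++") and not c.startswith("@@")
])
# cleaned == "/site/folder/page"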
def get_company(company_id):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    # for i in xrange(1, 120000):
    url = 'http://www.lagou.com/gongsi/{}.html'.format(company_id)
    print url
    r = common.get_request(url, headers=lagouall.header)
    # print r.url
    if r.status_code == 200:
        print url, '------------------' * 5
        # store_path = os.path.join(cwd, keyword, fname)
        # gs_fp = os.path.join(cwd, 'gongsi', 'lagou')
        # if not os.path.exists(gs_fp):
        #     os.makedirs(gs_fp)
        #
        # fname = str(company_id) + '.html'
        # job_id = str(company_id)
        # job_id = job_id.rjust(8, '0')
        # store_path = os.path.join(gs_fp, job_id[0:3], job_id[3:6], job_id + '.html')
        # father_dir = os.path.dirname(store_path)
        # if not os.path.exists(father_dir):
        #     os.makedirs(father_dir)
        # with open(store_path, 'w+') as f:
        #     f.write(r.text)
        company_dict = lagouall.company_parse(r.text)
        return company_dict
def run_work(url):
    cwd_abs = os.path.abspath(__file__)
    cwd = os.path.dirname(cwd_abs)
    payload = company_payload(url)
    job_list = get_job_list(payload)
    for job_id in job_list:
        job_url = 'http://www.lagou.com/jobs/' + str(job_id) + '.html'
        print job_url
        if not common.sql_select('lagou', job_id):
            r = common.get_request(job_url)
            ## if r.status_code == 200:
            ##     r.encoding = 'utf-8'
            ##     job_dict = liblagoucompany.extract2(r.text)
            ##     common.sql_main('lagou', job_dict, job_url, job_id)
            ##     gs_fp = os.path.join(cwd, 'jobs', 'lagou')
            ##     if not os.path.exists(gs_fp):
            ##         os.makedirs(gs_fp)
            ##     job_id = str(job_id).rjust(9, '0')
            ##     store_path = os.path.join(gs_fp, job_id[0:3], job_id[3:6], job_id + '.html')
            ##     father_dir = os.path.dirname(store_path)
            ##     if not os.path.exists(father_dir):
            ##         os.makedirs(father_dir)
            ##     with open(store_path, 'w+') as f:
            ##         f.write(r.text)
            ##     common.rand_sleep(1)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                job_dict = liblagoucompany.extract2(r.text)
                common.sql_main('lagou', job_dict, job_url, job_id)
def main(source):
    url = url_cr(source)
    logging.debug('chuansong url is {}'.format(url))
    r = common.get_request(url)
    if r:
        html = r.text
        one_page(html, source)
        try:
            url2 = next_page(html)
            logging.debug('page 2 url is {}'.format(url2))
            while url2:
                r2 = common.get_request(url2)
                html2 = r2.text
                one_page(html2, source)
                url2 = next_page(html2)
        except Exception as e:
            logging.error('err get next page msg is {}'.format(e), exc_info=True)
def main(blog_name):
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            # article links on the current page
            b2 = soup_2.find_all(
                'a', {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False
            else:
                url_1 = ppp.group()
            common.rand_sleep(7, 1)
        except Exception as e:
            print Exception, e
            logging.error('run error', exc_info=True)
def main(job_list, option=0):
    """option=0: re-crawl every URL and update existing job records as well;
    option=1: only crawl jobs that are not yet in the database."""
    for url in job_list:
        job_id = re.search('[0-9]+.html', url).group()[:-5]
        if option == 0:
            r1 = common.get_request(url)
            r1.encoding = 'gb2312'
            job_dict = html_extract.extract_51(r1.text)
            # job_id = re.search('[0-9]+.html', url).group()[:-5]
            common.sql_main(source='job51', job_dict=job_dict, url=url,
                            job_id=job_id)
        if option == 1:
            if not common.sql_select('job51', job_id):
                r1 = common.get_request(url)
                r1.encoding = 'gb2312'
                job_dict = lib51company.extract2(r1.text)
                # job_id = re.search('[0-9]+.html', url).group()[:-5]
                common.sql_main(source='job51', job_dict=job_dict, url=url,
                                job_id=job_id)
def main():
    source = 'mux'
    page = 1
    flag = True
    url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
    while flag:
        try:
            print url
            res = common.get_request(url)
            logging.info('return url {} success'.format(res.url))
            print res.url
            soup = BeautifulSoup(res.text, 'html.parser')
            with open('temp.html', 'w+') as f:
                f.write(res.text.encode('utf8'))
            articles = soup.find_all('div', class_='artical_inner')
            for item in articles:
                contents = item.contents
                article_url = contents[9].a.get('href')
                article_title = str(contents[3].a.get('title')).strip()
                if not common.select(article_url, source):
                    pub_time = time.strftime(
                        '%Y-%m-%d',
                        time.strptime(
                            str(contents[5].get_text()).split('|')[-1].strip(),
                            '%Y年%m月%d日'))
                    keyword = str(
                        contents[5].get_text()).split('|')[-2].strip()
                    content = get_content(
                        common.get_request(article_url).text)
                    print article_title
                    common.sql_insert(source, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = r'http://mux.baidu.com/\?page_id=10\S+paged={}'.format(
                page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                flag = False
            else:
                url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
            common.rand_sleep(7, 1)
        except Exception as e:
            print Exception, e
            logging.error('run error', exc_info=True)
def one_page(html, source):
    ll = link_list(html)
    common.rand_sleep(3, 1)
    for i in ll:
        url, title = i.split(',')
        logging.debug('next url is {}'.format(url))
        if not sql_se(source, title):
            r2 = common.get_request(url)
            title2, content, pub_time = page_parse(r2.text)
            common.sql_insert(source, url, title, content, pub_time, '')
            common.rand_sleep(6, 2)
def get_first(keyword='android', payload=payload1):
    url_list = []
    payload['keyword'] = keyword
    r = common.get_request(url, params=payload, cookies=cookies_dict)  # save the current page
    # print r.text
    url_list = find_url(r.text)
    with open('company.html', 'w+') as f:
        f.write(r.text.encode('ISO-8859-1'))
    return r.text, url_list
def run_work(keyword='python'):
    url_list = get_url_list(keyword)
    for url_get in url_list:
        print url_get
        job_id = re.search('[0-9]+.html', url_get).group()[:-5]
        print job_id
        if not common.sql_select('job51', job_id):
            r = common.get_request(url_get)
            r.encoding = 'gb2312'
            job_dict = extract2(r.text)
            common.sql_main('job51', job_dict, url_get, job_id)
def run_work(url):
    cid = company_id(url)
    job_l = job_list(cid)
    for job_id in job_l:
        job_url = 'http://www.cjol.com/jobs/job-' + job_id
        print job_url
        print job_id
        if not common.sql_select('cjol', job_id):
            r = common.get_request(job_url)
            r.encoding = 'utf-8'
            job_dict = libcjolcompany.extract2(r.text)
            common.sql_main('cjol', job_dict, job_url, job_id)
def run_work(curl):
    url_all = get_url_all(curl)
    for url_get in url_all:
        print url_get
        job_id = re.search('[0-9]+.htm', url_get).group()[:-5]
        print job_id
        if not common.sql_select('zhilian', job_id):
            print common.sql_select('zhilian', job_id)
            r = common.get_request(url_get)
            r.encoding = 'utf-8'
            job_dict = libzlcompany.extract(r.text)
            common.sql_main('zhilian', job_dict, url_get, job_id)
def all_blog():
    url_cnblog = 'http://www.cnblogs.com/AllBloggers.aspx'
    aa = common.get_request(url_cnblog)
    soup_a = BeautifulSoup(aa.text, 'html.parser')
    aa = soup_a.find_all('td')
    aa = aa[1:]
    a_dict = dict()
    for i in aa:
        blog_url = i.a.get('href')
        blog_name = blog_url[blog_url.find('com/') + 4:-1]
        blog_cnname = i.a.get_text().strip()
        a_dict.update({blog_name: blog_cnname})
    return a_dict
def run1():
    db = MySQLdb.connect(**common.sql_config)
    cursor = db.cursor(MySQLdb.cursors.SSCursor)
    sql_1 = """select id, url, content from news """
    cursor.execute(sql_1)
    print cursor.rowcount
    i = 0
    row = cursor.fetchone()
    # fetchmany() returns an empty list once the result set is exhausted,
    # which ends the loop.
    while row:
        i += 1
        if i % 100 == 0:
            print i, 666666666666666
        row = cursor.fetchmany(size=500)
        # print row
        for row_id, url, content in row:
            # print row_id
            if comb(content, 250) and 'v2ex.com' not in url:
                # print content, 111111111111111111111
                r = common.get_request(url)
                if r.url.startswith('http://mp.weixin.qq.com/'):
                    soup2 = BeautifulSoup(r.text, 'html.parser')
                    title = soup2.find('title').get_text().encode('utf8')
                    content = soup2.find('div', {'class': 'rich_media_content'})
                    content = unicode(content).encode('utf8')
                else:
                    content = Document(r.text.encode(
                        r.encoding, 'ignore')).summary().encode('utf-8')
                    title = Document(r.text.encode(
                        r.encoding)).short_title().encode('utf-8')
                db2 = MySQLdb.connect(**common.sql_config)
                cursor2 = db2.cursor()
                if not comb(content, 250) and 'mp.weixin.qq.com' in url:
                    sql = """update news set rating = 0, content = '{}' where id = '{}'""".format(
                        db2.escape_string(content), row_id)
                    print 2222222222
                else:
                    sql = """update news set rating = -1, content = '{}' where id = '{}' """.format(
                        db2.escape_string(content), row_id)
                try:
                    cursor2.execute(sql)
                    db2.commit()
                except Exception as e:
                    print e
                    db2.rollback()
                db.ping(True)
                db2.close()
                print row_id, 777777777777777777777
                print url
def run(url, tp):
    aa = common.get_request(url, timeout=8)
    url2 = url
    page = 1
    flag = True
    keyword_all = []
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        if aa.text.find(u'下一页') < 0:  # u'下一页' is the "next page" link text
            flag = False
        else:
            page += 1
            headers['refer'] = url
            url = url2 + "?page={}".format(page)
            print url
            aa = common.get_request(url, headers=headers, timeout=8)
            common.rand_sleep(9, 4)
    return keyword_all
def get_image_address(url):
    newString = url_to_string(url)
    image = get_request(url)
    if str(image).isdigit():
        pass
    else:
        soup_image = BeautifulSoup(image)
        address = []
        img = soup_image.findAll("a", {'target': '_blank'})
        if len(img):
            for val in img:
                address.append(val.get("href"))
            for val in address:
                imagesAddress.append(newString + val)
def job_list(c_id):
    param = {
        'CompanyID': c_id,
        'PageNo': '1',
        'PageSize': '100',
    }
    header = {
        'Host': 'www.cjol.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36'
    }
    curl = 'http://www.cjol.com/jobs/company/joblist'
    r = common.get_request(curl, params=param, headers=header)
    job_l = []
    soup = BeautifulSoup(r.text, 'html.parser')
    soupa = soup.find_all('a')
    for a in soupa:
        job_id = company_id(a.get('href'))
        job_l.append(job_id)
    return job_l
def main():
    rootdir = os.getcwd()
    print rootdir
    try:
        company_list_dir = os.path.join(rootdir, 'juzi')
        # for subdir, dirs, files in os.walk(company_list_dir):
        #     for file in files:
        #         logger.info('current file is {}'.format(file))
        #         fff = os.path.join(subdir, file)
        #         with open(fff) as f:
        #             ff = f.read()
        #         url_list = find_all_link(ff)
        #         print url_list
        #         for i in url_list:
        for num in xrange(35770, 36000):
            i = 'http://www.itjuzi.com/company/' + str(num)
            try:
                logger.info('current url is {}'.format(i))
                juzi_id = i.replace('http://www.itjuzi.com/company/', '')
                if not sql_sel(juzi_id):
                    logger.info('try to insert {} into mysql'.format(juzi_id))
                    gs_fp = os.path.join(rootdir, 'juzicompany')
                    if not os.path.exists(gs_fp):
                        os.makedirs(gs_fp)
                    job_id = str(juzi_id)
                    job_id = job_id.rjust(5, '0')
                    store_path = os.path.join(gs_fp, job_id[0:3], job_id + '.html')
                    father_dir = os.path.dirname(store_path)
                    if not os.path.exists(father_dir):
                        os.makedirs(father_dir)
                    r = common.get_request(i)
                    if r:
                        with open(store_path, 'w+') as f:
                            # encode the Chinese page before writing to disk
                            f.write(r.text.encode('utf8'))
                        ll = parse_page(r.text)
                        sql_in(juzi_id, ll)
                        common.rand_sleep(5, 2)
            except:
                logger.error('something wrong ', exc_info=True)
    except:
        logger.error('something wrong ', exc_info=True)
def get_url_list(keyword):
    url_list = []
    for industry in industry_list:
        payload1['industrytype'] = industry
        s = requests.Session()
        first_result = get_first(keyword, payload1)
        first_page = first_result[0]
        n = 1
        next_url = get_next(first_page)
        url_list.extend(first_result[1])
        while next_url:
            # print next_url
            n += 1
            fname = 'company' + str(n) + '.html'
            r = common.get_request(next_url, payload1)
            # r = s.get(next_url, params=payload1, cookies=cookies_dict)
            url_list2 = find_url(r.text)
            url_list.extend(url_list2)
            next_url = get_next(r.text)
    url_list = list(set(url_list))
    print len(url_list)
    return url_list
def getpage(page):
    url = 'http://www.v2ex.com/go/cv?p={}'.format(page)
    r = common.get_request(url)
    return r
# fragment of a feed-processing loop; the excerpt is truncated below
print title
content = aa.description.encode('utf8')
if url_list[i]['full'] == 1:
    if aa.has_key('content'):
        content = aa.content[0]['value'].encode('utf8')
        logging.info('{} has full context output'.format(i))
pub_time = aa.published_parsed
pub_time = datetime.datetime.fromtimestamp(mktime(pub_time))
print pub_time
if not select(url, i):
    i_num2 += 1
    if url_list[i]['full'] != 1:
        try:
            if i == 'oschina blog':
                url_2 = url + '?fromerr=dy4SuBAE'
                r = common.get_request(url_2)
            else:
                r = common.get_request(url)
            print r.url
            print r.encoding
            soup = BeautifulSoup(r.text.encode(r.encoding), 'html.parser')
            keyword = soup.find('meta', {'name': 'keywords'})
            print r.encoding
            if keyword:
                keyword = keyword.get('content')
                keyword = keyword.encode('utf8', 'ignore')
            else:
                keyword = ''
            try:
                if i == 'phphub':
                    keyword = soup.find('div', {'class': 'meta inline-block'}).a.get_text()
async def verify(repos):
    requests = [get_request(URL_FORMAT.format(r)) for r in repos]
    # return_exceptions=True means failed requests come back as Exception
    # objects, so keep only dict responses before checking for 'full_name'.
    responses = await asyncio.gather(*requests, return_exceptions=True)
    return [x['full_name'] for x in responses
            if isinstance(x, dict) and 'full_name' in x]
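# Hedged usage sketch for verify(): assumes get_request is an async helper that
# GETs a GitHub-style repo endpoint built from URL_FORMAT and returns the parsed
# JSON dict; the repo names below are placeholders.
import asyncio

if __name__ == '__main__':
    matched = asyncio.run(verify(['octocat/Hello-World', 'someone/maybe-missing']))
    print(matched)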
def get_spotify_track_data(title, data, token):
    """
    Try to match tracks with their spotify id.
    json.dumps gets better match in cases where artist name or track name
    includes non alphanumeric characters.
    """
    search_endpoint = 'https://api.spotify.com/v1/search?'
    tracks_data = {}
    not_found = []
    if data:
        for i in data:
            artist = json.dumps(urllib.parse.quote_plus(i[0]))
            track = json.dumps(urllib.parse.quote_plus(i[1]))
            query = ''.join([
                'q=', 'artist:"', artist, '"+',
                'track:"', track, '"&type=track&limit=1'
            ])
            url = search_endpoint + query
            r = common.get_request(url, {'Authorization': 'Bearer ' + token})
            if r.status_code == 200:
                d = r.json()
                if d:
                    if d['tracks']['items'] and d['tracks']['total'] > 0:
                        album_info = d['tracks']['items'][0]
                        album_name = album_info['album']['name']
                        album_id = album_info['album']['id']
                        album_url = album_info['album']['external_urls']['spotify']
                        artist_info = album_info['artists'][0]
                        artist_name = artist_info['name']
                        artist_id = artist_info['id']
                        artist_url = artist_info['external_urls']['spotify']
                        track_name = album_info['name']
                        track_id = album_info['id']
                        track_uri = album_info['uri']
                        track_url = album_info['external_urls']['spotify']
                        tracks_data[track_id] = {
                            'track_name': track_name,
                            'track_url': track_url,
                            'track_uri': track_uri,
                            'album': {
                                'album_id': album_id,
                                'album_name': album_name,
                                'album_url': album_url
                            },
                            'artist': {
                                'artist_id': artist_id,
                                'artist_name': artist_name,
                                'artist_url': artist_url
                            }
                        }
                        # # debugging block
                        # if i[1] not in track_name.lower():
                        #     print('track name difference')
                        #     print(i[0], i[1])
                        #     print(url)
                        #     # pprint(d)
                        #     pprint(tracks_data[track_id])
                        #     print()
                        # if i[0] not in artist_name.lower():
                        #     print('artist name difference')
                        #     print(i[0], i[1])
                        #     print(url)
                        #     pprint(tracks_data[track_id])
                        #     print()
                    else:
                        print('There was a problem matching track')
                        print(i)
                        print(d)
                        print()
                        not_found.append(i)
            else:
                print('There was a problem with the request')
                print(r)
                not_found.append(i)
    if tracks_data:
        common.save_to_json(tracks_data, './json/' + title + '_data.json')
        print(len(tracks_data), 'tracks identified')
        print()
    if not_found:
        common.save_to_json(not_found, './json/' + title + '_not_found.json')
        print(len(not_found), 'unidentified tracks')
        pprint(not_found)
        print()
    return tracks_data
def returnUrlAddress():
    html = get_request(domain)
    urlList = detail_url_list(html)
    image_address(urlList)
    return imagesAddress
def company_parse(html):
    # Parse a Lagou company profile page.
    soup = BeautifulSoup(html, 'html.parser')
    base_info = soup.find('div', {'id': 'basic_container'})
    aa = base_info.find_all('li')
    company_type, company_process, company_size, company_city, company_product, job_num = '', '', '', '', '', ''
    company_name, company_url, company_word = '', '', ''
    company_main = soup.find('a', {'class': 'hovertips'})
    company_short_name = ''
    try:
        company_name = company_main.get('title').strip()
        company_short_name = company_main.get_text().strip()
        print company_short_name
    except:
        pass
    try:
        company_url = company_main.get('href')
    except:
        pass
    try:
        company_word = soup.find('div', {
            'class': 'company_word'
        }).get_text().strip()
    except:
        pass
    # print company_name, company_url, company_word
    company_leader = ''
    soup3 = soup.find_all('p', {'class': 'item_manager_name'})
    for i3 in soup3:
        company_leader += i3.span.text + ','
    for i1 in aa:
        # print i1.i.get('class')
        if 'type' in i1.i.get('class'):
            company_type = i1.span.text
        if 'process' in i1.i.get('class'):
            company_process = i1.span.text
        if 'number' in i1.i.get('class'):
            company_size = i1.span.text
        if 'address' in i1.i.get('class'):
            company_city = i1.span.text
    company_product_soup = soup.find_all('div', {'class': 'product_url'})
    for i2 in company_product_soup:
        company_product += i2.a.text.strip() + ','
    soup2 = soup.find('div', {'class': 'company_data'}).find_all('li')
    job_num, job_percent, job_day, job_feedback, last_login = '', '', '', '', ''
    logo = ''
    try:
        logo_url = soup.find('img', {'alt': u'公司Logo'}).get('src')
        print logo_url
        r_img = common.get_request(logo_url)
        logo = r_img.content.encode('base64').replace('\n', '')
    except:
        pass
    company_tag = soup.find_all('li', {'class': 'con_ul_li'})
    tag_str = ''
    try:
        tag_str = ','.join([i.get_text().strip() for i in company_tag])
        print tag_str
    except:
        pass
    try:
        company_desc = soup.find('div', {
            'class': 'company_intro_text'
        }).span.get_text().strip()
    except:
        company_desc = ''
    try:
        job_num = soup2[0].strong.text.strip()
    except:
        pass
    try:
        job_percent = soup2[1].strong.text.strip()
    except:
        pass
    try:
        job_day = soup2[2].strong.text.strip()
    except:
        pass
    try:
        job_feedback = soup2[3].strong.text.strip()
    except:
        pass
    try:
        last_login = soup2[4].strong.text.strip()
    except:
        pass
    if soup.find('a', {'class': 'identification'}):
        company_verify = '1'
    else:
        company_verify = '0'
    # print company_type, company_process, company_size, company_city, company_product, company_verify
    # print job_num, job_percent, job_day, job_feedback, last_login
    # print company_leader, company_name, company_url, company_word
    company_dict = {
        'company_type': company_type,
        'company_process': company_process,
        'company_size': company_size,
        'company_city': company_city,
        'company_product': company_product,
        'company_verify': company_verify,
        'job_num': job_num,
        'job_percent': job_percent,
        'job_day': job_day,
        'job_feedback': job_feedback,
        'last_login': last_login,
        'company_leader': company_leader,
        'company_name': company_name,
        'company_url': company_url,
        'company_word': company_word,
        'company_tag': tag_str,
        'company_short_name': company_short_name,
        'company_desc': company_desc,
        'logo': logo
    }
    # print len(company_dict)
    return company_dict