def get_article_num(tag_id):
    num = 0
    base_url = 'http://www.huxiu.com/tags/{}.html'.format(tag_id)
    api_url = 'http://www.huxiu.com/v2_action/tag_article_list'
    post_data['tag_id'] = tag_id
    page = 1
    try:
        post_data['page'] = page
        common.rand_sleep(5, 5)
        res = s.post(api_url, data=post_data)
        res_data = json.loads(res.text.encode('utf8'))
        total_page = res_data['total_page']
        if total_page == 1:
            # Only one page: count the articles listed on the tag page itself
            common.rand_sleep(5, 5)
            res = s.get(base_url)
            res.encoding = "utf-8"
            soup = BeautifulSoup(res.text, 'html.parser')
            article_box = soup.find('div', class_='related-article')
            article_list = article_box.find_all('li')
            num = len(article_list)
        else:
            # Estimate the article count from the number of pages
            num = 10 * total_page
    except Exception, e:
        # print Exception, e
        logging.error('run error', exc_info=True)
    return num
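# A minimal sketch of the module-level setup that get_article_num() above assumes.
# The names `s` and `post_data` appear in the original code; everything else here
# (imports, header values) is an assumption for illustration, not the project's file.
import json
import logging

import requests
from bs4 import BeautifulSoup

import common  # project helper module (rand_sleep etc.), assumed importable

s = requests.Session()                            # shared session used by s.get()/s.post()
s.headers.update({'User-Agent': 'Mozilla/5.0'})   # assumed header
post_data = {}                                    # 'tag_id' and 'page' are set by get_article_num()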
def main():
    # Log in
    l.login()
    # API URL that returns the topic data
    base_url = 'http://www.tuicool.com/topics/my_hot?id=1'
    make_dir(base_path)
    try:
        common.rand_sleep(5, 10)
        res = l.session.get(base_url)
        logging.info('return url {} success'.format(res.url))
        res_data = json.loads(res.text)
        # List of topic categories
        class_list = res_data['cats']
        for class_item in class_list:
            class_path = base_path + '/' + str(class_item['id'])\
                + '_' + class_item['name'].encode('utf8')
            make_dir(class_path)
            # List of topics in this category
            topic_list = class_item['items']
            for topic in topic_list:
                topic_path = class_path + '/' + str(topic['id'])\
                    + '_' + topic['name'].encode('utf8')
                make_dir(topic_path)
                get_articles_in_topic(str(topic['id']), topic_path)
    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
def get_article(article_id, abs_file_path):
    '''Fetch one article. Return True on success; return False if the
    article was published more than a week ago.'''
    article_url = 'http://www.tuicool.com/articles/{}'.format(article_id)
    try:
        print article_url
        common.rand_sleep(5, 10)
        res = l.session.get(article_url)
        logging.info('return url {} success'.format(res.url))
        soup = BeautifulSoup(res.text, 'html.parser')
        title = str(soup.find('div', class_='article_detail_bg').find('h1')\
            .get_text())
        print title
        # Strip the leading "时间" label from the timestamp text
        pub_time = re.sub(re.compile('时间[\s\S]{2}'), '', \
            str(soup.find('span', class_='timestamp').get_text()).strip())
        keywords = [str(item.get_text())\
            for item in soup.find_all('span', class_='new-label')]
        content = str(soup.find('div', class_='article_body'))
        # Only grab articles published within the last week
        timedelta = datetime.date.today()-datetime.datetime\
            .strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
        if timedelta.days > 7:
            return False
        with open(abs_file_path, 'w') as f:
            f.write('标题:' + title + '\n')
            f.write('发布时间:' + pub_time + '\n')
            f.write('关键字:' + ', '.join(keywords) + '\n')
            f.write('内容:' + content + '\n')
        return True
    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
        return False
def main():
    page = 1
    tag_url = 'https://segmentfault.com/tags/all?page={}'
    while 1:
        cur_url = tag_url.format(page)
        common.rand_sleep(5, 10)
        res = s.get(cur_url)
        soup = BeautifulSoup(res.text, 'html.parser')
        tags_list = soup.find_all('section', class_='tag-list__item')
        for tag_section in tags_list:
            tag_name = tag_section.find('h2').find('a').get_text()\
                .encode('utf8').strip()
            # Check whether this tag has already been crawled
            with open('segmentfault_done.txt', 'r') as sdf:
                content = sdf.read()
            if content.find(','+tag_name+',') == -1:
                num = get_article_num(tag_name)
                with open('segmentfault_tags.txt', 'a') as stf:
                    stf.write(tag_name+':'+str(num)+'\n')
            else:
                continue
            with open('segmentfault_done.txt', 'a') as f:
                f.write(','+tag_name)
        # Check whether there is a next page
        page += 1
        re_str = r'/tags/all\?page={}'.format(page)
        pat = re.compile(re_str)
        s_r = re.search(pat, res.text)
        if s_r is None:
            break
        else:
            continue
def run(page):
    r = getpage(page)
    url_list = pageparse(r.text)
    for i in url_list:
        url = i.split('#')[0]
        common.rand_sleep(6, 2)
        ff = get_page_one(url)
        if not sql_sel(url):
            sql_in(url, ff)
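# common.rand_sleep(base, jitter) is used throughout these crawlers as a politeness
# delay between requests. Its implementation is not shown in this section; the version
# below -- sleeping for `base` seconds plus a random extra of up to `jitter` seconds --
# is only an assumption that is consistent with how it is called.
import random
import time

def rand_sleep(base, jitter):
    # Sleep slightly longer than `base` so request timing is not perfectly regular.
    time.sleep(base + random.uniform(0, jitter))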
def get_articles_in_topic(topic_id, topic_path):
    '''Fetch the articles under a topic.'''
    tp_base_url = 'http://www.tuicool.com/topics/{}'.format(topic_id)\
        + '?st=0&lang=1&pn={}'
    # Check whether this topic has already been fully crawled
    if os.path.exists(topic_path + '/' + 'done'):
        print str(topic_id) + ': done'
        return
    page = 0
    while 1:
        try:
            cur_url = tp_base_url.format(page)
            print cur_url
            common.rand_sleep(5, 10)
            res = l.session.get(cur_url)
            with open('temp.html', 'w') as f:
                f.write(res.text.encode('utf8'))
            logging.info('return url {} success'.format(res.url))
            soup = BeautifulSoup(res.text, 'html.parser')
            # Get the article list on this page
            articles_list = soup.find_all('div', class_='single_fake')
            for article in articles_list:
                article_id = str(article.find('a', class_='article-list-title')\
                    .get('href').split('/')[2])
                article_title = str(article.find('a', class_='article-list-title')\
                    .get_text()).strip()
                # Only fetch the article if it has not been saved yet
                abs_file_path = topic_path + '/' + article_id
                if not os.path.isfile(abs_file_path):
                    # Article is out of range (older than a week): mark the topic
                    # as done and leave it
                    if not get_article(article_id, abs_file_path):
                        with open(topic_path + '/' + 'done', 'w') as f:
                            pass
                        return
                else:
                    continue
            # Check whether there is a next page
            page += 1
            cur_url = tp_base_url.format(page)
            re_str = r'/topics/{}\?st=0&lang=1&pn={}'.format(topic_id, page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                # No next page: mark the topic as done
                with open(topic_path + '/' + 'done', 'w') as f:
                    pass
                break
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
def run_company(url, tp):
    # Written separately to work around itjuzi's crawl limits on company pages
    url2 = url
    page = 10
    flag = True
    keyword_all = []
    # Pages 2..2426; drop the ones that have already been saved locally
    l_list = range(2, 2427)
    l_list2 = copy.copy(l_list)
    for i in l_list2:
        store_path1 = os.path.join(common.root_path, 'juzi', str(i) + '.html')
        if os.path.isfile(store_path1):
            l_list.remove(i)
        else:
            print store_path1
    print len(l_list), 999999999
    print l_list
    aa = common.get_request(url, timeout=18)
    headers2 = {
        'Origin': 'http://www.itjuzi.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Accept': '*/*',
        'Referer': 'http://www.itjuzi.com/company?page=2410',
        'Cookie': 'grwng_uid=2e824b08-70ef-41f7-a9c8-62ff25d8f920; AWSELB=258D9D590E00B3DE939BD2301A2166BB8314D5BFDDA88D29F0E3F22E0935E83EF1C408A6B613204775BA26EA9BE8555ABB5A13289EDD9FCE01B44987A799A50A15E49578ED9A0D7D28BE3696012F59FED65EA97193',
        'Connection': 'keep-alive'
    }
    url_p = 'https://api.growingio.com/v2/eee5a46c52000d401f969f4535bdaa78/web/pv?stm=1459928218615'
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        headers['refer'] = url
        url = url2 + "?page={}".format(page)
        print url
        headers2['refer'] = url
        aa = common.get_request(url, timeout=18)
        # bb = common.post_request(url_p, headers=headers2, data='6\x86\xf0D\x08`\x96`\\`S$\x15\x82\x01`\x1b\x01\x8d\x90&\x002\x10\x09\x9a\xf8\x08\xc0\x19\x80\x9c\x19QZ\xc8\x0c\xcc\x80FDA\x00\xec\x00q\x80\r$\x00np\xc3\xe0\x07H\xd7\x80\x96"\x03\xa8!\x90 +\x88\xeeX+w\xcb\x8b.\x00\xb4dru\xd6\x84\xd1]\x11\x91`\x8b\xa7\x91\n\x17\xf0\xf5\xcd\xca\xbf0\x01\x9dUc)\xd7\'df\xdchDd\xc6\xa4\x8c\xba\xdc\x08&\xba\x92\x08DATT\xf8\xc8\xf8R`\x00."\x82\xeeY\x02\x19\x00\xb6pd\x0cI\xce\xb8d\xdc\xc8<\x02\xee\x00\x16p\x9c\x8c\x18\xb5\x00\xeeph%\x02D"\xad}bP\x19\x00VJ\x00^PbX\x00\xf6E\x02\x00\x0e"\x00\xf4\xd3\x05s\x10\x00v\x00\x9en\x00N\x14"u\x19\x19s\xb0\x8b\x8b}\xad\x03\xc3c\x13+\xcb3k[\x00\xfck\x00\xe6\x08\x00\xbc\xb8\x9dhn\x006"Q\x9dWE\x87Y\xb8\x00\x8e"7\xa7\xdb\xe6G\xc1\x80\x00\xbe|p\n\x9e\x06\xa0\xd1ht\xfaC\x18X.d\xb3Yl\xf6T\x93\x85\xc6\xe4\xf1\xa3\xbc\xbe\x7f X*\x113\xe0"Q\x18\x9cA&\x82H\xa4\xd2n\x0c\x80>\x08\x01\xb4T\x00\x7f*\x01\xd0\x95\x00\x16\x11\x80W\x0c\x80\x01\x00\x07\xc2P\x04\x90\x00\xaa\x00T-\x00\n\xda\xdc\xa8D\x1d\xed\xca*\xc0J\xc82\xb7\x02\xa5Q\xa9\x80\xe6Yx!\xd8\xe6\xe1\xeb\xc0.W\x11\xb8\xd2c3p-\xe0\xf7U\x86\xdb`#\xd8\x1c\x8e\'3\xad\xb0om\xb8\xcc\xdd\x8fM\x8b\xc3S\x09\xf8C\xd5\xef/\xa7^\x10\x88\x02\xe9\x00')
        # print bb.content
        # Fetch a random company page so the crawl looks less sequential
        common.get_request('http://www.itjuzi.com/company/{}'.format(
            random.choice(range(1, 500))), timeout=10)
        store_path = os.path.join(common.root_path, 'juzi', str(page) + '.html')
        with open(store_path, 'w+') as f:
            f.write(aa.text.encode('utf8'))
        common.rand_sleep(30, 15)
        if len(l_list) == 0:
            flag = False
        else:
            page = random.choice(l_list)
            l_list.remove(page)
    return keyword_all
def one_page(html, source):
    ll = link_list(html)
    common.rand_sleep(3, 1)
    for i in ll:
        url, title = i.split(',')
        logging.debug('next url is {}'.format(url))
        if not sql_se(source, title):
            r2 = common.get_request(url)
            title2, content, pub_time = page_parse(r2.text)
            common.sql_insert(source, url, title, content, pub_time, '')
            common.rand_sleep(6, 2)
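# Several of these crawlers call common.sql_insert(source, url, title, content,
# pub_time, keyword). The helper itself is not part of this section; the sketch
# below uses MySQLdb, and the connection settings and table/column names are
# assumptions rather than the project's real schema.
import MySQLdb

def sql_insert(source, url, title, content, pub_time, keyword):
    # Insert one crawled article; connection details are placeholders.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                           db='spider', charset='utf8')
    try:
        cur = conn.cursor()
        cur.execute(
            'INSERT INTO articles (source, url, title, content, pub_time, keyword) '
            'VALUES (%s, %s, %s, %s, %s, %s)',
            (source, url, title, content, pub_time, keyword))
        conn.commit()
    finally:
        conn.close()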
def count_articles_in_topic(topic_id):
    '''Count the articles published under a topic within the last week.'''
    result = 0
    tp_base_url = 'http://www.tuicool.com/topics/{}'.format(topic_id)\
        + '?st=0&lang=1&pn={}'
    page = 0
    while 1:
        try:
            cur_url = tp_base_url.format(page)
            print cur_url
            common.rand_sleep(5, 10)
            res = l.session.get(cur_url)
            logging.info('return url {} success'.format(res.url))
            soup = BeautifulSoup(res.text, 'html.parser')
            # Get the article list on this page
            articles_list = soup.find_all('div', class_='single_fake')
            for article in articles_list:
                article_id = str(article.find('a', class_='article-list-title')\
                    .get('href').split('/')[2])
                article_title = str(article.find('a', class_='article-list-title')\
                    .get_text()).strip()
                pub_time = str(article.find('div', class_='meta-tip')\
                    .find_all('span')[1].get_text()).strip()
                # Listings within the current year omit the year, so add it before
                # parsing; if a full year is shown, the article is old enough to stop
                if pub_time.find('201') == -1:
                    pub_time = '2016-' + pub_time + ':00'
                else:
                    return result
                # Check whether the article was published within the last week
                timedelta = datetime.date.today()-datetime.datetime\
                    .strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
                if timedelta.days <= 7:
                    result += 1
                else:
                    return result
            # Check whether there is a next page
            page += 1
            cur_url = tp_base_url.format(page)
            re_str = r'/topics/{}\?st=0&lang=1&pn={}'.format(topic_id, page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                return result
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
def get_page_one(url):
    r2 = common.get_request(url)
    cc = content(r2.text)
    # print cc
    if morepage(r2.text):
        common.rand_sleep(3, 2)
        r3 = common.get_request(url + '?p=1')  # in case there is a second page
        logger.info('{} has two pages, try to get page one'.format(url))
        cc += content(r3.text)
    # Extract, deduplicate and join the email addresses found in the page text
    dd = common.re_email(cc)
    print dd
    ee = list(set(dd))
    ff = ','.join(ee)
    return ff
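# get_page_one() above relies on common.re_email() to pull email addresses out of
# the fetched page text. The helper is not shown here; a minimal plausible version
# is sketched below, with the exact regular expression being an assumption.
import re

def re_email(text):
    # Return every email-like substring found in `text`.
    return re.findall(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', text)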
def main(blog_name):
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            # Article links on the current page
            b2 = soup_2.find_all(
                'a', {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            # Check whether there is a next page
            page += 1
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False
            else:
                url_1 = ppp.group()
                common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
def get_all_tags():
    tags_url = 'http://www.huxiu.com/tags'
    common.rand_sleep(5, 5)
    res = s.get(tags_url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, 'html.parser')
    with open('temp.html', 'w') as f0:
        f0.write(res.text)
    tag_boxs = soup.find_all('div', class_='tag-cnt-box')
    # Write every tag as "id:name", one per line
    with open('huxiu_tags.txt', 'w') as f:
        for box in tag_boxs:
            tags_list = box.find_all('li', class_='transition')
            for tag in tags_list:
                tag_id = tag.find('a').get('href').split('/')[-1].split('.')[0]
                tag_name = tag.find('a').get_text().encode('utf8').strip()
                # print tag_id, tag_name
                f.write(str(tag_id) + ':' + tag_name + '\n')
def main():
    source = 'mux'
    page = 1
    flag = True
    url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
    while flag:
        try:
            print url
            res = common.get_request(url)
            logging.info('return url {} success'.format(res.url))
            print res.url
            soup = BeautifulSoup(res.text, 'html.parser')
            with open('temp.html', 'w+') as f:
                f.write(res.text.encode('utf8'))
            articles = soup.find_all('div', class_='artical_inner')
            for item in articles:
                contents = item.contents
                article_url = contents[9].a.get('href')
                article_title = str(contents[3].a.get('title')).strip()
                if not common.select(article_url, source):
                    # Convert the "YYYY年MM月DD日" date into YYYY-MM-DD
                    pub_time = time.strftime('%Y-%m-%d',\
                        time.strptime(str(contents[5].get_text()).split('|')[-1].strip(),
                                      '%Y年%m月%d日'))
                    keyword = str(contents[5].get_text()).split('|')[-2].strip()
                    content = get_content(common.get_request(article_url).text)
                    print article_title
                    common.sql_insert(source, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            # Check whether there is a next page
            page += 1
            re_str = r'http://mux.baidu.com/\?page_id=10\S+paged={}'.format(page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                flag = False
            else:
                url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
                common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
def main():
    rootdir = os.getcwd()
    print rootdir
    try:
        company_list_dir = os.path.join(rootdir, 'juzi')
        # for subdir, dirs, files in os.walk(company_list_dir):
        #     for file in files:
        #         logger.info('current file is {}'.format(file))
        #         fff = os.path.join(subdir, file)
        #         with open(fff) as f:
        #             ff = f.read()
        #         url_list = find_all_link(ff)
        #         print url_list
        #         for i in url_list:
        for num in xrange(35770, 36000):
            i = 'http://www.itjuzi.com/company/' + str(num)
            try:
                logger.info('current url is {}'.format(i))
                juzi_id = i.replace('http://www.itjuzi.com/company/', '')
                if not sql_sel(juzi_id):
                    logger.info('try to insert {} into mysql'.format(juzi_id))
                    gs_fp = os.path.join(rootdir, 'juzicompany')
                    if not os.path.exists(gs_fp):
                        os.makedirs(gs_fp)
                    # Zero-pad the id and shard the saved pages into sub-directories
                    job_id = str(juzi_id)
                    job_id = job_id.rjust(5, '0')
                    store_path = os.path.join(gs_fp, job_id[0:3], job_id + '.html')
                    father_dir = os.path.dirname(store_path)
                    if not os.path.exists(father_dir):
                        os.makedirs(father_dir)
                    r = common.get_request(i)
                    if r:
                        with open(store_path, 'w+') as f:
                            f.write(r.text)
                        ll = parse_page(r.text)
                        sql_in(juzi_id, ll)
                common.rand_sleep(5, 2)
            except:
                logger.error('something wrong ', exc_info=True)
    except:
        logger.error('something wrong ', exc_info=True)
def login(self):
    '''Log in and return the cookie_str.'''
    # First fetch the authenticity_token from the login page
    common.rand_sleep(5, 10)
    res = self.session.get(self.url)
    soup = BeautifulSoup(res.text, 'html.parser')
    authenticity_token = soup.find('meta', attrs={'name': 'csrf-token'})\
        ['content']
    print 'authenticity_token: ' + authenticity_token
    self.req_params['authenticity_token'] = authenticity_token
    # Then log in with the username and password
    common.rand_sleep(5, 10)
    res = self.session.post(self.url, data=self.req_params,\
        verify=False)
    cookie = requests.utils.dict_from_cookiejar(self.session.cookies)
    cookie_str = "; ".join([str(x) + "=" + str(y) for x, y in cookie.items()])
    self.cookie_str = cookie_str
    self.session.headers['Cookie'] = cookie_str
    print 'cookie_str: ' + cookie_str
    return cookie_str
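# login() above is a method: it reads self.url, self.req_params and self.session.
# The surrounding class is not included in this section; the skeleton below is an
# assumption that merely matches those attribute names (the endpoint URL and form
# field names are guesses, not the site's real ones).
import requests

class Login(object):
    def __init__(self, username, password):
        self.url = 'http://www.tuicool.com/login'                      # assumed endpoint
        self.session = requests.Session()
        self.req_params = {'email': username, 'password': password}    # assumed field names
        self.cookie_str = ''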
def main():
    # Log in
    l.login()
    # API URL that returns the topic data
    base_url = 'http://www.tuicool.com/topics/my_hot?id=1'
    try:
        common.rand_sleep(5, 10)
        res = l.session.get(base_url)
        logging.info('return url {} success'.format(res.url))
        res_data = json.loads(res.text)
        result = {}
        # List of topic categories
        class_list = res_data['cats']
        with open('articles_count0.txt', 'w') as f0:
            for class_item in class_list:
                class_id_name = str(class_item['id'])\
                    + '_' + class_item['name'].encode('utf8')
                # List of topics in this category
                topic_list = class_item['items']
                for topic in topic_list:
                    topic_id_name = class_id_name + '_' + str(topic['id'])\
                        + '_' + topic['name'].encode('utf8')
                    num = count_articles_in_topic(str(topic['id']))
                    result[topic_id_name] = num
                    print topic_id_name, num
                    f0.write(topic_id_name + ': ' + str(num) + '\n')
        # Sort the results by topic name
        result = collections.OrderedDict(sorted(\
            result.items(), key=lambda t: t[0]))
        with open('articles_count.txt', 'w') as f:
            for topic_id, num in result.iteritems():
                f.write(topic_id + ': ' + str(num) + '\n')
    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
def run(url, tp):
    aa = common.get_request(url, timeout=8)
    url2 = url
    page = 1
    flag = True
    keyword_all = []
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        # Stop when the page no longer contains a "下一页" (next page) link
        if aa.text.find(u'下一页') < 0:
            flag = False
        else:
            page += 1
            headers['refer'] = url
            url = url2 + "?page={}".format(page)
            print url
            aa = common.get_request(url, headers=headers, timeout=8)
            common.rand_sleep(9, 4)
    return keyword_all
def get_article_num(tag_name):
    num = 0
    url = 'https://segmentfault.com/t/{}/blogs'.format(urllib.quote(tag_name))
    try:
        common.rand_sleep(5, 10)
        res = s.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')
        pagination = soup.find('ul', class_='pagination')
        if pagination is None:
            # Single page: count the articles directly
            article_list = soup.find_all('section', class_='stream-list__item')
            num = len(article_list)
        else:
            # Jump far past the end so the active pagination item is the last page,
            # then estimate the article count from that page number
            url += '?page=1000'
            common.rand_sleep(5, 10)
            res = s.get(url)
            soup = BeautifulSoup(res.text, 'html.parser')
            pagination = soup.find('ul', class_='pagination')
            total_page = pagination.find('li', class_='active').find('a').get_text()\
                .encode('utf8')
            num = int(total_page) * 15
    except Exception, e:
        logging.error('run error', exc_info=True)
    return num
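# A quick, hypothetical way to exercise the segmentfault counter above. The tag
# name is expected to be a UTF-8 byte string (the accompanying main() encodes it
# before passing it in), so urllib.quote can percent-encode non-ASCII tags.
if __name__ == '__main__':
    print get_article_num('python')
    print get_article_num(u'前端'.encode('utf8'))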