def save_one_page_movie(link):
    con2 = get_and_sleep(link)
    soup2 = BeautifulSoup(con2)
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # one entry per movie
        link2 = la2.find("span")
        list3 = link2.find_all("date")
        # `page` is the global loop counter set in the __main__ block below
        savecontent("test" + str(page) + "a.html", "\n" + link2.contents[0])    # title
        savecontent("test" + str(page) + "a.html", "\n" + list3[0].get_text())  # catalogue number
        savecontent("test" + str(page) + "a.html", "\n" + list3[1].get_text())  # release date
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3)
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")[2]
        duration = movie.get_text()
        savecontent("test" + str(page) + "a.html", "\n" + duration)  # duration
        savecontent("test" + str(page) + "a.html", "\n" + link3)     # detail page link
    # return the link of the next page, if there is one
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        return nextlink
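# get_and_sleep() and savecontent() used above are not defined in this section; a
# minimal sketch of what they might look like, assuming get_and_sleep() fetches a
# page through the module-level requests session (created in the __main__ block
# below) and pauses to rate-limit, and savecontent() appends text to a local file.
# The names, delay value and encoding handling are assumptions, not the original
# implementation:
import time

def get_and_sleep(url, delay=1):
    # fetch the page body, then pause so consecutive requests are spaced out
    resp = session.get(url)
    time.sleep(delay)
    return resp.content

def savecontent(filename, text):
    # append one chunk of text to the given file (Python 2: encode unicode first)
    if isinstance(text, unicode):
        text = text.encode("utf-8")
    with open(filename, "a") as fp:
        fp.write(text)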
def unescape(cls, string):
    html_parser = HTMLParser.HTMLParser()
    html = html_parser.unescape(string)
    html = html[1 + html.find('>'):html.rfind('<')].strip()
    e_time = ''
    try:
        text = BeautifulSoup(html).find(
            'a', class_='answer-date-link last_updated meta-item').getText().strip()
        if text.startswith(u'编辑于'):  # the label means "edited on"
            e_time = parse_time(text)
    except:
        pass
    return html, e_time
def find_imgs(self, uri):
    url = HOST + uri
    soup = BeautifulSoup(self.get(url))
    img_list = []
    for input in soup.find_all('input', type="image"):
        img = input['src']
        content = self.get(img)
        filename = sha1(content) + img[img.rfind('.'):]
        save(content, filename)
        img_list.append({
            'url': img,
            'hash': filename,
        })
    return img_list
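# sha1() and save() above are external helpers as well; a plausible sketch, assuming
# sha1() returns the hex digest of the downloaded bytes (used as a content-addressed
# filename) and save() writes the bytes into a download directory (the directory
# name is an assumption):
import hashlib
import os

def sha1(content):
    # hex digest of the raw image bytes
    return hashlib.sha1(content).hexdigest()

def save(content, filename, directory='downloads'):
    # write the image bytes under the download directory
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, filename), 'wb') as f:
        f.write(content)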
def main(uid):
    client = Client(APP_KEY, APP_SECRET, CALLBACK_URL,
                    username=USERID, password=PASSWD)
    data = client.get('statuses/friends_timeline')
    statuses = [status for status in data['statuses']
                if status['user']['id'] == uid]
    statuses.sort(key=lambda x: x['id'], reverse=True)
    if not statuses:
        return
    weibo = get_weibo(uid)
    newest = get_user(statuses[0])
    if weibo.user is None:
        weibo.user = newest
    diff = weibo.user.diff(newest)
    if diff:
        weibo.user = newest
        # notification: "<name>'s Weibo profile was updated" / "... has the following changes:"
        send_noti(u'{} 的微博资料更新'.format(weibo.user.name),
                  u'{} 的微博资料有如下更新:\n{}'.format(weibo.user.name, u'\n'.join(diff)))
    tweet = get_tweet(statuses[0])
    has_new = weibo.last != tweet.id
    if has_new:
        weibo.last = tweet.id
        weibo.tweets.append(tweet)
        # notification: "<name> posted a new Weibo" / "<name> posted via <client>, content: ..."
        send_noti(u'{} 发新微博啦~'.format(weibo.user.name),
                  u'{} 通过【{}】发送了一条微博,内容是:\n{}'.format(
                      weibo.user.name,
                      BeautifulSoup(tweet.source).getText(),
                      tweet.text))
    if has_new or diff:
        save_weibo(uid, weibo)
def run(self):
    for page in xrange(self.end, self.start - 1, -1):
        logger.info('crawl t66y page %s' % page)
        html = self.get(HOST + 'thread0806.php?fid=8&search=&page=' + str(page))
        soup = BeautifulSoup(html)
        self.parse_catalog(soup)
def _run(self, offset):
    result = []
    data = {
        'method': 'next',
        'params': '{"topic_id":%s,"offset":%s,"hash_id":""}' % (self.topic_id, offset),
        '_xsrf': '690bc39a80e99e2089d2e0cb0f504e0d',
    }
    resp = self.post(TOPIC_URL, data)
    msgs = loads(resp)['msg']
    for msg in msgs:
        soup = BeautifulSoup(msg)
        a = soup.find('a')
        link = a['href'].strip()
        result.append({
            '_id': link[link.rfind('/') + 1:],
            'link': ZHIHU_URL + link,
            'topic': a.getText().strip(),
            'icon': a.find('img')['src'].strip(),
        })
    return result
def jia(self, con1):
    # https://avmo.pw/cn/actresses//page/2
    soup1 = BeautifulSoup(con1["value"])
    list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
    for la in list:  # one entry per actress
        actor = la.find("span").get_text()
        link = la["href"]
        item = {"type": "2", "value": link, "actor": actor}
        self.url_queue.put(item)
    # queue the next listing page, if there is one
    next = soup1.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        item = {"type": "1", "value": "https://avmo.pw" + nextlink}
        self.url_queue.put(item)
def run(self, question_id):
    html = self.get(QUESTION_URL.format(id=question_id))
    soup = BeautifulSoup(html)
    question = QuestionParser(self, soup).parse()
    question['_id'] = question_id
    for index in forever():
        ids, has_more = self._run(question_id, index * 20)
        question['answers'].extend(ids)
        QuestionParser.save(question)
        logger.info('update question %s-%s' % (index, question_id))
        if not has_more:
            break
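# forever() is not defined in this section; from its use above it looks like an
# unbounded counter over page offsets, equivalent to itertools.count(). A minimal
# sketch under that assumption:
def forever():
    index = 0
    while True:
        yield index
        index += 1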
def _find_following(self, offset):
    data = {
        'method': 'next',
        'params': '{"offset":%s,"order_by":"created","hash_id":"%s"}'
                  % (offset, self.data['_id']),
        # placeholder: copy this value from the page source after logging in
        '_xsrf': '在登录之后的网页上复制出这个值',
    }
    html = self.post('http://www.zhihu.com/node/ProfileFolloweesListV2', data=data)
    data = json.loads(html)
    rst = []
    for item in data['msg']:
        soup = BeautifulSoup(item)
        a = soup.find('a')
        rst.append({
            'selfuid': a.get('href', '')[8:],
            '_id': soup.find('button')['data-id'],
            'nickname': a.get('title', ''),
        })
    return rst
def jian(self, con2):
    soup2 = BeautifulSoup(con2["value"])
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # one entry per movie
        link2 = la2.find("span")
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3)
        title = soup3.find_all("div", attrs={"class": "container"})[1].find("h3")
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")
        # write title, catalogue number, release date, duration and actress name
        self.fp.write("\n" + link2.contents[0] +
                      "\n" + movie[0].get_text() +
                      "\n" + movie[1].get_text() +
                      "\n" + movie[2].get_text() +
                      "\n演员:" + con2["actor"])  # "演员" == actress
        self.fp.flush()
    # re-queue the next page of this actress's movies, if there is one
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        con2["value"] = "https://avmo.pw" + nextlink
        self.url_queue.put(con2)
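# jia() and jian() look like two stages of a producer/consumer pipeline sharing
# self.url_queue: jia() walks the actress listing and queues a {"type": "2"} item per
# actress, while jian() walks one actress's movie pages and re-queues the next page.
# The loop that drains the queue is not shown in this section; a rough sketch,
# assuming each queued item carries a URL that is fetched and then dispatched on its
# "type" field (the method name, the fetch step and the dispatch are assumptions):
def worker(self):
    while True:
        item = self.url_queue.get()
        # replace the queued URL with the downloaded page before dispatching
        item["value"] = get_and_sleep(item["value"])
        if item["type"] == "1":
            self.jia(item)   # actress listing page
        else:
            self.jian(item)  # one actress's movie pages
        self.url_queue.task_done()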
def run(self):
    base_url = TOPIC_URL.format(tid=self.topic_id)
    for page in xrange(1, 10):
        url = base_url + str(page)
        logger.info('crawling topic %s, page %s' % (self.topic_id, page))
        try:
            html = self.get(url)
            soup = BeautifulSoup(html)
        except:
            return
        for answer in self._run(soup):
            answer['topics'].append(self.topic_id)
            answer['updateTime'] = now()
            result = self.save(answer)
            if result < 0:
                return
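# now() above is another external helper; presumably it returns the current time for
# the updateTime field. One plausible definition (the return type is an assumption):
import time

def now():
    return int(time.time())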
def _run(self, question_id, offset):
    data = {
        'method': 'next',
        'params': '{"url_token":%s,"pagesize":20,"offset":%s}' % (question_id, offset),
        '_xsrf': self.xsrf,
    }
    rst_json = json.loads(self.post(QUESTION_XHR_URL, data=data, timeout=60))
    rst = []
    for index, item in enumerate(rst_json['msg']):
        sleep(1)
        soup = BeautifulSoup(item)
        answer = AnswerParser(self, soup).parse_imgs()  # self.build_answer_item(soup)
        if not answer:
            continue
        AnswerParser.save(answer)
        rst.append(answer['_id'])
    return rst, len(rst_json['msg']) == 20
if __name__ == '__main__':
    session = requests.session()
    logging.basicConfig(
        filename='server.log',
        level=logging.INFO,
        format='%(asctime)s,%(levelname)s,%(filename)s:%(lineno)d,%(threadName)s:%(message)s',
        datefmt='[/%Y/%m%d-%H:%M:%S]')
    try:
        for page in range(1, 200):  # range() excludes the upper bound
            con1 = get_and_sleep('https://avmo.pw/cn/actresses/page/' + str(page))
            # pull the actress names and profile links out of the listing page
            soup1 = BeautifulSoup(con1)
            list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
            for la in list:  # one entry per actress
                # actress name, prefixed with @@@ as a record separator
                savecontent("test" + str(page) + "a.html",
                            "\n@@@" + la.find("span").get_text())
                link = la["href"]
                # first page of this actress's movies
                nextlink = save_one_page_movie(link)
                # follow pagination until there is no next page
                while nextlink is not None:
                    nextlink = save_one_page_movie("https://avmo.pw" + nextlink)
    except:
        logging.exception("exception")