Example #1
def save_one_page_movie(link):
    con2 = get_and_sleep(link)
    #savecontent("test"+str(page)+"a.html",con2)
    soup2 = BeautifulSoup(con2)
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # each title on the page
        link2 = la2.find("span")
        list3 = link2.find_all("date")
        savecontent("test" + str(page) + "a.html", "\n" + link2.contents[0])  # title (`page` is the global page counter set in the main loop)
        savecontent("test" + str(page) + "a.html", "\n" + list3[0].get_text())  # product code
        savecontent("test" + str(page) + "a.html", "\n" + list3[1].get_text())  # release date
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3)
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")[2]
        #m1 = movie.contents[0]
        #print type(m1)
        #print m1.get_text()
        duration = movie.get_text()
        #print duration
        savecontent("test" + str(page) + "a.html", "\n" + duration)  # duration
        savecontent("test" + str(page) + "a.html", "\n" + link3)  # detail page link
        #break #break on one page one movie
    # return the next page link, or None on the last page
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        return nextlink
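The function hands back the href of the "nextpage" anchor, or None once the last page is reached, and the caller keeps calling it until then. A minimal self-contained sketch of that pattern, with a hypothetical PAGES dict standing in for get_and_sleep():

from bs4 import BeautifulSoup

# Hypothetical pages keyed by path, standing in for get_and_sleep().
PAGES = {
    "/page/1": '<a class="movie-box" href="/movie/1"></a>'
               '<a name="nextpage" href="/page/2">next</a>',
    "/page/2": '<a class="movie-box" href="/movie/2"></a>',  # last page: no "nextpage" anchor
}

def next_page(html):
    # attrs={"name": ...} is required here: find()'s own `name` parameter
    # already means "tag name", so it cannot match the HTML name attribute.
    soup = BeautifulSoup(html, "html.parser")
    nxt = soup.find("a", attrs={"name": "nextpage"})
    return nxt["href"] if nxt is not None else None

link = "/page/1"
while link is not None:
    print(link)  # -> /page/1, then /page/2
    link = next_page(PAGES[link])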
Example #2
 def unescape(cls, string):
     html_parser = HTMLParser.HTMLParser()
     html = html_parser.unescape(string)
     # strip the outer tag: keep what sits between the first '>' and the last '<'
     html = html[1 + html.find('>'):html.rfind('<')].strip()
     e_time = ''
     try:
         text = BeautifulSoup(html).find('a', class_='answer-date-link last_updated meta-item').getText().strip()
         if text.startswith(u'编辑于'):  # Zhihu's "edited at ..." prefix
             e_time = parse_time(text)
     except:
         pass
     return html, e_time
Example #3
 def find_imgs(self, uri):
     url = HOST + uri
     soup = BeautifulSoup(self.get(url))
     img_list = []
     for input in soup.find_all('input', type="image"):
         img = input['src']
         content = self.get(img)
         filename = sha1(content) + img[img.rfind('.'):]
         save(content, filename)
         img_list.append({
             'url': img,
             'hash': filename,
         })
     return img_list
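HOST, self.get(), sha1() and save() belong to the surrounding crawler class and are not shown here. A rough standalone equivalent using requests and hashlib (the function name, output directory and timeout are assumptions, and src values are assumed to be absolute URLs):

import hashlib
import os

import requests
from bs4 import BeautifulSoup

def find_imgs(page_url, out_dir="imgs"):
    # Download every <input type="image"> target and name the file by the
    # SHA-1 of its bytes, keeping the original extension.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    soup = BeautifulSoup(requests.get(page_url, timeout=30).text, "html.parser")
    img_list = []
    for inp in soup.find_all("input", type="image"):
        src = inp["src"]
        content = requests.get(src, timeout=30).content
        filename = hashlib.sha1(content).hexdigest() + src[src.rfind("."):]
        with open(os.path.join(out_dir, filename), "wb") as f:
            f.write(content)
        img_list.append({"url": src, "hash": filename})
    return img_list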
Example #5
 def unescape(cls, string):
     html_parser = HTMLParser.HTMLParser()
     html = html_parser.unescape(string)
     html = html[1 + html.find('>'):html.rfind('<')].strip()
     e_time = ''
     try:
         text = BeautifulSoup(html).find(
             'a', class_='answer-date-link last_updated meta-item').getText(
             ).strip()
         if text.startswith(u'编辑于'):
             e_time = parse_time(text)
     except:
         pass
     return html, e_time
Example #6
def main(uid):
    client = Client(APP_KEY, APP_SECRET, CALLBACK_URL, username=USERID, password=PASSWD)
    data = client.get('statuses/friends_timeline')
    statuses = [status for status in data['statuses'] if status['user']['id'] == uid]
    statuses.sort(key=lambda x: x['id'], reverse=True)
    if not statuses:
        return
    weibo = get_weibo(uid)
    newest = get_user(statuses[0])

    if weibo.user is None:
        weibo.user = newest

    diff = weibo.user.diff(newest)

    if diff:
        weibo.user = newest
        # notification: "<name>'s Weibo profile was updated"; the body lists the changed fields
        send_noti(u'{} 的微博资料更新'.format(weibo.user.name),
                  u'{} 的微博资料有如下更新:\n{}'.format(weibo.user.name, u'\n'.join(diff)))

    tweet = get_tweet(statuses[0])
    has_new = weibo.last != tweet.id

    if has_new:
        weibo.last = tweet.id
        weibo.tweets.append(tweet)
        # notification: "<name> posted a new weibo"; the body names the client app
        # (BeautifulSoup strips the HTML from the `source` field) and quotes the text
        send_noti(u'{} 发新微博啦~'.format(weibo.user.name),
                  u'{} 通过【{}】发送了一条微博,内容是:\n{}'.format(
                      weibo.user.name,
                      BeautifulSoup(tweet.source).getText(),
                      tweet.text
                  ))

    if has_new or diff:
        save_weibo(uid, weibo)
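The only BeautifulSoup call in this example strips the markup from the status source field, which the Weibo API returns as an HTML anchor naming the client app. A minimal illustration with a made-up source value:

from bs4 import BeautifulSoup

source = u'<a href="http://app.weibo.com/t/feature/1" rel="nofollow">iPhone 6s</a>'
print(BeautifulSoup(source, "html.parser").getText())  # -> iPhone 6s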
Example #7
 def run(self):
     for page in xrange(self.end, self.start - 1, -1):
         logger.info('crawl t66y page %s' % page)
         html = self.get(HOST + 'thread0806.php?fid=8&search=&page=' +
                         str(page))
         soup = BeautifulSoup(html)
         self.parse_catalog(soup)
Example #8
def save_one_page_movie(link):
    con2 = get_and_sleep(link)
    #savecontent("test"+str(page)+"a.html",con2)
    soup2 = BeautifulSoup(con2)
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # each title on the page
        link2 = la2.find("span")
        list3 = link2.find_all("date")
        savecontent("test" + str(page) + "a.html",
                    "\n" + link2.contents[0])  # title
        savecontent("test" + str(page) + "a.html",
                    "\n" + list3[0].get_text())  # product code
        savecontent("test" + str(page) + "a.html",
                    "\n" + list3[1].get_text())  # release date
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3)
        movie = soup3.find("div", attrs={
            "class": "col-md-3 info"
        }).find_all("p")[2]
        #m1 = movie.contents[0]
        #print type(m1)
        #print m1.get_text()
        duration = movie.get_text()
        #print duration
        savecontent("test" + str(page) + "a.html", "\n" + duration)  # duration
        savecontent("test" + str(page) + "a.html", "\n" + link3)  # detail page link
        #break #break on one page one movie
    # return the next page link, or None on the last page
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        return nextlink
Example #9
 def _run(self, offset):
     result = []
     data = {
         'method': 'next',
         'params': '{"topic_id":%s,"offset":%s,"hash_id":""}' % (self.topic_id, offset),
         '_xsrf': '690bc39a80e99e2089d2e0cb0f504e0d'
     }
     resp = self.post(TOPIC_URL, data)
     msgs = loads(resp)['msg']
     for msg in msgs:
         soup = BeautifulSoup(msg)
         a = soup.find('a')
         link = a['href'].strip()
         result.append({
             '_id': link[link.rfind('/')+1:],
             'link': ZHIHU_URL + link,
             'topic': a.getText().strip(),
             'icon': a.find('img')['src'].strip(),
         })
     return result
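The endpoint answers with JSON whose msg field is a list of HTML fragments, and each fragment is parsed on its own. A self-contained sketch of that step with a hard-coded response body (the markup below is a simplified stand-in for what Zhihu actually returns):

from json import loads

from bs4 import BeautifulSoup

ZHIHU_URL = 'http://www.zhihu.com'

# Simplified stand-in for the POST response body.
resp = '{"msg": ["<a href=\\"/topic/19550517\\"><img src=\\"http://pic.example/icon.jpg\\">Internet</a>"]}'

result = []
for msg in loads(resp)['msg']:
    soup = BeautifulSoup(msg, 'html.parser')
    a = soup.find('a')
    link = a['href'].strip()
    result.append({
        '_id': link[link.rfind('/') + 1:],   # trailing path segment used as the id
        'link': ZHIHU_URL + link,
        'topic': a.getText().strip(),
        'icon': a.find('img')['src'].strip(),
    })
print(result)  # [{'_id': '19550517', ...}]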
Example #10
 def jia(self, con1):
     # https://avmo.pw/cn/actresses//page/2
     # logging.error("call jia")
     soup1 = BeautifulSoup(con1["value"])
     list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
     for la in list:  # each actress
         #savecontent("\n@@@" + la.find("span").get_text())    # actress name, @@@ marks the start of a record
         actor = la.find("span").get_text()
         link = la["href"]  #la.get("href")
         item = {"type": "2", "value": link, "actor": actor}
         #con1["value"] = link
         #con1["type"] = "2"
         #con1["actor"] = actor
         # logging.error("jia put" + link)
         self.url_queue.put(item)
     # next page
     next = soup1.find("a", attrs={"name": "nextpage"})
     if next is not None:
         nextlink = next["href"]
         # logging.error("jia next =:" + nextlink)
         item = {"type": "1", "value": "https://avmo.pw" + nextlink}
         self.url_queue.put(item)
Example #11
    def run(self, question_id):
        html = self.get(QUESTION_URL.format(id=question_id))
        soup = BeautifulSoup(html)
        question = QuestionParser(self, soup).parse()
        question['_id'] = question_id

        for index in forever():
            ids, has_more = self._run(question_id, index * 20)
            question['answers'].extend(ids)
            QuestionParser.save(question)
            logger.info('update question %s-%s' % (index, question_id))
            if not has_more:
                break
Example #12
 def _find_following(self, offset):
     data = {
         'method':
         'next',
         'params':
         '{"offset":%s,"order_by":"created","hash_id":"%s"}' %
         (offset, self.data['_id']),
         '_xsrf':
         'copy this value from a logged-in page',  # placeholder: paste the real _xsrf token here
     }
     html = self.post('http://www.zhihu.com/node/ProfileFolloweesListV2',
                      data=data)
     data = json.loads(html)
     rst = []
     for item in data['msg']:
         soup = BeautifulSoup(item)
         a = soup.find('a')
         rst.append({
             'selfuid': a.get('href', '')[8:],
             '_id': soup.find('button')['data-id'],
             'nickname': a.get('title', '')
         })
     return rst
Example #13
 def _run(self, offset):
     result = []
     data = {
         'method':
         'next',
         'params':
         '{"topic_id":%s,"offset":%s,"hash_id":""}' %
         (self.topic_id, offset),
         '_xsrf':
         '690bc39a80e99e2089d2e0cb0f504e0d'
     }
     resp = self.post(TOPIC_URL, data)
     msgs = loads(resp)['msg']
     for msg in msgs:
         soup = BeautifulSoup(msg)
         a = soup.find('a')
         link = a['href'].strip()
         result.append({
             '_id': link[link.rfind('/') + 1:],
             'link': ZHIHU_URL + link,
             'topic': a.getText().strip(),
             'icon': a.find('img')['src'].strip(),
         })
     return result
Example #14
 def jian(self, con2):
     # logging.error("call jian")
     soup2 = BeautifulSoup(con2["value"])
     list2 = soup2.find_all("a", attrs={"class": "movie-box"})
     for la2 in list2:  # each title on the page
         link2 = la2.find("span")
         # list3 = link2.find_all("date")
         #savecontent("\n" + link2.contents[0])  # title
         #savecontent("\n" + list3[0].get_text())  # product code
         #savecontent("\n" + list3[1].get_text())  # release date
         link3 = la2["href"]
         # logging.error("jian get " + link3)
         con3 = get_and_sleep(link3)
         # logging.error("jian got " + link3)
         soup3 = BeautifulSoup(con3)
         title = soup3.find_all("div", attrs={"class":
                                              "container"})[1].find("h3")
         movie = soup3.find("div", attrs={
             "class": "col-md-3 info"
         }).find_all("p")
         #m1 = movie.contents[0]
         #print type(m1)
         #print m1.get_text()
         # duration = movie.get_text()
         # print duration[2]
         # logging.error("jian get " + title.get_text())
         # logging.error("jian get " + movie[0].get_text())
         # logging.error("jian get " + movie[1].get_text())
         # logging.error("jian get " + movie[2].get_text())
         # logging.error("call savecontent")
         # "演员" is the Chinese label "actress" written in front of the name
         self.fp.write("\n" + link2.contents[0] + "\n" +
                       movie[0].get_text() + "\n" + movie[1].get_text() +
                       "\n" + movie[2].get_text() + "\n演员:" + con2["actor"])
         #savecontent("\n" + link3)  # detail page link
     #flush()
     self.fp.flush()
     next = soup2.find("a", attrs={"name": "nextpage"})
     if next is not None:
         nextlink = next["href"]
         # logging.error("jian next=" + nextlink)
         con2["value"] = "https://avmo.pw" + nextlink
         #item = {"type":"2","value":"https://avmo.pw" + nextlink, "actor":con2["actor"]}
         self.url_queue.put(con2)
Example #15
    def run(self):
        base_url = TOPIC_URL.format(tid=self.topic_id)
        for page in xrange(1, 10):
            url = base_url + str(page)
            logger.info('crawling topic %s, page %s' % (self.topic_id, page))

            try:
                html = self.get(url)
                soup = BeautifulSoup(html)
            except:
                return

            for answer in self._run(soup):
                answer['topics'].append(self.topic_id)
                answer['updateTime'] = now()
                result = self.save(answer)
                if result < 0:
                    return
Example #16
    def _run(self, question_id, offset):
        data = {
            'method':
            'next',
            'params':
            '{"url_token":%s,"pagesize":20,"offset":%s}' %
            (question_id, offset),
            '_xsrf':
            self.xsrf,
        }
        rst_json = json.loads(
            self.post(QUESTION_XHR_URL, data=data, timeout=60))

        rst = []
        for index, item in enumerate(rst_json['msg']):
            sleep(1)
            soup = BeautifulSoup(item)
            answer = AnswerParser(
                self, soup).parse_imgs()  # self.build_answer_item(soup)
            if not answer:
                continue
            AnswerParser.save(answer)
            rst.append(answer['_id'])
        return rst, len(rst_json['msg']) == 20
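The second return value acts as a has_more flag: a full page of 20 entries means the caller (the run() loop in Example #11) should try the next offset, and it stops on the first short page. A generic sketch of that paging idiom, where fetch_batch() is a hypothetical stand-in for the POST-and-parse step:

def fetch_batch(offset, total=45, page_size=20):
    # Hypothetical data source: pretend the server holds `total` items.
    return list(range(offset, min(offset + page_size, total)))

collected = []
index = 0
while True:
    batch = fetch_batch(index * 20)
    collected.extend(batch)
    if len(batch) < 20:   # a short page means there is nothing left to fetch
        break
    index += 1
print(len(collected))     # -> 45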
Example #17
    # ...tail of save_one_page_movie(), shown in full in Example #1
    if next is not None:
        nextlink = next["href"]
        return nextlink

if __name__ == '__main__':
    session = requests.session()
    logging.basicConfig(
        filename='server.log',
        level=logging.INFO,
        format='%(asctime)s,%(levelname)s,%(filename)s:%(lineno)d,%(threadName)s:%(message)s',
        datefmt='[/%Y/%m%d-%H:%M:%S]')
    try:
        for page in range(1, 200):  # upper bound is exclusive
            #print page
            con1 = get_and_sleep('https://avmo.pw/cn/actresses/page/' + str(page))
            #savecontent("test"+str(page)+".html",con1)
            # pull actress names and links out of the listing page
            soup1 = BeautifulSoup(con1)
            list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
            for la in list:  # each actress
                savecontent("test" + str(page) + "a.html", "\n@@@" + la.find("span").get_text())  # actress name, @@@ marks the start of a record
                link = la["href"]  #la.get("href")
                # first page of her titles
                nextlink = save_one_page_movie(link)
                # follow the pagination until there is no next page
                while nextlink is not None:
                    nextlink = save_one_page_movie("https://avmo.pw" + nextlink)
                #break #break one actor
    except:
        logging.exception("exception")
Example #18
if __name__ == '__main__':
    session = requests.session()
    logging.basicConfig(
        filename='server.log',
        level=logging.INFO,
        format=
        '%(asctime)s,%(levelname)s,%(filename)s:%(lineno)d,%(threadName)s:%(message)s',
        datefmt='[/%Y/%m%d-%H:%M:%S]')
    try:
        for page in range(1, 200):  # upper bound is exclusive
            #print page
            con1 = get_and_sleep('https://avmo.pw/cn/actresses/page/' +
                                 str(page))
            #savecontent("test"+str(page)+".html",con1)
            # pull actress names and links out of the listing page
            soup1 = BeautifulSoup(con1)
            list = soup1.find_all("a",
                                  attrs={"class": "avatar-box text-center"})
            for la in list:  # each actress
                savecontent("test" + str(page) + "a.html",
                            "\n@@@" + la.find("span").get_text())  # actress name, @@@ marks the start of a record
                link = la["href"]  #la.get("href")
                # first page of her titles
                nextlink = save_one_page_movie(link)
                # follow the pagination until there is no next page
                while nextlink is not None:
                    nextlink = save_one_page_movie("https://avmo.pw" +
                                                   nextlink)
                #break #break one actor
    except:
        logging.exception("exception")