예제 #1
0
파일: py2-av.py 프로젝트: nkzxw/repo
def save_one_page_movie(link):
	"""Scrape one listing page and append every movie's fields to the dump file.

	The page at ``link`` is fetched with ``get_and_sleep`` and parsed with
	``BeautifulSoup``. For each ``<a>`` whose class is ``movie-box`` the
	following values are written, one ``savecontent`` call per value and each
	prefixed with a newline:

	1. the first child of the anchor's ``<span>`` (title, per the original notes)
	2. the text of the span's first ``<date>`` (catalogue code)
	3. the text of the span's second ``<date>`` (release date)
	4. the text of the third ``<p>`` inside the detail page's
	   ``<div class="col-md-3 info">`` (duration)
	5. the detail page URL taken from the anchor's ``href``

	The target file name is built from the global ``page``.

	Returns the ``href`` of the ``<a name="nextpage">`` anchor, or ``None``
	when the listing has no such anchor.
	"""
	def emit(text):
		# The file name is rebuilt on every write so it always follows ``page``.
		savecontent("test" + str(page) + "a.html", "\n" + text)

	listing = BeautifulSoup(get_and_sleep(link))

	for box in listing.find_all("a", attrs={"class": "movie-box"}):
		caption = box.find("span")
		dates = caption.find_all("date")
		emit(caption.contents[0])  # title
		emit(dates[0].get_text())  # catalogue code
		emit(dates[1].get_text())  # release date

		# The duration is only available on the movie's own page.
		detail_url = box["href"]
		detail = BeautifulSoup(get_and_sleep(detail_url))
		info = detail.find("div", attrs={"class": "col-md-3 info"})
		emit(info.find_all("p")[2].get_text())  # duration
		emit(detail_url)  # link

	next_anchor = listing.find("a", attrs={"name": "nextpage"})
	if next_anchor is None:
		return None
	return next_anchor["href"]
예제 #2
0
def save_one_page_movie(link):
    """Dump the movies listed on one page and report where the next page is.

    ``link`` is downloaded through ``get_and_sleep`` and parsed with
    ``BeautifulSoup``. Every ``<a>`` with class ``movie-box`` yields five
    ``savecontent`` writes (each value prefixed with ``"\\n"``): the first
    child of its ``<span>``, the text of that span's first two ``<date>``
    tags, the text of the third ``<p>`` in the detail page's
    ``<div class="col-md-3 info">``, and the detail page URL.

    The output file name depends on the global ``page``.

    Returns the ``href`` of the ``<a name="nextpage">`` anchor if the page
    has one; otherwise returns ``None``.
    """
    def out_name():
        # Evaluated at each write, exactly when the file name is needed.
        return "test" + str(page) + "a.html"

    page_soup = BeautifulSoup(get_and_sleep(link))
    movie_boxes = page_soup.find_all("a", attrs={"class": "movie-box"})

    for movie_box in movie_boxes:
        span = movie_box.find("span")
        date_tags = span.find_all("date")
        savecontent(out_name(), "\n" + span.contents[0])  # title
        savecontent(out_name(), "\n" + date_tags[0].get_text())  # code
        savecontent(out_name(), "\n" + date_tags[1].get_text())  # date

        # Follow the anchor to the movie's own page to read its duration.
        movie_url = movie_box["href"]
        movie_soup = BeautifulSoup(get_and_sleep(movie_url))
        info_div = movie_soup.find("div", attrs={"class": "col-md-3 info"})
        paragraphs = info_div.find_all("p")
        length_text = paragraphs[2].get_text()
        savecontent(out_name(), "\n" + length_text)  # duration
        savecontent(out_name(), "\n" + movie_url)  # link

    pager = page_soup.find("a", attrs={"name": "nextpage"})
    return pager["href"] if pager is not None else None
예제 #3
0
 def jian(self, con2):
     """Record every movie on one listing page and queue the following page.

     ``con2`` is a dict from which this method reads ``con2["value"]`` (the
     markup handed to ``BeautifulSoup``) and ``con2["actor"]``.

     For each ``<a>`` with class ``movie-box`` the linked detail page is
     fetched with ``get_and_sleep`` and one record is written to ``self.fp``:
     the first child of the anchor's ``<span>``, the text of the first three
     ``<p>`` tags inside the detail page's ``<div class="col-md-3 info">``,
     and the actor name. ``self.fp`` is flushed once after the loop.

     If the listing contains an ``<a name="nextpage">`` anchor,
     ``con2["value"]`` is overwritten with ``"https://avmo.pw"`` plus that
     anchor's ``href`` and the same dict is put back on ``self.url_queue``.

     Returns ``None``.
     """
     listing = BeautifulSoup(con2["value"])

     for box in listing.find_all("a", attrs={"class": "movie-box"}):
         caption = box.find("span")

         detail = BeautifulSoup(get_and_sleep(box["href"]))
         # Only the info paragraphs are read from the detail page. The page's
         # <h3> heading is deliberately not looked up: nothing is written from
         # it, and indexing the second "container" div could fail on pages
         # that have fewer than two of them.
         info = detail.find("div", attrs={
             "class": "col-md-3 info"
         }).find_all("p")

         self.fp.write("\n" + caption.contents[0] +
                       "\n" + info[0].get_text() +
                       "\n" + info[1].get_text() +
                       "\n" + info[2].get_text() +
                       "\n演员:" + con2["actor"])

     # Flush once per listing page so finished pages reach the file.
     self.fp.flush()

     next_anchor = listing.find("a", attrs={"name": "nextpage"})
     if next_anchor is not None:
         # Reuse the incoming dict so its other keys travel with the new URL.
         con2["value"] = "https://avmo.pw" + next_anchor["href"]
         self.url_queue.put(con2)
예제 #4
0
 def _find_following(self, offset):
     """Fetch one batch of followee entries starting at ``offset``.

     Sends a POST through ``self.post`` to the ProfileFolloweesListV2
     endpoint, decodes the reply with ``json.loads`` and parses every entry
     of its ``msg`` list with ``BeautifulSoup``.

     Returns a list of dicts, one per entry, with the keys:

     * ``selfuid``  - the first ``<a>``'s ``href`` (default ``''``) with its
       first 8 characters dropped
     * ``_id``      - the ``data-id`` attribute of the first ``<button>``
     * ``nickname`` - the first ``<a>``'s ``title`` (default ``''``)
     """
     payload = {
         'method': 'next',
         'params': '{"offset":%s,"order_by":"created","hash_id":"%s"}' % (
             offset, self.data['_id']),
         # NOTE(review): this value is a placeholder (the Chinese text says
         # "copy this value from the page after logging in") - a real token
         # presumably has to be substituted before the request can succeed.
         '_xsrf': '在登录之后的网页上复制出这个值',
     }
     raw = self.post('http://www.zhihu.com/node/ProfileFolloweesListV2',
                     data=payload)
     reply = json.loads(raw)

     def to_record(fragment):
         # Each ``msg`` entry is parsed on its own as a markup fragment.
         soup = BeautifulSoup(fragment)
         anchor = soup.find('a')
         return {
             'selfuid': anchor.get('href', '')[8:],
             '_id': soup.find('button')['data-id'],
             'nickname': anchor.get('title', ''),
         }

     return [to_record(fragment) for fragment in reply['msg']]
예제 #5
0
 def _run(self, offset):
     """Fetch one batch of topic entries starting at ``offset``.

     POSTs ``self.topic_id`` and ``offset`` to ``TOPIC_URL`` via
     ``self.post``, decodes the reply with ``loads`` (presumably JSON -
     confirm against the file's imports) and parses every entry of its
     ``msg`` list with ``BeautifulSoup``.

     Returns a list of dicts with the keys ``_id`` (the part of the first
     anchor's stripped ``href`` after its last ``/``), ``link``
     (``ZHIHU_URL`` + that href), ``topic`` (the anchor's stripped text) and
     ``icon`` (the stripped ``src`` of the ``<img>`` inside the anchor).
     """
     payload = {
         'method': 'next',
         'params': '{"topic_id":%s,"offset":%s,"hash_id":""}' % (self.topic_id, offset),
         '_xsrf': '690bc39a80e99e2089d2e0cb0f504e0d'
     }
     fragments = loads(self.post(TOPIC_URL, payload))['msg']

     topics = []
     for fragment in fragments:
         anchor = BeautifulSoup(fragment).find('a')
         href = anchor['href'].strip()
         topics.append({
             # rpartition yields the whole href when it contains no '/'.
             '_id': href.rpartition('/')[2],
             'link': ZHIHU_URL + href,
             'topic': anchor.getText().strip(),
             'icon': anchor.find('img')['src'].strip(),
         })
     return topics
예제 #6
0
 def jia(self, con1):
     """Queue every actor on one listing page, then queue the next page.

     ``con1["value"]`` is parsed with ``BeautifulSoup``. For each ``<a>``
     whose class is ``avatar-box text-center`` an item
     ``{"type": "2", "value": <href>, "actor": <span text>}`` is put on
     ``self.url_queue``.

     If the page has an ``<a name="nextpage">`` anchor, a further item
     ``{"type": "1", "value": "https://avmo.pw" + <href>}`` is queued.

     Returns ``None``.
     """
     # Example listing URL: https://avmo.pw/cn/actresses//page/2
     listing = BeautifulSoup(con1["value"])
     avatars = listing.find_all("a", attrs={"class": "avatar-box text-center"})

     for avatar in avatars:  # one anchor per actor
         actor_name = avatar.find("span").get_text()
         actor_url = avatar["href"]
         self.url_queue.put({
             "type": "2",
             "value": actor_url,
             "actor": actor_name,
         })

     # Next page, if any.
     pager = listing.find("a", attrs={"name": "nextpage"})
     if pager is None:
         return
     self.url_queue.put({
         "type": "1",
         "value": "https://avmo.pw" + pager["href"],
     })
예제 #7
0
 def _run(self, offset):
     """Request the topic entries at ``offset`` and turn them into dicts.

     A POST carrying ``self.topic_id`` and ``offset`` is sent to
     ``TOPIC_URL`` through ``self.post``. The reply is decoded with
     ``loads`` (presumably JSON - confirm against the file's imports) and
     each element of its ``msg`` list is parsed with ``BeautifulSoup``.

     Returns a list with one dict per element:

     * ``_id``   - text after the last ``/`` of the first anchor's href
     * ``link``  - ``ZHIHU_URL`` followed by that (stripped) href
     * ``topic`` - stripped text of the anchor
     * ``icon``  - stripped ``src`` of the ``<img>`` inside the anchor
     """
     def parse(fragment):
         # Everything of interest lives on the fragment's first <a> tag.
         anchor = BeautifulSoup(fragment).find('a')
         href = anchor['href'].strip()
         return {
             # With no '/' present, rsplit leaves the href whole.
             '_id': href.rsplit('/', 1)[-1],
             'link': ZHIHU_URL + href,
             'topic': anchor.getText().strip(),
             'icon': anchor.find('img')['src'].strip(),
         }

     form = {
         'method': 'next',
         'params': '{"topic_id":%s,"offset":%s,"hash_id":""}' % (
             self.topic_id, offset),
         '_xsrf': '690bc39a80e99e2089d2e0cb0f504e0d'
     }
     reply = loads(self.post(TOPIC_URL, form))
     return [parse(fragment) for fragment in reply['msg']]