# Module-level imports required by both variants below (Python 2).
import re
import time
import traceback

import requests
from bs4 import BeautifulSoup

import my_csv_storer


def craw_single_keyword(self, keyword):
    self.cur_key = keyword
    self.cur_key_seachcount = 0
    search_url = self._btdepot_url + "/search/" + keyword
    r = requests.get(search_url, headers=self._headers, timeout=10)
    soup = BeautifulSoup(r.content, "html.parser")
    # The result count is embedded in an inline script as "totalPages: N".
    ret = re.search(r'totalPages: \d*', r.content)
    print "match result ", ret
    if ret is None:
        # Layout changed or the request was blocked; dump the body and give up.
        print r.content
        return
    totalPages = int(ret.group(0).split(':')[1].strip())
    csv_name = self._csv_dir + "/breadsearch_" + time.strftime("%Y%m%d") + "_" + self.pid + ".csv"
    storer = my_csv_storer.my_csv_storer(csv_name)
    print 'totalPages:', totalPages
    for page in range(1, totalPages + 1):
        try:
            search_url = self._btdepot_url + "/search/" + keyword + "/" + str(page)
            r = requests.get(search_url, headers=self._headers, timeout=10)
            soup = BeautifulSoup(r.content, "html.parser")
            item_list = soup.find_all("div", class_="search-item")
            for i in range(len(item_list)):
                temp = item_list[i].find_all('a')[0]["href"]
                info_url = self._btdepot_url + temp
                # Keep the detail response in its own variable so r.cookies
                # still holds the search page's cookies on later iterations.
                r_info = requests.get(info_url, headers=self._headers1, cookies=r.cookies, timeout=10)
                child_soup = BeautifulSoup(r_info.content, "html.parser")
                magnet_url = child_soup.find_all('ul', class_="prop-list")[0].find_all('a')[0]['href']
                print magnet_url
                # "magnet:?xt=urn:btih:" is 20 chars, so [20:60] is the 40-char info hash.
                hash_info = magnet_url[20:60]
                title = child_soup.find_all('h1', class_='detail-title')[0].string
                print "==> title", title
                detailfiles = child_soup.find_all('ul', class_="file-list")[0].find_all('li')
                files = ['-'.join(e.find_all('span')[0].strings) + " " + e.find_all('span')[1].string
                         for e in detailfiles]
                content = title + "\n" + "\n".join(files)
                storer.store(unicode(hash_info).encode('utf8'),
                             unicode(content).encode('utf8'),
                             unicode(magnet_url).encode('utf8'))
                self.cur_key_seachcount += 1
        except Exception as e:
            print "found exception", e
            traceback.print_exc()
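# my_csv_storer is not shown in this file. Both variants only depend on the
# constructor my_csv_storer(csv_name) and on storer.store(hash, content, magnet),
# so a minimal sketch of my_csv_storer.py under that assumption could look like
# the following; the row layout and append-mode choice are assumptions here,
# not the original implementation.
import csv

class my_csv_storer(object):
    def __init__(self, csv_name):
        # Append mode so repeated runs on the same day extend one daily file.
        self._fp = open(csv_name, "ab")
        self._writer = csv.writer(self._fp)

    def store(self, hash_info, content, magnet_url):
        # One row per torrent: info hash, title plus file list, magnet link.
        # csv's quoting keeps the embedded newlines in content intact.
        self._writer.writerow([hash_info, content, magnet_url])
        self._fp.flush()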
# Second variant of the same method, matching a redesigned page layout
# (item_container rows, metadata spans, magnet link in a <textarea>). The
# first response is kept in r0 so its cookies survive the per-item requests.
def craw_single_keyword(self, keyword):
    self.cur_key = keyword
    self.cur_key_seachcount = 0
    search_url = self._btdepot_url + "/search/" + keyword
    r0 = requests.get(search_url, headers=self._headers, timeout=10)
    soup = BeautifulSoup(r0.content, "html.parser")
    ret = re.search(r"totalPages: \d*", r0.content)
    print "match result ", ret
    if ret is None:
        # Layout changed or the request was blocked; dump the body and give up.
        print r0.content
        return
    totalPages = int(ret.group(0).split(":")[1].strip())
    csv_name = self._csv_dir + "/btdepot_" + time.strftime("%Y%m%d") + "_" + self.pid + ".csv"
    storer = my_csv_storer.my_csv_storer(csv_name)
    print "totalPages:", totalPages
    time.sleep(0.5)
    for page in range(1, totalPages + 1):
        try:
            search_url = self._btdepot_url + "/search/" + keyword + "/" + str(page)
            r = requests.get(search_url, headers=self._headers, timeout=10)
            soup = BeautifulSoup(r.content, "html.parser")
            item_list = soup.find_all("div", class_="item_container")
            time.sleep(0.5)
            for i in range(len(item_list)):
                if i == 0:
                    continue  # the first item_container is presumably a header row, not a result
                temp = item_list[i].a["href"]
                info_url = self._btdepot_url + temp
                r = requests.get(info_url, headers=self._headers1, cookies=r0.cookies, timeout=10)
                child_soup = BeautifulSoup(r.content, "html.parser")
                magnet_url = child_soup.find_all("textarea")[0].string
                print magnet_url
                # Metadata spans; size, file_count and index_date are parsed but unused.
                size = child_soup.find_all("span", string="Size: ")[0].next_sibling.string
                file_count = child_soup.find_all("span", string="Files: ")[0].next_sibling.string
                index_date = child_soup.find_all("span", string="Index Date: ")[0].next_sibling.string
                hash_info = child_soup.find_all("span", string="Hash: ")[0].next_sibling.string
                title = child_soup.find_all("h1", class_="torrent_title")[0].string
                # The file list lives in the one div styled "margin-bottom: 50px;".
                files = []
                for d in child_soup.find_all("div"):
                    if d.has_attr("style") and d["style"] == "margin-bottom: 50px;":
                        fnn = d.find_all("div")
                        files = ["-".join(e.find_all("span")[0].strings) + " " + e.find_all("span")[1].string
                                 for e in fnn]
                content = title + "\n" + "\n".join(files)
                storer.store(unicode(hash_info).encode("utf8"),
                             unicode(content).encode("utf8"),
                             unicode(magnet_url).encode("utf8"))
                self.cur_key_seachcount += 1
        except Exception as e:
            print "found exception", e
            traceback.print_exc()
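# Hypothetical driver showing how craw_single_keyword would be invoked. The
# BtDepotCrawler class name and the keyword list are illustrative assumptions;
# only craw_single_keyword and the attributes it reads (_btdepot_url, _headers,
# _headers1, _csv_dir, pid, cur_key_seachcount) come from the code above.
if __name__ == "__main__":
    crawler = BtDepotCrawler()  # assumed class carrying the method and attributes
    for kw in ["ubuntu", "debian"]:
        crawler.craw_single_keyword(kw)
        print "keyword", kw, "->", crawler.cur_key_seachcount, "results stored"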