def save_data_to_mongodb(self, request):
    # Create the category table; the data is read out of the uploaded Excel file.
    repo_id = request.POST['repo_id']
    # create_id = request.POST['create_id']
    file_id = request.POST['file_id']
    try:
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
    except Exception:
        return self.error("mongodb没有数据库或者表")
    try:
        ret_file_data = TDataAcquisitionLog.objects.get(id=file_id)
    except Exception:
        return self.error("id没有对应文件")
    ret_file_data_dict = model_to_dict(ret_file_data)
    file_name = ret_file_data_dict['data_source']
    path_str = ret_file_data_dict['data_access']
    try:
        data = xlrd.open_workbook(path_str + file_name)
    except Exception:
        return self.error("没有找到对应文件")
    table_name = data.sheet_names()[0]
    table = data.sheet_by_name(table_name)
    list_attribute = list(table.row_values(0))
    row = table.nrows
    col = table.ncols
    for i in range(1, row):
        dict_data = {}
        for j in range(0, col):
            dict_data[list_attribute[j]] = table.row_values(i)[j]
        dict_data['file_id'] = file_id
        news_col.insert_one(dict_data)
    ret_l = {'context': 'success'}
    return render(request, 'test1.html', context=ret_l)
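
# A standalone sketch of the row-to-document mapping used in the view above
# (assumption: "demo.xls" and the helper name are illustrative, not part of the project).
# The first sheet's header row becomes the field names; every following row becomes one
# MongoDB document tagged with the file id.
import xlrd

def excel_rows_to_documents(path, file_id):
    book = xlrd.open_workbook(path)
    sheet = book.sheet_by_name(book.sheet_names()[0])
    attributes = list(sheet.row_values(0))              # header row -> field names
    documents = []
    for i in range(1, sheet.nrows):                      # one document per data row
        doc = {attributes[j]: sheet.row_values(i)[j] for j in range(sheet.ncols)}
        doc['file_id'] = file_id                          # tag every document with its source file
        documents.append(doc)
    return documents

# documents = excel_rows_to_documents("demo.xls", file_id="42")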
class BaikeSpider(Driver):
    urls = []
    # tags = ["电影", "演员", "导演", "编剧", "制片人"]
    count = 0

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, proxy_ip_from="", spider_id='2'):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        # self.baike_col = Mongodb(db='movies1', collection="baike_member").get_collection()
        self.baike_col = Mongodb(db='baike', collection="test1").get_collection()

    def get_infos(self, url="", extensive_properties=None):
        if extensive_properties is None:
            extensive_properties = {}
        self.fast_new_page(url=url)
        relationship_urls = []
        relationship_tags = []
        if self.judge_web_element_exist_by_css_selector(
                css_selector="div.polysemantList-header-title > div.toggle.expand"):
            synonym = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.polysemantList-header-title > div.toggle.expand > a")
            self.scroll_to_center(synonym)
            synonym.click()
            member_urls = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul.polysemantList-wrapper.cmn-clearfix > li.item > a")
            for item in member_urls:
                # for tag in self.tags:
                #     if tag in item.text:
                relationship_urls.append(item.get_attribute("href"))
                relationship_tags.append(item.text)
                # break
            if self.driver.current_url not in self.urls:
                data = self.get_base_info_from_baike()
                if data is not None:
                    current_tag = self.until_presence_of_element_located_by_css_selector(
                        css_selector="ul.polysemantList-wrapper.cmn-clearfix > li.item > span.selected")
                    data.setdefault("tag", current_tag.text)
                    data.update(extensive_properties)
                    print(data)
                    self.baike_col.insert_one(data)
                    self.urls.append(self.driver.current_url)
        self.close_curr_page()
        for item in relationship_urls:
            if item not in self.urls:
                self.fast_new_page(url=item)
                data = self.get_base_info_from_baike()
                if data is not None:
                    data.setdefault("tag", relationship_tags[relationship_urls.index(item)])
                    data.update(extensive_properties)
                    print(data)
                    self.baike_col.insert_one(data)
                    self.urls.append(item)
                self.close_curr_page()
        if self.count == 10:
            return False
        return True

    def get_base_info_from_baike(self):
        try:
            if not self.judge_web_element_exist_by_css_selector(
                    css_selector="div.content > div.main-content div.basic-info.cmn-clearfix"):
                return
            basic_info_div = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.content > div.main-content div.basic-info.cmn-clearfix")
            if self.judge_web_element_exist_by_css_selector(ele=basic_info_div,
                                                            css_selector="a.toggle.toExpand"):
                btn = self.until_presence_of_element_located_by_css_selector(
                    ele=basic_info_div, css_selector="a.toggle.toExpand")
                self.scroll_to_center(btn)
                btn.click()
            basic_info_name = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dt.basicInfo-item.name", ele=basic_info_div)
            basic_info_value = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dd.basicInfo-item.value", ele=basic_info_div)
            data = {}
            for i in range(len(basic_info_name)):
                name = basic_info_name[i].text.replace(" ", "")
                value = basic_info_value[i].text
                if name == "" or value.replace(" ", "") == "":
                    continue
                data.setdefault(name, value)
            data.setdefault("url", self.driver.current_url)
            if self.judge_web_element_exist_by_css_selector(css_selector="div.lemma-summary"):
                base_infos = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.lemma-summary").text
                data.setdefault("基础信息", base_infos)
            self.count = 0
            return data
        except Exception:
            self.count += 1
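
# A minimal usage sketch (assumptions: the Driver base class and Mongodb helper are
# importable as in this project, and the Baike entry URL below is purely illustrative).
# get_infos crawls the given entry page plus its polysemant (synonym) pages; any extra
# key/value pairs are merged into every stored document.
if __name__ == "__main__":
    spider = BaikeSpider(isheadless=True, spider_id='2')
    spider.get_infos(url="https://baike.baidu.com/item/%E6%B5%81%E6%B5%AA%E5%9C%B0%E7%90%83",
                     extensive_properties={"source": "baike"})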
class MaoyanSpider(Driver):
    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, spider_id='2'):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy)
        self.boxoffice_col = Mongodb(db='knowledge', collection='text').get_collection()
        self.news_col = Mongodb(db='movies1', collection='news').get_collection()

    @staticmethod
    def find_key_from_value(dict, value):
        key_list = dict.keys()
        for key in key_list:
            if value == dict[key]:
                return key
        return None

    def get_boxoffice_infos_from_one_page(self, url="", datetime="", user_id=-1, repo_id=-1):
        """
        Fetch the Maoyan box-office snapshot for the given date.
        :param repo_id:
        :param user_id:
        :param datetime:
        :param url:
        :return: (list of box-office records, whether the caller should continue)
        """
        self.fast_new_page(url=url)
        time.sleep(1)
        if not self.judge_web_element_exist_by_css_selector(css_selector="div.dashboard-content"):
            self.close_curr_page()
            # Dashboard did not render; return an empty page so the caller can keep unpacking.
            return [], True
        theads = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector="div.dashboard-list > table.dashboard-table.table-header > thead > tr > th")[1:]
        theads = [item.text for item in theads]
        if not self.judge_web_element_exist_by_css_selector(
                css_selector="div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"):
            self.close_curr_page()
            return [], False
        boxoffice_infos = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector="div.movielist-container > div.movielist > table.dashboard-table > tbody > tr")
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        boxoffice_data_from_the_page = []
        for item in boxoffice_infos:
            one_boxoffice_data = {}
            boxoffice_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="td", ele=item)
            movie_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-name", ele=boxoffice_info[0])
            movie_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-info > span", ele=boxoffice_info[0])
            one_boxoffice_data.setdefault("日期", datetime)
            one_boxoffice_data.setdefault("电影名", movie_name.text)
            one_boxoffice_data.setdefault("上映时间", movie_info[0].text)
            one_boxoffice_data.setdefault("总票房", movie_info[1].text)
            boxoffice_info = boxoffice_info[1:]
            for i in range(len(boxoffice_info)):
                one_boxoffice_data.setdefault(theads[i], boxoffice_info[i].text)
            one_boxoffice_data.setdefault("crawl_time", crawl_time)
            one_boxoffice_data.setdefault("crawl_from", "猫眼专业版")
            # self.piaofang_col.insert_one(one_piaofang_data)
            judge_result = self.judge_data_exist_by_keys(
                collection=self.boxoffice_col,
                keys={
                    "user_id": user_id,
                    "repo_id": repo_id,
                    "value.日期": one_boxoffice_data["日期"],
                    "value.电影名": one_boxoffice_data["电影名"],
                    "value.crawl_from": one_boxoffice_data["crawl_from"]
                })
            if judge_result is True:
                boxoffice_data_from_the_page.append(one_boxoffice_data)
            else:
                return boxoffice_data_from_the_page, False
        self.close_curr_page()
        return boxoffice_data_from_the_page, True

    def get_boxoffice_infos(self, spider_id, user_id, repo_id, spider_name):
        date = datetime.datetime.strptime("2020-01-23", '%Y-%m-%d')
        # date = datetime.datetime.now()
        final_result = []
        while True:
            data_list, result = self.get_boxoffice_infos_from_one_page(
                url="http://piaofang.maoyan.com/dashboard/movie?date=" + str(date)[:10],
                datetime=str(date)[:10],
                user_id=int(user_id),
                repo_id=int(repo_id))
            final_result.extend(data_list)
            if result is False:
                break
            date = date + datetime.timedelta(days=-1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            self.boxoffice_col.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })

    def run_spider(self, url=""):
        latest_info = self.boxoffice_col.find().sort("datetime", -1).limit(1)
        date = datetime.datetime.strptime(latest_info[0]["datetime"], '%Y-%m-%d')
        date = date + datetime.timedelta(days=1)
        now = datetime.datetime.now()
        while date < now:
            self.get_boxoffice_infos_from_one_page(
                "http://piaofang.maoyan.com/dashboard/movie?date=" + str(date)[:10],
                str(date)[:10])
            date = date + datetime.timedelta(days=1)
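
# A minimal usage sketch (assumption: the spider_id/user_id/repo_id values below are
# placeholders). get_boxoffice_infos walks backwards day by day from its hard-coded start
# date and stops at the first page whose records already exist for this user/repo.
if __name__ == "__main__":
    spider = MaoyanSpider(isheadless=True, spider_id='2')
    spider.get_boxoffice_infos(spider_id='2', user_id=1, repo_id=1, spider_name="maoyan_boxoffice")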
class DoubanSpider(Driver):
    # Set of Douban URLs of film professionals already crawled, used to filter out duplicates.
    member_set = set()

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, proxy_ip_from="", spider_id='2',
                 data_queue=None):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        self.movie_col = Mongodb(db='knowledge', collection='text').get_collection()
        # self.member_col = Mongodb(db='movies', collection='member').get_collection()
        # self.comment_col = Mongodb(db='movies', collection="comments").get_collection()

    def get_member_info(self, url=""):
        """
        Fetch the detailed personal information of one film professional.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        if "条目不存在" in self.driver.title or "页面不存在" in self.driver.title:
            self.close_curr_page()
            return None
        name = self.driver.title[:-4].strip()
        member_data = {}
        member_data.setdefault("member_name", name)
        member_data.setdefault("douban_url", url)
        member_div_infos = self.until_presence_of_all_elements_located_by_css_selector("div.info > ul > li")
        for item in member_div_infos:
            item = item.text.split(":")
            key = item[0].strip()
            if len(item) > 2:
                value = ":".join(item[1:])
            else:
                value = item[1]
            if key == "性别" or key == "星座" or key == "出生日期" or key == "出生地" or key == "官方网站":
                member_data.setdefault(key, value.strip())
            else:
                member_data.setdefault(key, [item.strip() for item in value.split("/")])
        self.close_curr_page()
        return member_data
        # self.member_col.insert_one(member_data)
        # self.info_log(data="取得个人资料数据----" + member_data["member_name"])
        # return True

    def get_member_awards(self, url=""):
        """
        Fetch all awards a film professional has ever won.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        awards_div = self.until_presence_of_element_located_by_css_selector("div.grid-16-8.clearfix > div.article")
        result = []
        try:
            awards_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="div.awards", ele=awards_div, timeout=5)
        except Exception:
            self.close_curr_page()
            return result
        for temp in awards_info:
            awards_time = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.hd > h2", ele=temp)
            awards = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul.award", ele=temp)
            for award in awards:
                data = {}
                award_info = self.until_presence_of_all_elements_located_by_css_selector(
                    css_selector="li", ele=award)
                data.setdefault("time", awards_time.text)
                data.setdefault("award_from", award_info[0].text)
                data.setdefault("award", award_info[1].text)
                data.setdefault("relevant_movie", award_info[2].text)
                result.append(data)
        self.close_curr_page()
        return result

    def get_member_movies(self, url=""):
        """
        Fetch the list of all movies a film professional has taken part in.
        :param url:
        :return:
        """
        movies = []
        self.fast_new_page(url=url)
        while True:
            movies_a = self.until_presence_of_all_elements_located_by_css_selector(
                "div.article > div.grid_view > ul > li > dl > dd > h6 > a")
            for temp in movies_a:
                movies.append(temp.text)
            try:
                self.vertical_scroll_to()
                next_page = self.until_presence_of_element_located_by_css_selector(
                    "div.article > div.paginator > span.next > a", timeout=5)
                next_page.click()
                time.sleep(1)
            except Exception:
                self.close_curr_page()
                return movies

    def get_comments(self, url="", movie_name="", movie_id=None):
        """
        Fetch one page (20 entries) of comments.
        :param url:
        :param movie_name:
        :param movie_id:
        :return:
        """
        self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return
        comments_list = self.until_presence_of_all_elements_located_by_css_selector(
            "div.article > div#comments.mod-bd > div.comment-item")
        if not self.judge_web_element_exist_by_css_selector(ele=comments_list[0], css_selector="div.comment"):
            self.close_curr_page()
            return
        for temp in comments_list:
            self.scroll_to_center(temp)
            data = {}
            commenter_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > h3 > span.comment-info > a", ele=temp)
            commenter_useful = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > h3 > span.comment-vote > span.votes", ele=temp)
            comment_content = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > p > span.short", ele=temp)
            comment_time = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.comment > h3 > span.comment-info > span.comment-time", ele=temp)
            data.setdefault("movie_name", movie_name)
            data.setdefault("nickname", commenter_name.text)
            data.setdefault("useful", commenter_useful.text)
            data.setdefault("time", comment_time.text)
            data.setdefault("content", comment_content.text)
            data.setdefault("comment_from", "douban.com")
            if movie_id is not None:
                data.setdefault("movie_id", movie_id)
            if self.judge_web_element_exist_by_css_selector(
                    ele=temp, css_selector="div.comment > h3 > span.comment-info > span.rating"):
                commenter_evaluate = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.comment > h3 > span.comment-info > span.rating", ele=temp)
                data.setdefault("evaluate", commenter_evaluate.get_attribute("title"))
            else:
                data.setdefault("evaluate", "")
            # self.comment_col.insert_one(data)
        self.close_curr_page()

    def get_one_movie_info(self, ele=None):
        """
        Fetch the detailed data of one movie.
        :param ele:
        :return:
        """
        self.fast_click_page_by_elem(ele=ele)
        time.sleep(1)
        # self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        try:
            actor_more = self.driver.find_element_by_css_selector(
                "div#info > span.actor > span.attrs > a.more-actor")
            actor_more.click()
            mask = 1
        except Exception:
            mask = 0
        div_info = self.until_presence_of_element_located_by_css_selector(css_selector="div#info")
        infos = div_info.text
        info_list = infos.split("\n")
        movie_info = {}
        for info in info_list:
            info = info.split(":")
            key = info[0].strip()
            if len(info) == 1 or (len(info) == 2 and info[1] == ""):
                continue
            elif len(info) > 2:
                value = ":".join(info[1:])
            else:
                value = info[1]
            if key == "官方网站":
                movie_info.setdefault(key, value.strip())
            else:
                movie_info.setdefault(key, [item.strip() for item in value.split("/")])
        # member_link = self.until_presence_of_all_elements_located_by_css_selector(css_selector="span span.attrs a",
        #                                                                           ele=div_info)
        # if mask == 1:
        #     member_link = member_link[:-1]
        # for item in member_link:
        #     item_link = item.get_attribute("href")
        #     if item_link in self.member_set:
        #         continue
        #     self.member_set.add(item_link)
        #     actor_info = {"member_name": item.text, "douban_url": item_link}
        #     self.dataQueue.put(actor_info)
        # self.close_curr_page()
        comment1 = self.until_presence_of_element_located_by_css_selector(
            "div#comments-section > div.mod-hd > h2 > span.pl > a")
        comment2 = self.until_presence_of_element_located_by_css_selector(
            "section#reviews-wrapper > header > h2 > span.pl > a")
        comment_number = int(re.findall(r'\d+', comment1.text)[0]) + int(re.findall(r'\d+', comment2.text)[0])
        movie_info.setdefault("豆瓣评论数量", comment_number)
        self.close_curr_page()
        return movie_info

    def get_movie_infos(self, spider_id, user_id, repo_id, spider_name):
        self.fast_new_page(
            url="https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0")
        self.driver.refresh()
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        # category_ul = self.until_presence_of_element_located_by_css_selector("ul.category")
        # category = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=category_ul)[5:]
        # cur = 0
        # description = category[cur].text
        # category[cur].click()
        time.sleep(1)
        css_selector = "div.list-wp a.item"
        elements_list = self.until_presence_of_all_elements_located_by_css_selector(css_selector=css_selector)
        final_result = []
        for each in elements_list:
            data = {}
            self.vertical_scroll_to()
            time.sleep(1)
            self.scroll_to_center(ele=each)
            movie_link = each.get_attribute("href")
            movie_name = self.until_presence_of_element_located_by_css_selector(
                ele=each, css_selector="div.cover-wp > img")
            movie_score = self.until_presence_of_element_located_by_css_selector(
                ele=each, css_selector="p > strong")
            data.setdefault("电影名", movie_name.get_attribute("alt"))
            data.setdefault("豆瓣评分", movie_score.text)
            crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            data.setdefault("crawl_from", movie_link)
            data.setdefault("crawl_time", crawl_time)
            movie_info = self.get_one_movie_info(ele=each)
            if movie_info is None:
                # Detail page no longer exists; skip this entry.
                continue
            movie_info.update(data)
            print(movie_info)
            final_result.append(movie_info)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            judge_result = self.judge_data_exist_by_keys(
                collection=self.movie_col,
                keys={
                    "user_id": user_id,
                    "repo_id": repo_id,
                    "value.电影名": item["电影名"],
                    "value.crawl_from": item["crawl_from"]
                })
            if judge_result is True:
                self.movie_col.insert_one({
                    "file_id": one_data_acquisition_log.id,
                    "category_id": -1,
                    "spider_id": int(spider_id),
                    "user_id": int(user_id),
                    "repo_id": int(repo_id),
                    "value": item
                })

    # def run(self):
    #     """
    #     Entry point for a single worker thread: parse the URL of each queued item and
    #     dispatch it to the crawl method that handles that kind of page.
    #     :return:
    #     """
    #     self.info_log(data="线程启动", name=self.name)
    #     count = 0
    #     while not self.dataQueue.empty() and count == 0:
    #         temp = self.dataQueue.get(False)
    #         url_path = urlparse(temp["douban_url"]).path
    #         while True:
    #             try:
    #                 if "/celebrity" in url_path:
    #                     # Fetch one film professional's detailed data.
    #                     member_info = self.get_member_info(temp["douban_url"])
    #                     if member_info is None:
    #                         print("人物数据不存在")
    #                         break
    #                     member_awards = self.get_member_awards(temp["douban_url"] + "awards")
    #                     member_movies = self.get_member_movies(temp["douban_url"] + "movies")
    #                     member_info.setdefault("awards", member_awards)
    #                     member_info.setdefault("acting_movies", member_movies)
    #                     self.member_col.insert_one(member_info)
    #                     self.info_log(data="成功获取并存储一条人物数据-----" + member_info["member_name"], name=self.threadName)
    #                 elif "/subject" in url_path and "/subject_search" not in url_path and "/comments" not in url_path:
    #                     # Fetch one movie record; after success, push its comments URL into the queue.
    #                     movie_info = self.get_movie_info(temp["douban_url"])
    #                     if movie_info is None:
    #                         print("电影数据不存在")
    #                         break
    #                     movie_info.update(temp)
    #                     self.movie_col.insert_one(movie_info)
    #                     self.info_log(data="成功获取并存储一条电影数据-----" + movie_info["movie_name"], name=self.threadName)
    #                     print(movie_info)
    #                     comments_url = temp["douban_url"] + "comments?start=0&limit=20&sort=new_score&status=P"
    #                     self.dataQueue.put({"movie_name": temp["movie_name"], "douban_url": comments_url, "movie_id": movie_info["_id"]})
    #                 elif "/subject" in url_path and "/comments" in url_path:
    #                     # Parse the URL and crawl up to 200 comments.
    #                     bits = list(urlparse(temp["douban_url"]))
    #                     qs = parse_qs(bits[4])
    #                     start = int(qs["start"][0])
    #                     while start <= 200:
    #                         qs["start"][0] = start
    #                         bits[4] = urlencode(qs, True)
    #                         temp["douban_url"] = urlunparse(bits)
    #                         self.get_comments(temp["douban_url"], temp["movie_name"], temp["movie_id"])
    #                         start += 20
    #                 count = 0
    #                 break
    #             except Exception:
    #                 # Count consecutive failures; switch proxy IP after each failure and
    #                 # stop the thread after 5 consecutive failures.
    #                 count += 1
    #                 if count > 5:
    #                     self.dataQueue.put(temp)
    #                     break
    #                 self.change_ip(self.get_ip(self.proxy_ip_from))

    @staticmethod
    def get_data_source():
        """
        Return the set of Douban URLs of film professionals already collected.
        :return:
        """
        member_col = Mongodb(db='movies', collection='member').get_collection()
        url_set = set()
        for item in member_col.find():
            url_set.add(item["douban_url"])
        return url_set
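
# A minimal usage sketch (assumption: the ids are placeholders). get_movie_infos opens the
# Douban explore page, visits each recommended movie, and stores any record not yet present
# for this user_id/repo_id into the knowledge/text collection.
if __name__ == "__main__":
    spider = DoubanSpider(isheadless=True, spider_id='2')
    spider.get_movie_infos(spider_id='2', user_id=1, repo_id=1, spider_name="douban_movies")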
class MtimeSpider(Driver):
    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False,
                 isloadimages=True, isproxy=False, spider_id='2'):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay, isheadless=isheadless,
                        isloadimages=isloadimages, isproxy=isproxy)
        self.collection = Mongodb(db='knowledge', collection='text').get_collection()

    def get_news_from_one_page(self, ele=None):
        if ele is None:
            return None
        self.fast_click_page_by_elem(ele=ele)
        # self.fast_new_page(url)
        time.sleep(1)
        if self.judge_web_element_exist_by_css_selector(css_selector="p.newsinnerpageall > span > a"):
            show_all_page_btn = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsinnerpageall > span > a")
            show_all_page_btn.click()
        try:
            news_title = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > div.newsheadtit").text
            news_time = re.findall(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newsheader > p.newstime").text)[0]
            news_source = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > p.newstime > span.ml15").text.split(":")[1]
            news_content = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsnote").get_attribute('innerHTML') + \
                self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#newsContent").get_attribute("innerHTML")
            news_author = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsediter").text.split(":")[1]
        except Exception:
            return None
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        one_news = {}
        one_news.setdefault("标题", news_title)
        one_news.setdefault("时间", news_time)
        one_news.setdefault("来源", news_source)
        one_news.setdefault("内容", news_content)
        one_news.setdefault("作者", news_author)
        one_news.setdefault("crawl_from", self.get_current_url())
        one_news.setdefault("crawl_time", crawl_time)
        self.close_curr_page()
        return one_news

    def get_news_infos(self, spider_id, user_id, repo_id, spider_name):
        url = "http://news.mtime.com/movie/1/"
        self.fast_new_page(url=url)
        time.sleep(1)
        final_result = []
        flag = 0
        while True:
            while self.judge_web_element_exist_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore"):
                more_info_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore")
                self.scroll_to_center(more_info_btn)
                more_info_btn.click()
                time.sleep(1)
            news_list = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul#newslist > li")
            for item in news_list:
                one_news = self.get_news_from_one_page(ele=item)
                if one_news is None:
                    continue
                print(one_news)
                judge_result = self.judge_data_exist_by_keys(
                    collection=self.collection,
                    keys={
                        "user_id": user_id,
                        "repo_id": repo_id,
                        "value.crawl_from": one_news["crawl_from"]
                    })
                if judge_result:
                    final_result.append(one_news)
                else:
                    flag = 1
                    break
            if flag == 1 or not self.judge_web_element_exist_by_css_selector(
                    css_selector="div#pages > a.cur + a"):
                break
            else:
                next_page_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#pages > a.cur + a")
                self.fast_click_page_by_elem(ele=next_page_btn)
                time.sleep(1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))
        for item in final_result:
            self.collection.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })
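
# A minimal usage sketch (assumption: the ids are placeholders). get_news_infos pages through
# http://news.mtime.com/movie/1/ and stops as soon as it reaches a news item whose crawl_from
# URL is already stored for this user_id/repo_id.
if __name__ == "__main__":
    spider = MtimeSpider(isheadless=True, spider_id='2')
    spider.get_news_infos(spider_id='2', user_id=1, repo_id=1, spider_name="mtime_news")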