class TwitterDownloader:
    def __init__(self, path: PathGenerator, proxies: dict = None, retry=RETRY,
                 logger=None, session: Session = None):
        self.path = path
        self.logger = logger if logger is not None else Log.create_logger('TwitterSpider', './twitter.log')
        self.session = Session(proxies=proxies, retry=retry) if session is None else session

    def _get(self, url):
        r = self.session.get(url=url)
        return r.content

    def _save(self, content, path):
        if os.path.exists(path):
            self.logger.warning('File %s exists.', path)
            return False
        with open(path, 'wb') as f:
            f.write(content)
        return True

    def download(self, tweet: Tweet):
        user = tweet.user
        for media in tweet.medias:
            # def path(self, file_name, media_type, media_id, media_url, user_id, user_name, screen_name)
            path = self.path.path(file_name=media.file_name,
                                  media_type=media.type,
                                  media_id=media.id,
                                  media_url=media.url,
                                  user_id=user.id,
                                  user_name=user.name,
                                  screen_name=user.nickname)
            self._save(self._get(media.url), path)
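# Usage sketch (illustrative only, not part of the original source). It assumes
# TwitterSpider and StoreByUserName are importable from this package, that
# StoreByUserName implements the PathGenerator.path() signature shown above,
# and that the bearer token and screen name are supplied by the caller.
def _demo_download_timeline(token, screen_name='example_user'):
    spider = TwitterSpider(token)
    downloader = TwitterDownloader(path=StoreByUserName('./download'))
    # Walk the account's recent tweets and save every attached media file.
    for tweet in spider.crawl_timeline(screen_name=screen_name):
        downloader.download(tweet)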
def __init__(self, path: PathGenerator = None, proxies: dict = None, retry=RETRY,
             logger=None, session: Session = None):
    if path is None:
        self.path = StoreByUserName('./download')
    elif type(path) is str:
        self.path = StoreByUserName(path)
    else:
        self.path = path
    self.logger = Log.create_logger('TwitterSpider', './twitter.log') if logger is None else logger
    self.session = Session(proxies=proxies, retry=retry) if session is None else session
class DoubanSpider:
    def __init__(self, session: Session = None):
        self.session = Session(retry=5, timeout=10) if session is None else session
        self.ENABLE_BROTLI = version.parse(urllib3.__version__) < version.parse('1.25.1')

    def list(self, tags: List[str] = None, sort: str = 'U', start: int = 0, limit: int = 100000):
        """
        Return the list of movie detail-page URLs.

        :param sort: U - hot recently (近期热门), T - most marked (标记最多),
                     S - highest rated (评分最高), R - latest released (最新上映)
        :param tags: all the tags shown on the page
        :param start: start offset
        :param limit: stop once the offset reaches this value
        :return:
        """
        url = 'https://movie.douban.com/j/new_search_subjects'
        while start < limit:
            params = {
                'sort': sort,
                'range': '0, 10',
                'tags': ','.join(tags) if tags is not None else '',
                'start': start
            }
            text = self._get(url, params=params, headers=HEADERS['api'])
            data = json.loads(text)['data']
            if not data:
                break
            for item in data:
                yield item['url']
            start += len(data)
            time.sleep(2)

    def _get(self, url, **kwargs):
        r = self.session.get(url, **kwargs)
        if r.headers.get('Content-Encoding') == 'br' and self.ENABLE_BROTLI:
            return brotli.decompress(r.content).decode('utf-8')
        else:
            return r.text

    def access_brief(self, url):
        """
        Crawl the brief page.

        :param url:
        :return:
        """
        text = self._get(url, headers=HEADERS['page'])
        soup = Soup(text, 'lxml')
        content = soup.find('div', id='content')
        selector = Selector(text)
        return content, selector

    def access_celebrity(self, movie_id):
        pass

    def access_comment(self, movie_id, start=0, sort='new_score', status='P'):
        pass

    def access_review(self, movie_id, start=0):
        pass

    def access_full_text(self, url):
        pass
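# Usage sketch (illustrative only, not part of the original source). It assumes
# DoubanSpider is importable from this module; the tag value below is just an
# example of what the Douban search page offers.
def _demo_douban_list():
    spider = DoubanSpider()
    for idx, url in enumerate(spider.list(tags=['电影'], limit=40)):
        print(url)
        if idx >= 9:  # only look at the first ten results in this demo
            break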
class WeiboSpider:
    def __init__(self, db: Database = None, path: PathGenerator = None,
                 session: Session = None, auth: Auth = None):
        self.db = MongoDB('weibo', primary_key='id') if db is None else db
        self.path = StoreByUserName('./download') if path is None else path
        self.session = Session(timeout=10, retry=5) if session is None else session
        if auth is None:
            auth = Auth()
        self.token = auth.token.token
        self.client = Client()

    def list(self, page=1):
        items = []
        running = True
        while running:
            data = self.client.favorites.get(access_token=self.token, page=page)
            if len(data.favorites) <= 0:
                break
            for item in data.favorites:
                if item.status.id not in self.db:
                    items.append(item.status)
                else:
                    running = False
                    break
            page += 1
        items.reverse()
        return items

    def download(self, status):
        if 'deleted' not in status:
            user = status.user.name
            for item in status.pic_urls:
                url = item.thumbnail_pic.replace('thumbnail', 'large')
                path = self.path.generate(user_name=user, media_type=MediaType.image)
                r = self.session.get(url)
                with open(path, 'wb') as f:
                    f.write(r.content)
        self.db.add(status.id)
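# Usage sketch (illustrative only, not part of the original source). It assumes
# a configured Auth token and a reachable MongoDB, matching WeiboSpider's
# constructor defaults.
def _demo_weibo_favorites():
    spider = WeiboSpider()
    # list() returns un-crawled favorited statuses, oldest first.
    for status in spider.list():
        spider.download(status)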
class GBDEXSpider:
    def __init__(self):
        self.session = Session(timeout=10)

    def crawl_api(self):
        param = {
            'parentIndustryId': '', 'industryId': '', 'cityId': '', 'areaId': '',
            'dataType': 0, 'sort': 'updateTime', 'order': 'desc', 'keyword': '',
            'page': 1, 'limit': 1000
        }
        url = 'http://trade.gbdex.com/trade.web/accurateData/filterDataByAjax'
        r = self.session.post(url, params=param)
        json_data = json.loads(r.text)
        link_list = json_data['rows'] if 'rows' in json_data else list()
        print('GBDEX - API: {0} items total in list'.format(len(link_list)))
        tqdm_list = tqdm(link_list)
        for link in tqdm_list:
            tqdm_list.set_description('Crawling: {}'.format(link['id']))
            try:
                yield self._crawl_api_item(link['id'])
            except Timeout:
                continue

    def _crawl_api_item(self, product_id):
        url = 'http://trade.gbdex.com/trade.web/api/apishow'
        param = {'productID': product_id}
        r = self.session.get(url, params=param)
        s = Soup(r.text, 'lxml')
        api = dict()
        api['标题'], api['简介'], api['价格'], api['数据类型'], api['覆盖区域'], api['点击量'], api['收藏量'] = self._parse_brief(s)
        api['API'] = self._parse_list(s)
        api['代码示例'] = self._parse_code_demo(s)
        api['id'] = product_id
        return api

    @staticmethod
    def _parse_brief(s):
        detail_info_r = s.find('div', class_='detailInfo-R')
        datum_title = detail_info_r.find('h3', class_='datumTitle')
        title = datum_title.text if datum_title is not None else None
        data_content_hidden = detail_info_r.find('p', id='dataContentHidden')
        brief = data_content_hidden.text if data_content_hidden is not None else None
        product_price = detail_info_r.find('span', id='productPrice')
        price = re.sub(r'\s+', '', product_price.text) if product_price is not None else None
        product_profile = detail_info_r.find('ul', class_='product-profile clearfix')
        if product_profile is not None:
            li_list = product_profile.find_all('li')
            data_type = li_list[1].text.split(':')[1]
            area = li_list[2].text.split(':')[1]
        else:
            combo_set = detail_info_r.find('div', class_='combo-set comboLeft')
            p_list = combo_set.find_all('p')
            data_type = p_list[0].text.split(':')[1]
            area = p_list[2].text.split(':')[1]
        detail_info_l = s.find('div', class_='detailInfo-l')
        target = detail_info_l.find('div', class_='target')
        span_list = target.find_all('span')
        click_count = int(re.sub(r'\s+', '', span_list[0].text))
        fav_count = int(re.sub(r'\s+', '', span_list[2].text))
        return title, brief, price, data_type, area, click_count, fav_count

    @staticmethod
    def _parse_list(s):
        api_title = []
        div = s.find('div', id='definite0')
        section = div.find('section', class_='clearfix base-boder')
        base_info = section.find('div', class_='baseInfo')
        ul = base_info.find(id='navUl')
        for li in ul.find_all('li'):
            api_title.append(li['title'])
        api_list = []
        for idx, page in enumerate(s.find_all('div', class_='baseContent fl')):
            api = {}
            table_list = page.find_all('table')
            if len(table_list) <= 0:
                continue
            table = table_list[0]
            tr_list = table.find_all('tr')
            values = []
            for tr in tr_list:
                td = tr.find_all('td')[1]
                values.append(re.sub(r'\s+', '', td.text))
            api['接口地址'], api['请求类型'], api['返回数据格式'], api['数据总量'], api['是否收费'] = \
                values[0], values[1], values[2], values[3], values[4]
            table = table_list[1]
            t_body = table.find('tbody')
            tr_list = t_body.find_all('tr')
            api['输入字段'] = []
            for tr in tr_list:
                td_list = tr.find_all('td')
                input_data = {'描述': td_list[0].text, '参数名称': td_list[1].text,
                              '数据类型': td_list[2].text, '默认值': td_list[3].text,
                              '是否必填': td_list[4].text}
                api['输入字段'].append(input_data)
            table = table_list[2]
            t_body = table.find('tbody')
            tr_list = t_body.find_all('tr')
            api['返回字段'] = []
            for tr in tr_list:
                td_list = tr.find_all('td')
                return_data = {'描述': td_list[0].text, '参数名称': td_list[1].text,
                               '数据类型': td_list[2].text, '默认值': td_list[3].text,
                               '是否必填': td_list[4].text}
                api['返回字段'].append(return_data)
            api['API名称'] = api_title[idx]
            api_list.append(api)
        return api_list

    @staticmethod
    def _parse_code_demo(s):
        code_demo = {}
        div = s.find('div', id='definite2')
        ul = div.find('ul', id='tab-dm')
        if ul is None:
            return None
        li_list = ul.find_all('li')
        if li_list is None:
            return None
        code_type_list = list(li.text for li in li_list)
        sample_div = div.find('div', class_='definiteContent sample')
        if sample_div is None:
            return None
        code_span_list = sample_div.find_all('span')
        if code_span_list is None:
            return None
        if len(code_type_list) != len(code_span_list):
            print('Count of type ({0}) is not equal to count of code span ({1})'.format(
                len(code_type_list), len(code_span_list)))
        for idx, code in enumerate(code_span_list):
            code_demo[code_type_list[idx]] = code.text
        return code_demo

    def crawl_data_file(self):
        param = {
            'parentIndustryId': '', 'industryId': '', 'cityId': '', 'areaId': '',
            'dataType': 1, 'sort': 'updateTime', 'order': 'desc', 'keyword': '',
            'page': 1, 'limit': 3000
        }
        url = 'http://trade.gbdex.com/trade.web/accurateData/filterDataByAjax'
        r = requests.post(url, params=param, timeout=10)
        json_data = json.loads(r.text)
        link_list = json_data['rows'] if 'rows' in json_data else list()
        print('GBDEX - Data File: {0} items total in list'.format(len(link_list)))
        tqdm_list = tqdm(link_list)
        for link in tqdm_list:
            tqdm_list.set_description('Crawling: {}'.format(link['id']))
            try:
                yield self._crawl_data_file_item(link['id'])
            except Timeout:
                continue

    def _crawl_data_file_item(self, product_id):
        url = 'http://trade.gbdex.com/trade.web/dataReport/reportPayForProduct'
        param = {'productID': product_id}
        r = requests.get(url, params=param, timeout=10)
        s = Soup(r.text, 'lxml')
        item = dict()
        item['标题'], item['简介'], item['价格'], item['数据类型'], item['覆盖区域'], item['点击量'], item['收藏量'] = self._parse_brief(s)
        item['详细信息'] = self._parse_file_info(s)
        item['ID'] = product_id
        return item

    @staticmethod
    def _parse_file_info(s):
        table = s.find('table')
        info = {}
        for tr in table.find_all('tr'):
            td_list = tr.find_all('td')
            info[td_list[0].text] = re.sub(r'\s+', '', td_list[1].text)
        return info
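# Usage sketch (illustrative only, not part of the original source).
# crawl_api() and crawl_data_file() are generators, so results arrive one
# product at a time; dumping them to a JSON Lines file is just one possible
# sink, and the output path below is a placeholder.
def _demo_gbdex_dump(out_path='./gbdex_api.jsonl'):
    spider = GBDEXSpider()
    with open(out_path, 'w', encoding='utf-8') as f:
        for api in spider.crawl_api():
            f.write(json.dumps(api, ensure_ascii=False) + '\n')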
class IMDBSpider:
    def __init__(self):
        self.base_url = 'https://www.imdb.com/'
        self.session = Session(retry=5)
        self.pattern = {
            'title_and_year': re.compile(r'(.+?)\s+\(([0-9]+)\)'),
            'number': re.compile(r'[\d,]+'),
            'space': re.compile(r'\s+')
        }
        self.log = Log.create_logger('IMDBSpider', './imdb.log')

    def __url(self, url):
        return urlparse.urljoin(self.base_url, url)

    def __text(self, text, repl=' '):
        return self.pattern['space'].sub(repl, text).strip()

    @staticmethod
    def __str2int(string: str):
        return int(string.replace(',', ''))

    @staticmethod
    def __percent2float(string: str):
        return float(string.strip('%')) / 100

    def top250(self, start_from=None):
        r = self.session.get(urlparse.urljoin(self.base_url, 'chart/top?ref_=nv_mv_250'))
        s = Soup(r.text, 'lxml')
        table = s.find('table')
        tbody = table.find('tbody')
        tr_list = tbody.find_all('tr')
        start = start_from is None
        for tr in tr_list:
            title_col = tr.find('td', class_='titleColumn')
            a = title_col.find('a')
            title = a.text
            link = a.attrs['href']
            if not start:
                if link == start_from:
                    start = True
            if start:
                yield title, link

    def crawl(self, link):
        (title, year, rating, num_rating, short_summary, metascore, review_user, review_critic,
         num_awards, num_video, num_image, story_line, tag_line, mpaa, genre, details,
         num_review) = self._main(link)
        summary, synopsis = self._plot(link)
        keywords = self._keyword(link)
        awards = self._awards(link)
        casts = self._cast(link)
        spec = self._tech_spec(link)
        trivia = self._trivia(link)
        quotes = self._quotes(link)
        goofs = self._goofs(link)
        connections = self._connections(link)
        faq = self._faq(link)
        rating_detail = self._user_rating(link)
        companies = self._company_credits(link)
        return {
            key: value
            for key, value in locals().items() if key[0] != '_' and key != 'self'
        }

    def _main(self, link):
        r = self.session.get(self.__url(link))
        s = Soup(r.text, 'lxml')
        # Title and year
        title_wrapper = s.find('div', class_='title_wrapper')
        h1 = title_wrapper.find('h1')
        title_and_year = h1.text
        res = self.pattern['title_and_year'].findall(title_and_year)
        title = res[0][0]
        year = self.__str2int(res[0][1])
        # ratings
        rating_wrapper = s.find('div', class_='ratings_wrapper')
        rating_value = rating_wrapper.find('div', class_='ratingValue')
        strong = rating_value.find('strong')
        span = strong.find('span')
        rating = float(self.__text(span.text))
        a = rating_value.find_next('a')
        num_rating = self.__str2int(self.__text(a.text))
        # Short summary
        plot_summary_wrapper = s.find('div', class_='plot_summary_wrapper')
        summary_text = plot_summary_wrapper.find('div', class_='summary_text')
        short_summary = summary_text.text.strip() if summary_text else None
        # metascore
        title_review_bar = plot_summary_wrapper.find('div', class_='titleReviewBar')
        if title_review_bar is not None:
            metascore_div = title_review_bar.find('div', class_='metacriticScore')
            if metascore_div is not None:
                span = metascore_div.find('span')
                metascore = self.__str2int(span.text)
            else:
                metascore = None
        else:
            metascore = None
        # num of review user and critic
        if title_review_bar is not None:
            reviews_div = title_review_bar.find(
                'div', class_='titleReviewBarItem titleReviewbarItemBorder')
            a_list = reviews_div.find_all('a')
            review_user, review_critic = [
                self.__str2int(self.pattern['number'].search(a.text).group()) for a in a_list
            ]
        else:
            review_user, review_critic = None, None
        # num of awards
        title_awards_ranks = s.find('div', id='titleAwardsRanks')
        num_awards = []
        strong = title_awards_ranks.find('strong')
        if strong is not None:
            num_awards.append(strong.text.strip())
        span_list = title_awards_ranks.find_all('span', class_='awards-blurb')
        for span in span_list:
            num_awards.append(self.__text(span.text, ' '))
        # num of videos
        title_video_strip = s.find('div', id='titleVideoStrip')
        if title_video_strip is not None:
            see_more = title_video_strip.find('div', class_='combined-see-more see-more')
            a = see_more.find('a')
            num_video = int(self.pattern['number'].search(a.text).group())
        else:
            num_video = 0
        # num of images
        title_image_strip = s.find('div', id='titleImageStrip')
        if title_image_strip is not None:
            see_more = title_image_strip.find('div', class_='combined-see-more see-more')
            if see_more is not None:
                a = see_more.find_all('a')[1]
                num_image = int(self.pattern['number'].search(a.text).group())
            else:
                num_image = 0
        else:
            num_image = 0
        # short story line
        title_story_line = s.find('div', id='titleStoryLine')
        div = title_story_line.find('div', class_='inline canwrap')
        span = div.find('span')
        story_line = span.text.strip()
        # tagline and mpaa
        txt_block_list = title_story_line.find_all('div', class_='txt-block')
        tag_line_div = txt_block_list[0]
        tag_line = self.__text(tag_line_div.contents[2])
        mpaa_div = txt_block_list[1]
        span = mpaa_div.find('span')
        mpaa = self.__text(span.text, ' ')
        # genre
        see_more_list = title_story_line.find_all('div', class_='see-more inline canwrap')
        genre_div = see_more_list[1]
        genre = list(self.__text(a.text, '') for a in genre_div.find_all('a'))
        # details
        title_details = s.find('div', id='titleDetails')
        details = {}
        for txt_block in title_details.find_all('div', class_='txt-block'):
            h4 = txt_block.find('h4', class_='inline')
            if h4 is not None:
                text = self.__text(txt_block.text)
                if text.find('See more') > 0:
                    text = text[text.find(':') + 1:text.find('See more')].strip()
                else:
                    text = text[text.find(':') + 1:].strip()
                details[self.__text(h4.text)] = text
        title_user_review = s.find('div', id='titleUserReviewsTeaser')
        div = title_user_review.find('div', class_='yn')
        num_review = self.__str2int(self.pattern['number'].search(
            div.find_next('a').find_next('a').find_next('a').text).group())
        return (title, year, rating, num_rating, short_summary, metascore, review_user,
                review_critic, num_awards, num_video, num_image, story_line, tag_line, mpaa,
                genre, details, num_review)

    def _plot(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'plotsummary'))
        s = Soup(r.text, 'lxml')
        # summary
        h4 = s.find('h4', id='summaries')
        ul = h4.find_next('ul')
        summary = [li.find('p').text for li in ul.find_all('li')]
        # synopsis
        h4 = s.find('h4', id='synopsis')
        ul = h4.find_next('ul')
        synopsis = [li.text for li in ul.find_all('li')]
        return summary, synopsis

    def _keyword(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'keywords'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='keywords_content')
        table = div.find('table')
        tbody = table.find('tbody')
        keywords = []
        for td in tbody.find_all('td'):
            if td is not None:
                a = td.find('a')
                if a is not None:
                    keywords.append(a.text)
        return keywords

    def _awards(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'awards'))
        s = Soup(r.text, 'lxml')
        main = s.find('div', id='main')
        awards = []
        for h3 in main.find_all('h3')[1:]:
            title = self.__text(h3.next)
            year = self.__text(h3.find('a').text)
            awards.append({'title': title, 'year': year})
        return awards

    def _cast(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'fullcredits'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='fullcredits_content')
        casts = {}
        for h4 in div.find_all('h4'):
            category = self.__text(h4.text)
            table = h4.find_next('table')
            cast_list = []
            if 'class' in table.attrs and 'cast_list' in table.attrs['class']:
                for tr in table.find_all('tr'):
                    if 'class' in tr.attrs:
                        a = tr.find_next('td').find_next('td').find('a')
                        name = self.__text(a.text)
                        td = tr.find('td', class_='character')
                        credit = self.__text(td.text)
                        cast_list.append({'name': name, 'credit': credit})
            else:
                tbody = table.find('tbody')
                for tr in tbody.find_all('tr'):
                    td = tr.find('td', class_='name')
                    name = self.__text(td.text) if td is not None else None
                    td = tr.find('td', class_='credit')
                    credit = self.__text(td.text) if td is not None else None
                    cast_list.append({'name': name, 'credit': credit})
            casts[category] = cast_list
        return casts

    def _tech_spec(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'technical'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='technical_content')
        table = div.find('table')
        tbody = table.find('tbody')
        spec = {}
        for tr in tbody.find_all('tr'):
            td = tr.find('td')
            label = self.__text(td.text)
            td = td.find_next('td')
            value = self.__text(td.text)
            spec[label] = value
        return spec

    def _trivia(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'trivia'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='trivia_content')
        trivia = []
        for text_list in div.find_all('div', class_='list'):
            for soda_text in text_list.find_all('div', class_='sodatext'):
                trivia.append(self.__text(soda_text.text))
        return trivia

    def _quotes(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'quotes'))
        s = Soup(r.text, 'lxml')
        quote = []
        div = s.find('div', id='quotes_content')
        for quote_list in div.find_all('div', class_='list'):
            for soda_text in quote_list.find_all('div', class_='sodatext'):
                quote.append(self.__text(soda_text.text))
        return quote

    def _goofs(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'goofs'))
        s = Soup(r.text, 'lxml')
        goofs = []
        div = s.find('div', id='goofs_content')
        for soda_text in div.find_all('div', class_='sodatext'):
            goofs.append(self.__text(soda_text.text))
        return goofs

    def _connections(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'movieconnections'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='connections_content')
        connections = {}
        category = None
        for soda_list in div.find_all('div', class_='list'):
            if 'id' in soda_list.attrs and soda_list.attrs['id'] == 'no_content':
                return None
            for soda in soda_list.find_all('div', class_='soda'):
                last = soda.find_previous()
                if last.name == 'h4':
                    category = self.__text(last.text)
                if category not in connections:
                    connections[category] = []
                connections[category].append(self.__text(soda.text))
        return connections

    def _faq(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'faq'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='main')
        faq = []
        IDs = ['faq-no-spoilers', 'faq-spoilers']
        for ID in IDs:
            section = div.find('section', id=ID)
            ul = section.find('ul')
            for li in ul.find_all('li'):
                question_div = li.find('div', class_='faq-question-text')
                if question_div is None:
                    continue
                question = self.__text(question_div.text)
                answer_div = li.find('div', class_='ipl-hideable-container')
                if answer_div is None:
                    continue
                p = answer_div.find('p')
                answer = self.__text(p.text)
                faq.append({'question': question, 'answer': answer})
        return faq

    def _user_rating(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'ratings'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='main')
        table = div.find('table')
        rating = {str(i): {} for i in range(1, 11)}
        for idx, div in enumerate(table.find_all('div', class_='topAligned')):
            rating[str(10 - idx)]['percent'] = self.__percent2float(self.__text(div.text))
        for idx, div in enumerate(table.find_all('div', class_='leftAligned')[1:]):
            rating[str(10 - idx)]['count'] = self.__str2int(self.__text(div.text))
        return rating

    def _company_credits(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'companycredits'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='company_credits_content')
        companies = {}
        for h4 in div.find_all('h4', class_='dataHeaderWithBorder'):
            category = self.__text(h4.text)
            credit = []
            for li in h4.find_next('ul').find_all('li'):
                credit.append(self.__text(li.text))
            companies[category] = credit
        return companies
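# Usage sketch (illustrative only, not part of the original source). Note that
# the selectors above target an older imdb.com page layout, so crawl() may need
# adjustment against the current site.
def _demo_imdb_top250():
    spider = IMDBSpider()
    for title, link in spider.top250():
        record = spider.crawl(link)
        print(title, record['year'], record['rating'])
        break  # only the first chart entry for this demo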
class JuheSpider:
    def __init__(self):
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        self.session = Session(retry=10, timeout=20)

    def crawl_api(self, total_page=6, start_from=None):
        url = 'https://www.juhe.cn/docs/index/page/{0}'
        start = start_from is None
        for page in range(total_page + 1):
            r = self.session.get(url.format(page))
            s = Soup(r.text, 'lxml')
            ul = s.find('ul', class_='api-list-ul')
            for li in ul.find_all('li', class_='api-list-li'):
                a = li.find('a')
                link = a['href']
                if not start:
                    if link == start_from:
                        start = True
                    continue
                # h2 = a.find('h2', class_='api-name')
                # title = h2.text
                # div = li.find('div', class_='api-price')
                # price = div.text if div is not None else None
                # p = li.find('p', class_='api-marks')
                # desc = p.text if p is not None else None
                yield link, self._crawl_api_item(link)

    def _crawl_api_item(self, link):
        url = 'https://www.juhe.cn{0}'.format(link)
        num = link.split('/')[-1]
        r = self.session.get(url)
        s = Soup(r.text, 'lxml')
        ul = s.find('ul', class_='api-pp')
        temp = []
        for li in ul.find_all('li'):
            div = li.find('div')
            temp.append(div.text)
        api_item = {
            'ID': temp[0],
            'API请求次数': temp[1],
            '收藏量': temp[2]
        }
        api_infos = s.find('div', class_='api-infos')
        h1 = api_infos.find('h1')
        api_item['标题'] = h1.text if h1 is not None else None
        tags = api_infos.find_all('span')
        api_item['标签列表'] = list(span.text for span in tags) if tags is not None else None
        api_des_info = api_infos.find('p', class_='api-des-info')
        desc = api_des_info.text
        api_item['简介'] = desc
        product_content = s.find('div', class_='product-content')
        if product_content is not None:
            product_aways = product_content.find('div', class_='product-aways')
            api_item['功能介绍'] = product_aways.text
        api_url_list = []
        doc_api_area = s.find('div', id='docs-api-area')
        ul = doc_api_area.find('ul')
        li_list = ul.find_all('li')
        for li in li_list:
            a = li.find('a')
            api_title = a.text
            api_url = a['href']
            api_url_list.append({'title': api_title, 'url': api_url + '/' + num})
        price_url = 'https://www.juhe.cn/docs/api/packages/{0}'.format(num)
        r = self.session.get(price_url)
        result = json.loads(r.text)
        html = result['result']['html']
        s = Soup(html, 'lxml')
        ul = s.find('ul')
        li_list = ul.find_all('li')
        prices = []
        for li in li_list:
            price = li['data-price'] if 'data-price' in li.attrs else None
            tag = re.sub(r'\s+', '', li.text)
            if price is not None or tag is not None:
                prices.append({'price': price, 'tag': tag})
        api_item['价格'] = prices
        api_list = []
        for api_url in api_url_list:
            api = {
                'title': api_url['title']
            }
            api_url = 'https://www.juhe.cn{0}'.format(api_url['url'])
            r = self.session.get(api_url)
            result = json.loads(r.text)['result']
            s = Soup(result['html'], 'lxml')
            div_list = s.find_all('div', class_='simpleline')
            for i in range(len(div_list) - 1):
                div = div_list[i]
                tag_and_content = div.text.split(':', 1)
                if len(tag_and_content) > 1:
                    api[tag_and_content[0]] = tag_and_content[1].strip().strip('\n')
            idx = 0
            div_list = s.find_all('div', class_='simpleTable')
            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == '请求参数说明':
                params = []
                table = div.find('table')
                tr_list = table.find_all('tr')
                header_tr = tr_list[0]
                headers = []
                for th in header_tr.find_all('th')[1:]:
                    headers.append(th.text)
                for tr in tr_list[1:]:
                    param = {}
                    for index, td in enumerate(tr.find_all('td')[1:]):
                        param[headers[index]] = td.text
                    params.append(param)
                api[tag] = params
                idx = min(idx + 1, len(div_list) - 1)
            codes = []
            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == '请求代码示例':
                for code in div.find_all('code'):
                    language = code['class'][0]
                    content = code.text
                    codes.append({'language': language, 'code': content})
                api[tag] = codes
                idx = min(idx + 1, len(div_list) - 1)
            return_param = []
            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == '返回参数说明':
                table = div.find('table')
                tr_list = table.find_all('tr')
                header_tr = tr_list[0]
                headers = []
                for th in header_tr.find_all('th')[1:]:
                    headers.append(th.text)
                for tr in tr_list[1:]:
                    param = {}
                    for index, td in enumerate(tr.find_all('td')[1:]):
                        param[headers[index]] = td.text
                    return_param.append(param)
                api[tag] = return_param
                idx = min(idx + 1, len(div_list) - 1)
            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == 'JSON返回示例':
                return_example = div.find('code').text
                api[tag] = return_example
            api_list.append(api)
        api_item['API'] = api_list
        error_code_url = 'https://www.juhe.cn/docs/api/errorCode/{}'.format(num)
        r = self.session.get(error_code_url)
        result = json.loads(r.text)
        api_item['错误码'] = result['result'] if 'result' in result else None
        return api_item

    def crawl_data(self, start_from=None):
        url = 'https://www.juhe.cn/market'
        start = start_from is None
        r = self.session.get(url)
        s = Soup(r.text, 'lxml')
        ul = s.find('ul', class_='api-list-ul')
        for li in ul.find_all('li', class_='api-list-li'):
            a = li.find('a')
            link = a['href']
            if not start:
                if link == start_from:
                    start = True
                continue
            # h2 = a.find('h2', class_='api-name')
            # title = h2.text
            yield link, self._crawl_data_item(link)

    def _crawl_data_item(self, link):
        r = self.session.get(link)
        s = Soup(r.text, 'lxml')
        block_main_info = s.find('div', class_='block-main-info')
        data_item = {'标题': block_main_info.find('h1').text,
                     '简介': block_main_info.find('p', class_='block-main-desc').text}
        block_main_detail = block_main_info.find('div', class_='block-main-detail')
        dl_list = block_main_detail.find_all('dl')
        for dl in dl_list:
            dt = dl.find('dt')
            dd = dl.find('dd')
            data_item[dt.text] = dd.text
        ul = s.find('ul', class_='block-main-list')
        temp = []
        for li in ul.find_all('li'):
            span = li.find('span')
            temp.append(span.text)
        data_item.update({
            'ID': temp[0],
            '下载次数': temp[1],
            '收藏量': temp[2]
        })
        table = s.find('table', class_='block-table')
        tr_list = table.find_all('tr')
        header_tr = tr_list[0]
        headers = []
        for td in header_tr.find_all('td')[1:]:
            headers.append(td.text)
        data_list = []
        for tr in tr_list[1:]:
            data = {}
            for idx, td in enumerate(tr.find_all('td')[1:]):
                data[headers[idx]] = td.text
            data_list.append(data)
        data_item['数据列表'] = data_list
        return data_item
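# Usage sketch (illustrative only, not part of the original source).
# crawl_api() yields (link, parsed_item) pairs, so a run can be resumed by
# passing the last crawled link back in via start_from.
def _demo_juhe_resume(last_link=None):
    spider = JuheSpider()
    for link, item in spider.crawl_api(start_from=last_link):
        print(link, item.get('标题'))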
class CboooSpider:
    def __init__(self):
        self.session = Session()
        self.logger = Log.create_logger('spider')

    def get_id(self, redis: RedisSet):
        start_page = config.Config.spider.start_page
        params = {
            'area': config.Config.spider.area,
            'type': 0,
            'year': 0,
            'initial': '全部',
            'pIndex': start_page
        }
        res = self.session.get(url=url('/Mdata/getMdata_movie'), params=params)
        data = json.loads(res.text)
        self.logger.info('Total: {0} pages, {1} items'.format(data['tPage'], data['tCount']))
        end_page = data['tPage']
        for item in data['pData']:
            redis.add(item['ID'])
        self.logger.info('Page {}'.format(start_page))
        time.sleep(10)
        for i in range(start_page + 1, end_page + 1):
            params['pIndex'] = i
            res = self.session.get(url=url('/Mdata/getMdata_movie'), params=params)
            data = json.loads(res.text)
            for item in data['pData']:
                redis.add(item['ID'])
            self.logger.info('Page {}'.format(i))
            time.sleep(10)

    def start_crawl(self, extractor: Extractor, redis: RedisSet, mongo: MongoDB):
        while not redis.empty():
            movie_id = redis.pop()
            self.logger.info('Movie ID: {}'.format(movie_id))
            try:
                info = self._crawl(movie_id, extractor)
                if info is not None:
                    if mongo.count({'id': movie_id}) <= 0:
                        mongo.insert(info)
                    else:
                        self.logger.info('Duplicate record {}'.format(movie_id))
                else:
                    self.logger.warning('Useless record {}'.format(movie_id))
            except NetworkException as e:
                self.logger.error(e)
                redis.add(movie_id)
            time.sleep(10)

    def _crawl(self, movie_id, extractor: Extractor):
        retry = MAX_RETRY
        while retry:
            try:
                res = self.session.get(url=url('/m/{}'.format(movie_id)))
                info = extractor.extract_info(res.text)
                if info is None:
                    return None
                res = self.session.get(url=url('/Mdata/getMovieEventAll?movieid={}'.format(movie_id)))
                info['event'] = extractor.extract_events(res.text)
                info['id'] = movie_id
                return info
            except (NetworkException, AttributeError) as e:
                self.logger.error(str(e))
                retry -= 1
                if retry <= 0:
                    raise RetryLimitExceededException(movie_id) from e
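# Usage sketch (illustrative only, not part of the original source). RedisSet,
# Extractor and MongoDB are this project's own wrappers; their constructor
# arguments below ('cbooo_ids', 'cbooo') are hypothetical placeholders.
def _demo_cbooo_pipeline():
    spider = CboooSpider()
    queue = RedisSet('cbooo_ids')
    spider.get_id(queue)
    spider.start_crawl(Extractor(), queue, MongoDB('cbooo'))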
class TwitterSpider:
    def __init__(self, token: str, proxies: dict = None, delay=DELAY, retry=RETRY,
                 logger=None, session: Session = None):
        self.base_url = 'https://api.twitter.com/1.1/'
        self.headers = {'Authorization': token}
        self.logger = logger if logger is not None else Log.create_logger('TwitterSpider', './twitter.log')
        self.delay = delay
        self.session = Session(retry=retry, proxies=proxies) if session is None else session

    def crawl_timeline(self, screen_name: str = None, user_id: str = None,
                       include_retweets: bool = True, exclude_replies: bool = True,
                       start_id=None, since_id=None, delay: float = None) -> Iterable[Tweet]:
        """
        :param screen_name:
        :param user_id:
        :param include_retweets:
        :param exclude_replies:
        :param start_id:
        :param since_id:
        :param delay:
        :return:
        """
        if delay is None:
            delay = self.delay
        self.logger.info('Crawling timeline: %s', locals())
        tweets = self.timeline(screen_name=screen_name, user_id=user_id,
                               include_rts=include_retweets, exclude_replies=exclude_replies,
                               max_id=start_id, since_id=since_id)
        if len(tweets) <= 0:
            return
        tweet_id = start_id
        for tweet in tweets:
            tweet_id = tweet['id']
            yield Tweet(tweet)
        while len(tweets) > 0:
            sleep(delay)
            tweets = self.timeline(screen_name=screen_name, user_id=user_id,
                                   include_rts=include_retweets, exclude_replies=exclude_replies,
                                   max_id=tweet_id - 1, since_id=since_id)
            for tweet in tweets:
                tweet_id = tweet['id']
                yield Tweet(tweet)

    def crawl_likes(self, screen_name: str = None, user_id: str = None,
                    start_id=None, since_id=None, delay: float = None) -> Iterable[Tweet]:
        if delay is None:
            delay = self.delay
        self.logger.info('Crawling likes: %s', locals())
        tweets = self.likes(screen_name=screen_name, user_id=user_id,
                            max_id=start_id, since_id=since_id)
        if len(tweets) <= 0:
            return
        tweet_id = start_id
        for tweet in tweets:
            tweet_id = tweet['id']
            yield Tweet(tweet)
        while len(tweets) > 0:
            sleep(delay)
            tweets = self.likes(screen_name=screen_name, user_id=user_id,
                                max_id=tweet_id - 1, since_id=since_id)
            for tweet in tweets:
                tweet_id = tweet['id']
                yield Tweet(tweet)

    def crawl_following(self, screen_name: str = None, user_id: str = None,
                        include_retweets: bool = True, exclude_replies: bool = True,
                        checkpoint: Checkpoint = None, delay: float = None) -> Iterable[Tweet]:
        if delay is None:
            delay = self.delay
        cursor = checkpoint.cursor if checkpoint is not None else None
        start = checkpoint is None or checkpoint.start
        self.logger.info('Crawling following: %s', locals())
        users = self.following(screen_name=screen_name, user_id=user_id, cursor=cursor)
        for user in users['users']:
            if not start:
                if checkpoint.user_id is None or user['id'] == checkpoint.user_id:
                    start = True
                    sleep(delay)
                    for tweet in self.crawl_timeline(user_id=user['id'],
                                                     include_retweets=include_retweets,
                                                     exclude_replies=exclude_replies,
                                                     start_id=checkpoint.tweet_id,
                                                     delay=delay):
                        yield tweet
                else:
                    continue
            else:
                sleep(delay)
                for tweet in self.crawl_timeline(user_id=user['id'],
                                                 include_retweets=include_retweets,
                                                 exclude_replies=exclude_replies,
                                                 delay=delay):
                    yield tweet
        cursor = users['next_cursor']
        while len(users['users']) > 0:
            sleep(delay)
            users = self.following(screen_name=screen_name, user_id=user_id, cursor=cursor)
            for user in users['users']:
                sleep(delay)
                for tweet in self.crawl_timeline(user_id=user['id'],
                                                 include_retweets=include_retweets,
                                                 exclude_replies=exclude_replies,
                                                 delay=delay):
                    yield tweet
            cursor = users['next_cursor']

    def _get(self, url, params):
        """
        Access the API with requests and return the parsed JSON result.
        """
        r = self.session.get(url=url, params=params, headers=self.headers)
        return json.loads(r.text)

    def _url(self, url):
        return urlparse.urljoin(self.base_url, url)

    def timeline(self, user_id: str = None, screen_name: str = None, count: int = 200,
                 exclude_replies: bool = None, include_rts: bool = None, trim_user: bool = None,
                 since_id=None, max_id=None):
        """
        Returns a collection of the most recent Tweets posted by the user indicated by the
        screen_name or user_id parameters.

        User timelines belonging to protected users may only be requested when the authenticated
        user either "owns" the timeline or is an approved follower of the owner.

        The timeline returned is the equivalent of the one seen as a user's profile on Twitter.

        This method can only return up to 3,200 of a user's most recent Tweets. Native retweets of
        other statuses by the user are included in this total, regardless of whether include_rts is
        set to false when requesting this resource.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 900
        Requests / 15-min window (app auth): 1500
        Requests / 24-hour window: 100,000

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-show
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param since_id: Returns results with an ID greater than (that is, more recent than) the
            specified ID. There are limits to the number of Tweets that can be accessed through the
            API. If the limit of Tweets has occurred since the since_id, the since_id will be
            forced to the oldest ID available.
        :param count: Specifies the number of Tweets to try and retrieve, up to a maximum of 200
            per distinct request. The value of count is best thought of as a limit to the number of
            Tweets to return because suspended or deleted content is removed after the count has
            been applied. We include retweets in the count, even if include_rts is not supplied. It
            is recommended you always send include_rts=1 when using this API method.
        :param max_id: Returns results with an ID less than (that is, older than) or equal to the
            specified ID.
        :param trim_user: When set to either true, t or 1, each Tweet returned in a timeline will
            include a user object including only the status author's numerical ID. Omit this
            parameter to receive the complete user object.
        :param exclude_replies: This parameter will prevent replies from appearing in the returned
            timeline. Using exclude_replies with the count parameter will mean you will receive
            up-to count tweets — this is because the count parameter retrieves that many Tweets
            before filtering out retweets and replies.
        :param include_rts: When set to false, the timeline will strip any native retweets (though
            they will still count toward both the maximal length of the timeline and the slice
            selected by the count parameter). Note: If you're using the trim_user parameter in
            conjunction with include_rts, the retweets will still contain a full user object.
        :return: List of tweet objects.
        """
        params = locals()
        del params['self']
        self.logger.info('Get timeline: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required.')
        return self._get(self._url('statuses/user_timeline.json'), params)

    def user(self, user_id: str = None, screen_name: str = None, include_entitles: bool = None):
        """
        Returns a variety of information about the user specified by the required user_id or
        screen_name parameter. The author's most recent Tweet will be returned inline when
        possible.

        You must be following a protected user to be able to see their most recent Tweet. If you
        don't follow a protected user, the user's Tweet will be removed. A Tweet will not always be
        returned in the current_status field.

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-show
        for more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 900
        Requests / 15-min window (app auth): 900

        :param user_id: The ID of the user for whom to return results. Either an id or screen_name
            is required for this method.
        :param screen_name: The screen name of the user for whom to return results. Either an id or
            screen_name is required for this method.
        :param include_entitles: The entities node will not be included when set to false.
        :return: User-object, see
            https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object .
        """
        params = locals()
        del params['self']
        self.logger.info('Get user: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('users/show.json'), params)

    def followers(self, user_id: str = None, screen_name: str = None, cursor=None,
                  count: int = 200, skip_status: bool = None, include_user_entitles: bool = None):
        """
        Returns a cursored collection of user objects for users following the specified user.

        At this time, results are ordered with the most recent following first — however, this
        ordering is subject to unannounced change and eventual consistency issues. Results are
        given in groups of 20 users and multiple "pages" of results can be navigated through using
        the next_cursor value in subsequent requests. See Using cursors to navigate collections for
        more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-followers-list
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided, a
            value of -1 will be assumed, which is the first "page." The response from the API will
            include a previous_cursor and next_cursor to allow paging back and forth. See Using
            cursors to navigate collections for more information.
        :param count: The number of users to return per page, up to a maximum of 200.
        :param skip_status: When set to either true, t or 1, statuses will not be included in the
            returned user objects.
        :param include_user_entitles: The user object entities node will not be included when set
            to false.
        :return: {
                "users": [{user-object}, {user-object}, {user-object}],
                "previous_cursor": 0,
                "previous_cursor_str": "0",
                "next_cursor": 1333504313713126852,
                "next_cursor_str": "1333504313713126852"
            }
            For user-object, see
            https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object .
        """
        params = locals()
        del params['self']
        self.logger.info('Get followers: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('followers/list.json'), params)

    def follower_ids(self, user_id: str = None, screen_name: str = None, cursor=None,
                     count: int = 200, skip_status: bool = None,
                     include_user_entitles: bool = None):
        """
        Returns a cursored collection of user IDs for every user following the specified user.

        At this time, results are ordered with the most recent following first — however, this
        ordering is subject to unannounced change and eventual consistency issues. Results are
        given in groups of 20 users and multiple "pages" of results can be navigated through using
        the next_cursor value in subsequent requests. See Using cursors to navigate collections for
        more information.

        This method is especially powerful when used in conjunction with GET users / lookup, a
        method that allows you to convert user IDs into full user objects in bulk.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-followers-ids
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided, a
            value of -1 will be assumed, which is the first "page." The response from the API will
            include a previous_cursor and next_cursor to allow paging back and forth. See Using
            cursors to navigate collections for more information.
        :param count: The number of users to return per page, up to a maximum of 200.
        :param skip_status: When set to either true, t or 1, statuses will not be included in the
            returned user objects.
        :param include_user_entitles: The user object entities node will not be included when set
            to false.
        :return: {
                "ids": [],
                "previous_cursor": 0,
                "previous_cursor_str": "0",
                "next_cursor": 1333504313713126852,
                "next_cursor_str": "1333504313713126852"
            }
        """
        params = locals()
        del params['self']
        self.logger.info('Get follower IDs: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('followers/ids.json'), params)

    def following(self, user_id: str = None, screen_name: str = None, cursor=None,
                  count: int = 200, stringify_ids: bool = None):
        """
        Returns a cursored collection of user objects for every user the specified user is
        following (otherwise known as their "friends").

        At this time, results are ordered with the most recent following first — however, this
        ordering is subject to unannounced change and eventual consistency issues. Results are
        given in groups of 20 users and multiple "pages" of results can be navigated through using
        the next_cursor value in subsequent requests. See Using cursors to navigate collections for
        more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-friends-list
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided, a
            value of -1 will be assumed, which is the first "page." The response from the API will
            include a previous_cursor and next_cursor to allow paging back and forth. See Using
            cursors to navigate collections for more information.
        :param count: The number of users to return per page, up to a maximum of 200. Defaults to
            20.
        :param stringify_ids:
        :return: {
                "users": [{user-object}, {user-object}, {user-object}],
                "previous_cursor": 0,
                "previous_cursor_str": "0",
                "next_cursor": 1333504313713126852,
                "next_cursor_str": "1333504313713126852"
            }
            For user-object, see
            https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object .
        """
        params = locals()
        del params['self']
        self.logger.info('Get followings: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('friends/list.json'), params)

    def following_ids(self, user_id: str = None, screen_name: str = None, cursor=None,
                      count: int = 200, stringify_ids: bool = None):
        """
        Returns a cursored collection of user IDs for every user the specified user is following
        (otherwise known as their "friends").

        At this time, results are ordered with the most recent following first — however, this
        ordering is subject to unannounced change and eventual consistency issues. Results are
        given in groups of 20 users and multiple "pages" of results can be navigated through using
        the next_cursor value in subsequent requests. See Using cursors to navigate collections for
        more information.

        This method is especially powerful when used in conjunction with GET users / lookup, a
        method that allows you to convert user IDs into full user objects in bulk.

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-friends-ids
        for more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided, a
            value of -1 will be assumed, which is the first "page." The response from the API will
            include a previous_cursor and next_cursor to allow paging back and forth. See Using
            cursors to navigate collections for more information.
        :param count: The number of users to return per page, up to a maximum of 200. Defaults to
            20.
        :param stringify_ids:
        :return: {
                "ids": [],
                "previous_cursor": 0,
                "previous_cursor_str": "0",
                "next_cursor": 1333504313713126852,
                "next_cursor_str": "1333504313713126852"
            }
        """
        params = locals()
        del params['self']
        self.logger.info('Get following IDs: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('friends/ids.json'), params)

    def likes(self, user_id: str = None, screen_name: str = None, count: int = 200,
              since_id=None, max_id=None, include_entitles: bool = None):
        """
        Note: favorites are now known as likes.

        Returns the 20 most recent Tweets liked by the authenticating or specified user.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 75
        Requests / 15-min window (app auth): 75

        Check https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-favorites-list
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param count: Specifies the number of records to retrieve. Must be less than or equal to
            200; defaults to 20. The value of count is best thought of as a limit to the number of
            Tweets to return because suspended or deleted content is removed after the count has
            been applied.
        :param since_id: Returns results with an ID greater than (that is, more recent than) the
            specified ID. There are limits to the number of Tweets which can be accessed through
            the API. If the limit of Tweets has occurred since the since_id, the since_id will be
            forced to the oldest ID available.
        :param max_id: Returns results with an ID less than (that is, older than) or equal to the
            specified ID.
        :param include_entitles: The entities node will be omitted when set to false.
        :return: List of tweet objects.
        """
        params = locals()
        del params['self']
        self.logger.info('Get likes: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('favorites/list.json'), params)

    def tweet(self, tweet_id: str, trim_user: bool = None, include_my_retweet: bool = None,
              include_entitles: bool = None, include_ext_alt_text: bool = None,
              include_card_uri: bool = None):
        """
        Returns a single Tweet, specified by the id parameter. The Tweet's author will also be
        embedded within the Tweet.

        See GET statuses / lookup for getting Tweets in bulk (up to 100 per call). See also
        Embedded Timelines, Embedded Tweets, and GET statuses/oembed for tools to render Tweets
        according to Display Requirements.

        About Geo

        If there is no geotag for a status, then there will be an empty <geo></geo> or "geo" : {}.
        This can only be populated if the user has used the Geotagging API to send a
        statuses/update.

        The JSON response mostly uses conventions laid out in GeoJSON. The coordinates that Twitter
        renders are reversed from the GeoJSON specification (GeoJSON specifies a longitude then a
        latitude, whereas Twitter represents it as a latitude then a longitude), eg:

            "geo": {"type": "Point", "coordinates": [37.78029, -122.39697]}

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 900
        Requests / 15-min window (app auth): 900

        :param tweet_id: The numerical ID of the desired Tweet.
        :param trim_user: When set to either true, t or 1, each Tweet returned in a timeline will
            include a user object including only the status author's numerical ID. Omit this
            parameter to receive the complete user object.
        :param include_my_retweet: When set to either true, t or 1, any Tweets returned that have
            been retweeted by the authenticating user will include an additional
            current_user_retweet node, containing the ID of the source status for the retweet.
        :param include_entitles: The entities node will not be included when set to false.
        :param include_ext_alt_text: If alt text has been added to any attached media entities,
            this parameter will return an ext_alt_text value in the top-level key for the media
            entity. If no value has been set, this will be returned as null.
        :param include_card_uri: When set to either true, t or 1, the retrieved Tweet will include
            a card_uri attribute when there is an ads card attached to the Tweet and when that card
            was attached using the card_uri value.
        :return: The tweet object.
        """
        params = locals()
        del params['self']
        self.logger.info('Get tweet: %s', params)
        if tweet_id is None:
            raise ValueError('Tweet ID is required')
        return self._get(self._url('statuses/show.json'), params)
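# Usage sketch (illustrative only, not part of the original source). The bearer
# token string and screen name below are placeholders supplied by the caller.
def _demo_twitter_likes(token='Bearer <your-token>', screen_name='example_user'):
    spider = TwitterSpider(token)
    # crawl_likes() pages backwards through the account's liked tweets using max_id.
    for tweet in spider.crawl_likes(screen_name=screen_name):
        for media in tweet.medias:
            print(media.url)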