def get_attribute_from_html(self, data):
    result = {}
    tag_name = []
    for key, value in data.items():
        if "url" != key:
            pars = BeautifulSoup(value, "html.parser").find()
            if pars.get("id", False):
                parsed_data = "#" + pars.get("id")
                result[key] = parsed_data
            elif pars.get("src", False):
                parsed_data = "@" + pars.get("src")
                result[key] = parsed_data + '|' + pars.name
            elif pars.get("class", False):
                parsed_data = "." + " ".join(pars.get("class"))
                result[key] = parsed_data
            elif pars.get('itemprop', False):
                parsed_data = "&" + pars.get("itemprop")
                result[key] = parsed_data + "|" + pars.name
            elif pars.get('href', False):
                parsed_data = "^" + pars.get("href")
                result[key] = parsed_data + "|" + pars.name
            else:
                tag_name.append(key)
        else:
            result[key] = value
    if tag_name:
        return [result, tag_name]
    return result
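# A minimal, self-contained illustration of the lookups get_attribute_from_html relies on
# (the <img> markup below is invented for the example). BeautifulSoup(...).find() with no
# arguments returns the first tag, and tag.get(attr, default) returns the attribute value
# or the default instead of raising KeyError; "class" is returned as a list.
from bs4 import BeautifulSoup

tag = BeautifulSoup('<img id="hero" src="/a.png" class="big wide">', "html.parser").find()
print(tag.get("id", False))        # 'hero'
print(tag.get("src", False))       # '/a.png'
print(" ".join(tag.get("class")))  # 'big wide'
print(tag.name)                    # 'img'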
def repetitive_rate_by_playlistlink(link1, link2):
    # Build string lists of all the songs in each playlist.
    lists1 = []
    lists2 = []
    # linklist = [link1, link2]
    # for url in linklist:
    #     s = requests.session()
    #     s = BeautifulSoup(s.get(url, headers=headers).content, 'lxml')
    #     main = s.find('ul', {'class': 'f-hide'})
    #     for music in main.find_all('a'):
    #         lists.append(music.text)
    s1 = requests.session()
    s1 = BeautifulSoup(s1.get(link1, headers=headers).content, 'lxml')
    main = s1.find('ul', {'class': 'f-hide'})
    for music in main.find_all('a'):
        lists1.append(music.text)
    s2 = requests.session()
    s2 = BeautifulSoup(s2.get(link2, headers=headers).content, 'lxml')
    main = s2.find('ul', {'class': 'f-hide'})
    for music in main.find_all('a'):
        lists2.append(music.text)
    myset1 = set(lists1)
    myset2 = set(lists2)
    pattern = re.compile(r'\Wu\'')
    intersectionset = re.sub(pattern, '<br>\'', str(myset1 & myset2))
    length = len(myset1 | myset2) + len(myset1 & myset2)
    print intersectionset
    return (u"你们的歌单重合率为:%f%%<br><br>重复歌曲共%d首如下:%s" %
            (len(myset1 & myset2) * 200 / length, len(myset1 & myset2),
             intersectionset.decode('unicode-escape')))
def evaluateDiv(contentJson):
    contentBase = BeautifulSoup(contentJson, "lxml")
    divs = contentBase.find_all('div')
    totalQuantity = len(divs)
    failed = 0
    badTags = []
    good = 0
    flag = 0
    numId = -1
    for div in divs:
        numId += 1
        divRole = div.get('role')
        divAria = div.get('aria-level')
        nextAux = div.findNext()
        divNested = BeautifulSoup(str(nextAux), 'lxml').find('div')
        divParent = div.parent
        if (divAria):
            # Nested ARIA headings - 100%
            parentAria = divParent.get('aria-level')
            parentRole = divParent.get('role')
            if (divParent.name == "div" and parentAria and parentRole == "heading"):
                flag = 1
                typeError = 1
        # Nested ARIA headings - 100%
        elif (divNested):
            divNestedRole = divNested.get('role')
            divNestedAria = divNested.get('aria-level')
            if (divNestedAria and (divNestedRole == "heading")):
                flag = 1
                typeError = 1
        # ARIA role=heading - 25%
        elif (divRole and divRole == "heading" and not divNested):
            good += 1
        else:
            good += 1
        if (flag == 1):
            failed += 1
            pos = position(contentBase, div)
            pos.append(numId)
            pos.append(typeError)
            badTags.append(pos)
            flag = 0
    dataResponse = {
        'tag': 'div',
        'totalTagsAnalyzed': totalQuantity,
        'totalTagsFailed': failed,
        'positionsBadTags': badTags
    }
    response = json.dumps({'status': 'success', 'data': dataResponse})
    return response
def search_id_by_username(self, username):
    """Look up a user ID by the user's nickname.

    :param str username: user nickname
    :return int: user ID
    """
    if not username:
        cprint('Must give an <user id> or <username>!', 'yellow')
        sys.exit(1)
    search_url = urljoin(HOST_PAGE, SEARCH_DESIGNER_SUFFIX.format(word=username))
    try:
        response = session_request(search_url)
    except requests.exceptions.ProxyError:
        cprint('Cannot connect to proxy.', 'red')
        sys.exit(1)
    except Exception as e:
        cprint(f'Failed to connect to {search_url}, {e}', 'red')
        sys.exit(1)
    author_1st = BeautifulSoup(response.text, 'html.parser').find(name='div', class_='author-info')
    if (not author_1st) or (author_1st.get('data-name') != username):
        cprint(f'Username「{username}」does not exist!', 'yellow')
        sys.exit(1)
    return author_1st.get('data-id')
def encodeScript(line):
    sc = BeautifulSoup(line, "html.parser").find("script")
    if (sc.get("src")):
        sc["src"] = encodeBase64(sc.get("src"))
    else:
        sc.string = pattern.sub(
            lambda x: repr(encodeBase64(x.group(2), dirname)), sc.string)
    return sc.prettify()
def process_heading(heading: BeautifulSoup, textf: str, is_toplevel: bool, single_file: bool) -> TocItem:
    """
    Generate and return a TocItem from this heading.

    INPUTS:
    heading: a BeautifulSoup tag representing a heading tag
    textf: the path to the file
    is_toplevel: is this heading at the top-most level in the file?
    single_file: is there only one content file in the production (like some Poetry volumes)?

    OUTPUTS:
    a qualified TocItem object
    """
    toc_item = TocItem()
    parent_sections = heading.find_parents(["section", "article"])
    if parent_sections:
        toc_item.level = len(parent_sections)
    else:
        toc_item.level = 1

    try:
        toc_item.division = get_book_division(heading)
    except se.InvalidInputException:
        raise se.InvalidInputException(
            f"Couldn’t identify parent section in file: [path][link=file://{textf}]{textf}[/][/].")

    # This stops the first heading in a file from getting an anchor id, which we generally don't want.
    # The exceptions are things like poems within a single-file volume.
    toc_item.id = get_parent_id(heading)  # pylint: disable=invalid-name
    if toc_item.id == "":
        toc_item.file_link = textf
    else:
        if not is_toplevel:
            toc_item.file_link = f"{textf}#{toc_item.id}"
        elif single_file:
            # It IS the first heading in the file, but there's only a single content file?
            toc_item.file_link = f"{textf}#{toc_item.id}"
        else:
            toc_item.file_link = textf

    toc_item.lang = heading.get("xml:lang") or ""

    # A heading may include z3998:roman directly,
    # eg <h5 epub:type="title z3998:roman">II</h5>.
    attribs = heading.get("epub:type") or ""
    if "z3998:roman" in attribs:
        toc_item.roman = extract_strings(heading)
        toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        return toc_item

    process_heading_contents(heading, toc_item)

    return toc_item
def rawToFields(cls, raw={}):
    fields = super(SkypeLocationMsg, cls).rawToFields(raw)
    locTag = BeautifulSoup(raw.get("content"), "html.parser").find("location")
    # Exponent notation produces a float, meaning lat/long will always be floats too.
    fields.update({"latitude": int(locTag.get("latitude")) / 1e6,
                   "longitude": int(locTag.get("longitude")) / 1e6,
                   "altitude": int(locTag.get("altitude")),
                   "address": locTag.get("address"),
                   "mapUrl": locTag.find("a").get("href")})
    return fields
def evaluateLabel(contentJson):
    contentBase = BeautifulSoup(contentJson, "lxml")
    labels = contentBase.find_all('label')
    totalQuantity = len(labels)
    failed = 0
    good = 0
    badTags = []
    flag = 0
    numId = -1
    for label in labels:
        numId += 1
        labelNext = label.findNext()
        nextInput = BeautifulSoup(str(labelNext), 'lxml').find('input')
        labelAttrFor = label.get('for')
        labelPlaceholder = label.text
        if (labelPlaceholder):
            labelPlaceholder = re.sub(r"[ \t\n\x0B\f\r]", '', labelPlaceholder, flags=0)
        # INPUT type=text inside a blank LABEL - 100%
        if (not labelPlaceholder and nextInput):
            inputType = nextInput.get('type')
            if (inputType == "text"):
                flag = 1
                typeError = 1
        # INPUT type=text with a blank LABEL FOR - 100%
        elif (labelAttrFor and nextInput):
            inputType = nextInput.get('type')
            if (inputType == "text"):
                flag = 1
                typeError = 1
        else:
            good += 1
        if (flag == 1):
            failed += 1
            pos = position(contentBase, label)
            pos.append(numId)
            pos.append(typeError)
            badTags.append(pos)
            flag = 0
    dataResponse = {
        'tag': 'label',
        'totalTagsAnalyzed': totalQuantity,
        'totalTagsFailed': failed,
        'positionsBadTags': badTags
    }
    response = json.dumps({'status': 'success', 'data': dataResponse})
    return response
def findPages(uri, uris):
    # Recursively finds the links to each page.
    import requests
    from bs4 import BeautifulSoup
    link = BeautifulSoup(requests.get(uri).text, 'html.parser').find('a', text='Next')
    if link != None:
        uris.append('https://www.washingtonpost.com' + link.get('href'))
        findPages('https://www.washingtonpost.com' + link.get('href'), uris)
        return uris
    else:
        return []
def get_entry_img_url(cls, element: BeautifulSoup):
    if element.img is not None:
        element = element.img
    img_url = element.get("data-src", None)
    if img_url is None:
        img_url = element.get("src", None)
    if img_url is None:
        return None
    return cls.abs_url(img_url)
def chrollPostList(self, postListUrl):
    self.driver.movePage(postListUrl)
    postList = PostList(postListUrl)
    soup = BeautifulSoup(self.driver.getPageSource(), 'html.parser')
    trList = soup.find_all('tr', attrs={'class': 'ub-content us-post'})
    soup.get('')
    for trElement in trList:
        if len(trElement.find_all('b')) > 0:
            continue
        postList.addPost(self._initPost(trElement))
    self.logger.print('[done] ' + postListUrl + ' 게시글 목록 chrolling 완료')
    return postList
def rawToFields(cls, raw={}):
    fields = super(SkypeFileMsg, cls).rawToFields(raw)
    # BeautifulSoup converts tag names to lower case, and find() is case-sensitive.
    file = BeautifulSoup(raw.get("content"), "html.parser").find("uriobject")
    if file:
        fileFields = {"name": (file.find("originalname") or {}).get("v"),
                      "size": (file.find("filesize") or {}).get("v"),
                      "urlFull": file.get("uri"),
                      "urlThumb": file.get("url_thumbnail"),
                      "urlView": (file.find("a") or {}).get("href")}
        fields["file"] = SkypeFileMsg.File(**fileFields)
    return fields
def get_tags(article: BeautifulSoup) -> List[Optional[str]]:
    """
    Get tags from an article.

    Args:
        article (BeautifulSoup): the parsed article element.
    """
    article_tags = (
        article.get("data-tags").split(",") if article.get("data-tags") else []
    )
    article_tags = map(str.strip, article_tags)
    return [_ for _ in article_tags]
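# A small, self-contained sketch of the data-tags lookup get_tags relies on
# (the <article> markup here is made up for the example).
from bs4 import BeautifulSoup

article = BeautifulSoup(
    '<article data-tags="python, web scraping ,beautifulsoup"></article>',
    "html.parser",
).article
tags = [t.strip() for t in article.get("data-tags").split(",")]
print(tags)  # ['python', 'web scraping', 'beautifulsoup']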
def rawToFields(cls, raw={}):
    fields = super(SkypeLocationMsg, cls).rawToFields(raw)
    locTag = BeautifulSoup(raw.get("content"), "html.parser").find("location")
    for attr in ("latitude", "longitude", "altitude", "speed", "course"):
        fields[attr] = int(locTag.get(attr)) if locTag.get(attr) else None
    # Exponent notation produces a float, meaning lat/long will always be floats too.
    for attr in ("latitude", "longitude"):
        if fields[attr]:
            fields[attr] /= 1e6
    fields.update({"address": locTag.get("address"),
                   "mapUrl": locTag.find("a").get("href")})
    return fields
def should_skip_element(element: BeautifulSoup) -> bool:
    attributes = element.get_attribute_list('class')
    if element.get('data-no-tax-price') is None:
        return True
    if 'hidden' in attributes:
        return True
    if not is_number(element.get('data-no-tax-price')):
        return True
    return False
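# Illustration of the two lookups should_skip_element combines (markup is hypothetical).
# get_attribute_list() always returns a list, so the 'hidden' membership test is safe even
# when the class attribute is missing; get() returns None for absent attributes.
from bs4 import BeautifulSoup

el = BeautifulSoup('<span class="price hidden" data-no-tax-price="12.50"></span>',
                   "html.parser").span
print(el.get_attribute_list('class'))  # ['price', 'hidden']
print(el.get('data-no-tax-price'))     # '12.50'
print(el.get('missing-attr'))          # None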
def get_guanzhu(self, url):
    count = 1
    response = self.session.get(url)
    pages = re.search(r'(<div class=\\"W_pages\\">[\s\S]*?<\\/div>)', response.text).group(1)
    soup = BeautifulSoup(pages, 'lxml').find_all('a')[-1]
    next_page = None
    if '下一页' in str(soup):
        next_page = 'https://weibo.com' + soup.get('href').replace('\\"', '').replace('\\', '')
    uids = []
    guanzhu_li = re.findall(r'(<div class=\\"info_name[\s\S]*?>[\s\S]*?<\\/div>)', response.text)
    for i in guanzhu_li:
        soup = BeautifulSoup(i, 'lxml')
        a = soup.find('a', attrs={'class': '\\"S_txt1\\"'})
        userid = a.get('usercard')
        if userid:
            uids.append(re.search(r'id=(\d+)&', userid).group(1))
        else:
            uids.append(None)
    while next_page:
        count += 1
        # Sina only allows viewing the first five pages of the following list
        # unless the two users follow each other.
        if count > 5:
            break
        response = self.session.get(next_page)
        pages = re.search(r'(<div class=\\"W_pages\\">[\s\S]*?<\\/div>)', response.text).group(1)
        soup = BeautifulSoup(pages, 'lxml').find_all('a')[-1]
        next_page = None
        if '下一页' in str(soup) and soup.get('href'):
            next_page = 'https://weibo.com' + soup.get('href').replace('\\"', '').replace('\\', '')
        guanzhu_li = re.findall(r'(<div class=\\"info_name[\s\S]*?>[\s\S]*?<\\/div>)', response.text)
        for i in guanzhu_li:
            soup = BeautifulSoup(i, 'lxml')
            a = soup.find('a', attrs={'class': '\\"S_txt1\\"'})
            userid = a.get('usercard')
            if userid:
                uids.append(re.search(r'id=(\d+)&', userid).group(1))
            else:
                uids.append(None)
    print(len(uids))
    return uids
def rawToFields(cls, raw={}):
    fields = super(SkypeLocationMsg, cls).rawToFields(raw)
    locTag = BeautifulSoup(raw.get("content"), "html.parser").find("location")
    fields.update({
        # Exponent notation produces a float, meaning lat/long will always be floats too.
        "latitude": int(locTag.get("latitude")) / 1e6,
        "longitude": int(locTag.get("longitude")) / 1e6,
        "altitude": int(locTag.get("altitude")),
        "address": locTag.get("address"),
        "mapUrl": locTag.find("a").get("href")
    })
    return fields
def re_the_src(img_src):
    '''Extract the base64 payload, image type, height and width from the <img> tag in the string.'''
    # print(img_src)
    this_img = BeautifulSoup(img_src, features="lxml").find('img')
    src = str(this_img.get('src'))
    height = str(this_img.get('height'))
    width = str(this_img.get('width'))
    # print(src)
    b64_data = src.split("base64,")[1]
    image_type = src.split("base64,")[0].split("/")[1][:-1]
    return b64_data, image_type, height, width
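# A quick check of re_the_src with a hypothetical inline data-URI image
# (re_the_src parses with features="lxml", so lxml must be installed).
html = '<img src="data:image/png;base64,iVBORw0KGgo=" height="10" width="20">'
b64_data, image_type, height, width = re_the_src(html)
print(image_type, height, width)  # png 10 20
print(b64_data)                   # iVBORw0KGgo=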
def get_session(url):
    if "bestbuy.com" in url:
        # BeautifulSoup is faster and headless, so I prefer this method for compatible websites.
        with open('headers.json') as file:
            data = json.load(file)
        # Random header picker -- originally used as a clever way to prevent Amazon from blocking
        # the scrape, but that no longer works, so I switched to Selenium.
        header = data[choice(list(data.keys()))]
        r = requests.Session()
        response = r.get(url, headers=header)
        page = BeautifulSoup(response.content, 'html.parser')
    else:
        # Selenium is used for Target and Amazon.
        PATH = os.getcwd() + driver
        page = webdriver.Chrome(PATH)
        page.get(url)
    return page
def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
    """Get a single job attribute from a soup object by JobField
    TODO: impl div class=compactStars value somewhere.
    """
    if parameter == JobField.TITLE:
        # TODO: we should instead get what the user sees in the <span>
        return soup.get('data-normalize-job-title')
    elif parameter == JobField.COMPANY:
        return soup.find(
            'div', attrs={'class', 'jobInfoItem jobEmpolyerName'}
        ).text.strip()
    elif parameter == JobField.LOCATION:
        return soup.get('data-job-loc')
    # FIXME: impl.
    # elif parameter == JobField.TAGS:
    #     labels = soup.find_all('div', attrs={'class', 'jobLabel'})
    #     if labels:
    #         return [
    #             l.text.strip() for l in labels if l.text.strip() != 'New'
    #         ]
    #     else:
    #         return []
    # FIXME: impl JobField.REMOTE
    elif parameter == JobField.POST_DATE:
        return calc_post_date_from_relative_str(
            soup.find(
                'div', attrs={
                    'class': 'd-flex align-items-end pl-std css-mi55ob'
                }
            ).text.strip()
        )
    elif parameter == JobField.WAGE:
        # NOTE: most jobs don't have this, so we won't raise a warning here
        # and will fail silently instead.
        wage = soup.find('span', attrs={'class': 'gray salary'})
        if wage is not None:
            return wage.text.strip()
        else:
            return ''
    elif parameter == JobField.KEY_ID:
        return soup.get('data-id')
    elif parameter == JobField.URL:
        part_url = soup.find(
            'div', attrs={'class', 'logoWrap'}
        ).find('a').get('href')
        return (
            f'https://www.glassdoor.{self.config.search_config.domain}'
            f'{part_url}'
        )
    else:
        raise NotImplementedError(f"Cannot get {parameter.name}")
def getFirstTitle():
    # Fetch the page listing the latest topics.
    response = requests.get('https://testerhome.com/topics/last')
    response.encoding = 'utf-8'
    # Find the <a> tag that holds the title.
    a = BeautifulSoup(response.text, 'lxml') \
        .find("div", class_="panel-body item-list") \
        .find("div", class_="title media-heading") \
        .find('a')
    # Store the title and href in a dict.
    firstContent = {}
    firstContent['title'] = a.get('title')
    firstContent['href'] = a.get('href')
    # Return both values.
    return firstContent
def get_voters(self):
    """Iterate over the users who upvoted this answer.

    return: each voter's display name and profile URL
    rtype: (username, url) Iterable
    """
    get_url = None
    while True:
        if get_url == "":
            break
        voter_soup = self.get_voter_page(get_url)
        get_url = ZHI_HU_URL + voter_soup.json()['paging']['next']
        for item in voter_soup.json()['payload']:
            soup = BeautifulSoup(item, "lxml").find('a')
            yield (soup.get('title'), ZHI_HU_URL + soup.get('href'))
def __init__(self):
    super(GoogleSignIn, self).__init__('google')
    # googleinfo = urllib2.urlopen('https://accounts.google.com/.well-known/openid-configuration')
    # google_params = json.load(googleinfo)
    res = requests.get('https://accounts.google.com/.well-known/openid-configuration')
    google_params = BeautifulSoup(res.text, "html.parser")
    google_params = json.loads(google_params.text)
    self.service = OAuth2Service(
        name='google',
        client_id=self.consumer_id,
        client_secret=self.consumer_secret,
        authorize_url=google_params.get('authorization_endpoint'),
        base_url=google_params.get('userinfo_endpoint'),
        access_token_url=google_params.get('token_endpoint'))
def getView(self, url):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "jw.hzau.edu.cn",
        "Pragma": "no-cache",
        "Referer": "http://jw.hzau.edu.cn/xs_main.aspx?xh=2013307201006",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
    }
    viewstate = None
    req = urllib.request.Request(url, headers=headers)
    while viewstate is None:
        try:
            html = urllib.request.urlopen(req, timeout=3).read().decode("gbk")
            soup = BeautifulSoup(html, "html.parser")
            try:
                soup = soup.find_all(attrs={"name": "__VIEWSTATE"})[0]
                viewstate = soup.get("value")
            except:
                print("get response but cant get VIEWSTATE")
                continue
        except:
            print("get available course table has not accepted response")
            continue
    return viewstate
def videos(url):
    response = get_html(url)
    if response == False:
        xbmcgui.Dialog().notification(name, 'No Episodes Available', defaultimage, 5000, False)
        sys.exit()
        xbmc.log('PAGE NOT FOUND')
    soup = BeautifulSoup(response, 'html.parser').find_all('div', {'class': 'cattombstone'})
    xbmc.log('SOUP: ' + str(len(soup)))
    nxt = BeautifulSoup(response, 'html.parser').find_all('a', {'class': 'nextpostslink'})
    xbmc.log('NEXT: ' + str(len(nxt)))
    if len(nxt) > 0:
        nxt = nxt[0]
        nurl = nxt.get('href')
        xbmc.log('NURL: ' + str(nurl))
    for item in soup:
        title = striphtml(str(item.find('a')))
        thumbnail = item.find('img')['src']
        url = item.find('a')['href']
        purl = ('plugin://plugin.video.bnwmovies?mode=637&url=' + url + "&name=" +
                urllib.quote_plus(title) + "&iconimage=" + urllib.quote_plus(thumbnail))
        li = xbmcgui.ListItem(title, iconImage=thumbnail, thumbnailImage=thumbnail)
        li.setProperty('fanart_image', defaultfanart)
        li.addContextMenuItems([
            ('Download File', 'XBMC.RunPlugin(%s?mode=80&url=%s)' % (sys.argv[0], url)),
            ('Plot Info', 'XBMC.RunPlugin(%s?mode=81&url=%s)' % (sys.argv[0], url))
        ])
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=purl, listitem=li)
    xbmcplugin.setContent(addon_handle, 'movies')
    if len(nxt) > 0:
        add_directory2('Next Page', nurl, 636, defaultfanart, defaultimage, plot='Next Page')
    xbmcplugin.endOfDirectory(addon_handle)
def getView(self, url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'jw.hzau.edu.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://jw.hzau.edu.cn/xs_main.aspx?xh=2013307201006',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }
    viewstate = None
    req = urllib.request.Request(url, headers=headers)
    while viewstate is None:
        try:
            html = urllib.request.urlopen(req, timeout=3).read().decode('gbk')
            soup = BeautifulSoup(html, 'html.parser')
            try:
                soup = soup.find_all(attrs={'name': '__VIEWSTATE'})[0]
                viewstate = soup.get('value')
            except:
                print('get response but cant get VIEWSTATE')
                continue
        except:
            print('get schedule has not accepted response')
            continue
    return viewstate
def get_playlist(self, playlist_id):
    self.view_capture(int(playlist_id))
    url = default.playlist_api.format(playlist_id)
    s = requests.session()
    s = BeautifulSoup(
        s.get(url, headers=self.__headers).content, "html.parser")
    playlist = json.loads(s.text)['result']
    print("《" + playlist['name'].encode('utf-8') + "》")
    author = playlist['creator']['nickname'].encode('utf-8')
    pc = str(playlist['playCount'])
    sc = str(playlist['subscribedCount'])
    rc = str(playlist['shareCount'])
    cc = str(playlist['commentCount'])
    print("维护者:{} 播放:{} 关注:{} 分享:{} 评论:{}".format(author, pc, sc, rc, cc))
    print("描述:{}".format(playlist['description'].encode('utf-8')))
    print("标签:{}".format(",".join(playlist['tags']).encode("utf-8")))
    tb = [["ID", "歌曲名字", "艺术家", "唱片"]]
    for music in playlist['tracks']:
        artists = []
        for s in music['artists']:
            artists.append(s['name'])
        ms = music['name'].encode("utf-8")
        ar = ",".join(artists).encode("utf-8")
        ab = music['album']['name'].encode("utf-8")
        id = music['id']
        tb.append([id, ms, ar, ab])
    print(AsciiTable(tb).table)
def sendCreds(self, user, pwd, params):
    # Now pass the login credentials over.
    loginResp = self.conn(
        "POST", "{0}/ppsecure/post.srf".format(SkypeConnection.API_MSACC),
        params={
            "wa": "wsignin1.0",
            "wp": "MBI_SSL",
            "wreply": "https://lw.skype.com/login/oauth/proxy?client_id=578134&site_name="
                      "lw.skype.com&redirect_uri=https%3A%2F%2Fweb.skype.com%2F"
        },
        cookies={
            "MSPRequ": params["MSPRequ"],
            "MSPOK": params["MSPOK"],
            "CkTst": str(int(time.time() * 1000))
        },
        data={
            "login": user,
            "passwd": pwd,
            "PPFT": params["PPFT"]
        })
    tField = BeautifulSoup(loginResp.text, "html.parser").find(id="t")
    if tField is None:
        err = re.search(r"sErrTxt:'([^'\\]*(\\.[^'\\]*)*)'", loginResp.text)
        errMsg = "Couldn't retrieve t field from login response"
        if err:
            errMsg = re.sub(r"<.*?>", "", err.group(1)).replace("\\'", "'").replace("\\\\", "\\")
        raise SkypeAuthException(errMsg, loginResp)
    return tField.get("value")
def view_capture(self, page, type="全部"):
    s = requests.session()
    play_url = self.__play_url.format(type, page * 35)
    try:
        acmsk = {'class': 'msk'}
        scnb = {'class': 'nb'}
        dcu = {'class': 'u-cover u-cover-1'}
        ucm = {'class': 'm-cvrlst f-cb'}
        s = BeautifulSoup(
            s.get(play_url, headers=self.__headers).content, "html.parser")
        lst = s.find('ul', ucm)
        for play in lst.find_all('div', dcu):
            title = play.find('a', acmsk)['title'].encode('utf-8')
            link = play.find('a', acmsk)['href'].encode('utf-8').replace(
                "/playlist?id=", "")
            cnt = play.find('span', scnb).text.encode('utf-8').replace(
                '万', '0000')
            if pysql.single("playlist163", "link", link) is True:
                pl = pysql.Playlist163(title=title, link=link, cnt=int(cnt),
                                       dsc="曲风:{}".format(type))
                self.session.add(pl)
                self.session.commit()
    except Exception as e:
        pylog.log.error("抓取歌单出现问题:{} 歌单类型:{} 页码:{}".format(e, type, page))
        raise
def _extract_upload_policies_asset_authenticity_token(
    self,
    html: str,
) -> Optional[str]:
    file_attachment_tag = BeautifulSoup(
        html,
        features='lxml',
    ).find(
        'file-attachment',
    )
    if not file_attachment_tag:
        return None

    authenticity_token = file_attachment_tag.get(
        'data-upload-policy-authenticity-token', None)
    if authenticity_token is None:
        csrf_policy_input = file_attachment_tag.find(
            'input',
            {
                'class': 'js-data-upload-policy-url-csrf',
            },
        )
        if csrf_policy_input is not None:
            authenticity_token = csrf_policy_input.get('value', None)

    return cast(str, authenticity_token)
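# Standalone sketch of the two-step lookup above (the sample markup is invented for
# illustration): prefer the data attribute on <file-attachment>, otherwise fall back to
# the hidden CSRF input.
from bs4 import BeautifulSoup

html = '''
<file-attachment>
  <input type="hidden" class="js-data-upload-policy-url-csrf" value="token-from-input">
</file-attachment>
'''
tag = BeautifulSoup(html, 'html.parser').find('file-attachment')
token = tag.get('data-upload-policy-authenticity-token')
if token is None:
    csrf_input = tag.find('input', {'class': 'js-data-upload-policy-url-csrf'})
    token = csrf_input.get('value') if csrf_input is not None else None
print(token)  # token-from-input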
def get_news(link):
    length = len(user_agents)
    index = random.randint(0, length - 1)
    user_agent = user_agents[index]
    headers = {
        'Referer': 'http://www.jrzj.com',
        'Host': 'www.jrzj.com',
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    bsObj = requests.session()
    bsObj = BeautifulSoup(bsObj.get(link, headers=headers).content, 'html.parser')
    title = bsObj.h1.get_text()
    # print('Title:', title)
    tags_list = bsObj.find('meta', {'name': 'keywords'}).attrs['content']
    l = re.split(',', tags_list)
    tags = [item for item in filter(lambda x: x != '', l)]
    # print('Tags:', tags)
    category = bsObj.title.get_text().split('_')[1]
    # print('Category:', category)
    # content = bsObj.find('div', {'class': 'news_content'}).prettify()
    content = bsObj.find('div', {'class': 'news_content'})
    # print('Content:', content)
    # Find the article image.
    a_tag = content.find('img')
    # print(a_tag)
    image_url = a_tag.attrs['src']
    image_name = os.path.basename(image_url).split('!')[0]
    # Download the image.
    get_image(image_url, image_name)
    # Remove the <img> tag from the content.
    a_tag.extract()
    news = News(title, tags, category, content.prettify(), image_name)
    return news
def get_music(Songlistid, id, tables):
    songslist_id = id
    url = 'https://music.163.com/playlist?id=' + str(songslist_id)
    insert_list = list()
    s = requests.session()
    s = BeautifulSoup(s.get(url, headers=header).content, 'lxml')
    id_list = s.select('ul.f-hide li a')
    # Look up the playlist record.
    joe = SongsList.objects.get(id=Songlistid)
    i = 1
    previous_songs_num = len(Songs.objects.all())
    for songs in id_list:
        name = songs.text
        id = songs.get('href')[9:]
        url = 'http://music.163.com/song/media/outer/url?id=' + id
        insert_list.append(tables(song_id=id, name=name, url=url))
        i += 1
    tables.objects.bulk_create(insert_list)
    print(f"成功创建{i}个数据表")
    SONGS = Songs.objects.all()[previous_songs_num:]
    for song in SONGS:
        song.songslist.add(joe)
    print(f"{tables}{Songlistid}数据更新完毕")
def fetch_individual_page(self, isbn: str) -> BeautifulSoup:
    """Fetch the HTML of the individual book page for the given ISBN."""
    # Run the search.
    params = {
        "detailFlg": 1,
        "isbn": isbn,
        "seldt": r"2023%2Fall%2Fall%2Fbefore",
        "srchf": 1,
        "store": 1
    }
    # Go from the search results to the individual page.
    soup = BeautifulSoup(self._fetch_html(self.url, params=params), features="html.parser")
    dytitle = soup.find("a", class_="dyTitle")
    if dytitle is None:
        # If searching only e-books gets no hit, include paper books in the search as well.
        dytitle = BeautifulSoup(self._fetch_html(self.extended_url, params=params),
                                features="html.parser").find("a", class_="dyTitle")
    if dytitle is None:
        # Still no hit.
        raise HontoDoesNotHaveDataError(f"Honto does not have the book data. {isbn=}")
    individual_page_url = dytitle.get("href")  # With multiple hits, take the first one.
    soup = BeautifulSoup(self._fetch_html(individual_page_url), features="html.parser")
    return soup
def view_capture(self, link):
    self.session.query(pysql.Playlist163).filter(
        pysql.Playlist163.link == link).update({'over': 'Y'})
    url = self.__url + str(link)
    s = requests.session()
    try:
        s = BeautifulSoup(
            s.get(url, headers=self.__headers).content, "html.parser")
        musics = json.loads(s.text)['result']['tracks']
        exist = 0
        for music in musics:
            name = music['name'].encode('utf-8')
            author = music['artists'][0]['name'].encode('utf-8')
            if pysql.single("music163", "song_id", (music['id'])) == True:
                self.session.add(
                    pysql.Music163(song_id=music['id'], song_name=name, author=author))
                self.session.commit()
                exist = exist + 1
            else:
                pylog.log.info('{} : {} {}'.format("重复抓取歌曲", name, "取消持久化"))
        print("歌单包含歌曲 {} 首,数据库 merge 歌曲 {} 首 \r\n".format(len(musics), exist))
    except Exception:
        pylog.log.error('{} : {}'.format("抓取歌单页面存在问题", url))
        raise
def get_mryb_text():
    # Get the link to the latest 每日读报时间 article.
    newAticle = requests.get("http://weixin.sogou.com/weixin?query=每日读报时间")
    newAticle_aflag = BeautifulSoup(
        newAticle.text, "html.parser").find(uigs="account_article_0")
    mryb_link = newAticle_aflag.get('href')
    # Fetch the HTML of the latest article.
    mryb = requests.get(mryb_link)
    aticle_soup = BeautifulSoup(mryb.text, "html.parser")
    # Scrape the article title.
    aticle_title = aticle_soup.find(id="activity-name")
    # Scrape the article content.
    aticle_content = aticle_soup.find("div", id="js_content")
    # lstrip(), strip() and rstrip() can be used to trim characters (whitespace by default).
    text = ''
    # Page title.
    for string in aticle_title.stripped_strings:
        text += string
        text += '\n'
    # Page content.
    for string in aticle_content.stripped_strings:
        text += string
        text += '\n'
    return text
def parse_element_message(driver, element_message, id_user_write, id_user_read):
    html = element_message.get_attribute('outerHTML')
    bs_obj = BeautifulSoup(html, 'html.parser')
    message_type = HelperMessage.get_message_type(driver, bs_obj)
    # print(message_type)
    # message_source =  # class, 'message-out'
    if message_type == 'message_text':
        message = HelperMessage.parse_message_text(driver, bs_obj)
    elif message_type == 'message_file':
        message = HelperMessage.parse_message_file(driver, bs_obj)
    elif message_type == 'message_audio':
        message = HelperMessage.parse_message_audio(driver, bs_obj)
    elif message_type == 'message_image':
        message = HelperMessage.parse_message_image(driver, bs_obj)
    elif message_type == 'message_video':
        message = HelperMessage.parse_message_video(driver, bs_obj)
    else:
        raise RuntimeError('unknown message type')
    # Filling in the blanks.
    if not 'id_user' in message:
        # print('add id_user')
        if 'message-out' in bs_obj.get('class'):
            message['id_user'] = id_user_write
        else:
            message['id_user'] = id_user_read
    if not 'timestamp' in message:
        message['timestamp'] = -1
    return message
def item_enclosure_url(self, item):
    """Returns an image for enclosure"""
    if item.image:
        url = item.image.url
    else:
        img = BeautifulSoup(item.html_content).find('img')
        url = img.get('src') if img else None
    return urljoin(self.site_url, url) if url else None
def item_enclosure_url(self, item):
    """Returns an image for enclosure"""
    if item.image:
        url = item.image.url
    else:
        img = BeautifulSoup(item.html_content).find("img")
        url = img.get("src") if img else None
    self.cached_enclosure_url = url
    return urljoin(self.site_url, url) if url else None
def getAvatarByUrl(url):
    try:
        res = getContentByUrl(url)
        soup = BeautifulSoup(res)
        soup = soup.find('div', attrs={'class': 'avatar'}).find('img')
        url = 'http://www.oiegg.com/' + soup.get('src')
        return url
    except:
        return 'http://www.oiegg.com/images/avatars/noavatar.gif'
def wrap(html):
    soup = BeautifulSoup(html).find('div', attrs={"node-type": "feed_merge_list_item"})
    weiboId = soup.get('mid')
    contentNode = soup.find('span', attrs={"node-type": "feed_list_forwardContentAgg"})
    userName = contentNode.get('nick-name')
    content = contentNode.get_text()
    return Forward(weiboId, userName, content)
def item_enclosure_url(self, item):
    """
    Return an image for enclosure.
    """
    if item.image:
        url = item.image.url
    else:
        img = BeautifulSoup(item.html_content, 'html.parser').find('img')
        url = img.get('src') if img else None
    self.cached_enclosure_url = url
    return urljoin(self.site_url, url) if url else None
def _save_details(self, details):
    self.downloads = int(details[0].split()[0])
    self.views = int(details[1].split()[0])
    link = Soup(details[4]).find('a').get('href')
    self.link = str(link)
    tag = Soup(details[5]).find('a')
    teacher_name = str(tag.text)
    teacher_page = str(tag.get('href'))
    self.teacher = Teacher(name=teacher_name, page=teacher_page)
def _populate_latest(self):
    """
    Populate version data for the latest release available for download
    """
    if self.license is None:
        self.log.debug('No license specified, not retrieving latest version information')
        return

    # Submit a request to the client area
    response = self.session.get(self.license.license_url)
    self.log.debug('Response code: %s', response.status_code)
    response.raise_for_status()

    # Load our license page
    soup = BeautifulSoup(response.text, "html.parser")
    script_tpl = soup.find('script', id='download_form')
    form = BeautifulSoup(script_tpl.text, "html.parser").find('form')

    # Parse the response for a download link to the latest IPS release
    version = Version(form.find('label', {'for': 'version_latest'}).text)
    self.log.info('Latest IPS version: %s', version.vstring)
    url = form.get('action')

    # Parse the response for a download link to the latest development release
    try:
        dev_version = Version(form.find('label', {'for': 'version_dev'}).text)
        if dev_version:
            self.log.info('Latest IPS development version: %s', version.vstring)
            dev_url = form.get('action')
            self.dev_version = IpsMeta(self, dev_version,
                                       request=('post', dev_url, {'version': 'latestdev'}),
                                       dev=True)
    except AttributeError:
        self.log.info('No development release available for download')

    # If we have a cache for this version, just add our url to it
    if version.vtuple in self.versions:
        self.log.debug('Latest IPS version already downloaded, applying URL to cache entry')
        self.versions[version.vtuple].request = ('post', url, {'version': 'latest'})
        return

    self.versions[version.vtuple] = IpsMeta(self, version,
                                            request=('post', url, {'version': 'latest'}))
def sendToken(self, token):
    # Send the existing token over.
    loginResp = self.conn("GET", "{0}/login".format(SkypeConnection.API_LOGIN),
                          params={"client_id": "578134",
                                  "redirect_uri": "https://web.skype.com"},
                          cookies={"refresh-token": token})
    tField = BeautifulSoup(loginResp.text, "html.parser").find(id="t")
    if tField is None:
        err = re.search(r"sErrTxt:'([^'\\]*(\\.[^'\\]*)*)'", loginResp.text)
        errMsg = "Couldn't retrieve t field from login response"
        if err:
            errMsg = re.sub(r"<.*?>", "", err.group(1)).replace("\\'", "'").replace("\\\\", "\\")
        raise SkypeAuthException(errMsg, loginResp)
    return tField.get("value")
def get_product_details(product):
    http = urllib3.PoolManager()
    response = http.request('GET', product.get('url'))
    if response.status != 200:
        print("requesting url failed with code: {0}".format(response.status))
        return None
    else:
        dom_img = BeautifulSoup(response.data).find('a', {'class': 'zoom-img1'})
        dom_price = BeautifulSoup(response.data).find('div', {'itemprop': 'price'})
        product['details'] = {
            'price': dom_price.text.strip(),
            'image_href': dom_img.get('href'),
            'image_title': dom_img.get('title'),
            'content_info': {}
        }
        for content_info in BeautifulSoup(response.data).findAll('div', {'class': 'cnt-info'}):
            content_parts = content_info.text.split(':')
            if len(content_parts) > 1:
                info_label = content_parts[0].strip().split('\n')[0]
                info_value = content_parts[1].strip()
                product['details']['content_info'][info_label] = info_value
        return product
def item_enclosure_url(self, item):
    """
    Return an image for enclosure.
    """
    try:
        url = item.image.url
    except (AttributeError, ValueError):
        img = BeautifulSoup(item.html_content, 'html.parser').find('img')
        url = img.get('src') if img else None
    self.cached_enclosure_url = url
    if url:
        url = urljoin(self.site_url, url)
        if self.feed_format == 'rss':
            url = url.replace('https://', 'http://')
    return url
def deserialize(self, world_xml):
    # noinspection PyBroadException
    try:
        world_xml = BeautifulSoup(world_xml, features="xml").world
        res_world = World(random.randint)
        res_world.setN(int(world_xml.get('size')))
        org_mapper = MapOrganism()
        for org in world_xml.findAll('organism'):
            tmp = org_mapper.deserialize(org)
            if tmp is not None:
                res_world.add_organism(tmp)
        return res_world
    except:
        print("XML file corrupted")
        return None
def run(self):
    while True:
        try:
            poem_url = self.queue.get(False)
            poem_page = urllib.urlopen(poem_url).read()
            # SoupStrainer
            poetry_soup = BeautifulSoup(poem_page).find('div', class_='poem')
            title = poetry_soup.get('data-text')
            poem = poetry_soup.p.get_text(strip=True)
            body = poem
            self.complete.acquire()
            self.complete.poems.append([title, body])
            self.complete.release()
            self.queue.task_done()
        except Queue.Empty:
            break
def main():
    linkdict = get_tiku()  # Get the list of all courses on the home page.
    for name, links in linkdict.items():
        for link in links:
            if link is not None:
                soup = BeautifulSoup("%s" % link).find("a")
                link = soup.get("href")  # URL of a single course.
                title = soup.text  # Title of a single course, e.g. 第1章 集合与函数概念
                types = get_types(link)  # e.g. [(选择题, http://www.tikubaba.com/class-69-1.html), ...]
                for tp in types:
                    type_page = tp[1].split("-")[0] + "-" + tp[1].split("-")[1] + "-%s.html"
                    page_num = get_pages(tp[1])
                    for page in range(page_num):
                        for url in get_page_items(type_page % str(page + 1)):
                            # import pdb; pdb.set_trace()
                            result = get_details(url)  # Answer for a single question.
def login():
    loginURL = "http://jw.hzau.edu.cn/default2.aspx"
    checkURL = "http://jw.hzau.edu.cn/default2.aspx"
    codeURL = 'http://jw.hzau.edu.cn/CheckCode.aspx'
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
    f = urllib.request.urlopen(loginURL)
    html = f.read().decode('gbk')
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find_all(attrs={'name': '__VIEWSTATE'})[0]
    viewstate = soup.get('value')
    f = urllib.request.urlopen(codeURL)
    path = 'D:\\aaaaa\\code.gif'
    fl = open(path, 'wb')
    fl.write(f.read())
    fl.close()
    img_code = input("please input code: ")
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'jw.hzau.edu.cn',
        'Origin': 'http://jw.hzau.edu.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://jw.hzau.edu.cn/default2.aspx',
        'Upgrade-Insecure-Requests': 1,
        'User-Agent': user_agent,
    }
    form = {
        '__VIEWSTATE': viewstate,
        'txtUserName': '******',
        'TextBox2': 'qq520.1314',
        'txtSecretCode': img_code,
        'RadioButtonList1': '学生',
        'Button1': '',
        'lbLanguage': '',
        'hidPdrs': '',
        'hidsc': '',
    }
    post_data = urllib.parse.urlencode(form).encode(encoding='utf-8')
    req = urllib.request.Request(checkURL, post_data, headers)
    f = urllib.request.urlopen(req)
def download_file(self, book_url: str):
    # Pass the book URL to the request helper, which returns a response.
    html = self.request(book_url)
    a_tag_list = BeautifulSoup(html.text, 'lxml').find_all('a', class_='current')
    for a_tag in a_tag_list:
        down_url = "http://www.en8848.com.cn/" + a_tag.get('href')
        html = self.request(down_url)
        title = BeautifulSoup(html.text, 'lxml').title.contents[0]
        title = str(title).replace("/", "-")
        a_tag_file = BeautifulSoup(html.text, 'lxml').find('a', id='dload')
        if a_tag_file is not None:
            file_url = "http://www.en8848.com.cn/e/DownSys/" + a_tag_file.get('href')[3:]
            print(title + " : " + file_url)
            html = self.request(file_url)
            f = open('/home/laomie/temp/enbooks/' + title + '.rar', 'ab')
            f.write(html.content)
            f.close()
def getViewAndCode(self):
    while self.viewstate is None:
        try:
            html = urllib.request.urlopen(self.loginAndCheckUrl, timeout=3).read().decode("gbk")
            soup = BeautifulSoup(html, "html.parser")
            soup = soup.find_all(attrs={"name": "__VIEWSTATE"})[0]
            self.viewstate = soup.get("value")
        except:
            print("the login.aspx has not response")
            continue
    while self.img_code is None:
        fl = open(self.codeImg, "wb")
        try:
            fl.write(urllib.request.urlopen(self.codeURL, timeout=3).read())
        except:
            print("the code.aspx has not response")
            continue
        finally:
            fl.close()
        self.img_code = input("please input code: ")  # must wait for fl.close()
def start_requests(self):
    print 'Preparing login...'
    # xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
    # print xsrf
    # FormRequest.from_response is a function of Scrapy, to post data.
    self.driver.get('https://www.douban.com/login')
    captcha_url = BeautifulSoup(self.driver.page_source, 'lxml').find(id='captcha_image')
    if captcha_url:
        captcha_url = captcha_url.get('src')
        code = requests.get(captcha_url)
        with open('/Users/shichangtai/Desktop/douban/code.jpg', 'wb') as f:
            f.write(code.content)
        captcha = raw_input('请输入图中的验证码:')
        captcha_field = self.driver.find_element_by_id("captcha_field")
        captcha_field.send_keys(captcha)
    username = self.driver.find_element_by_id("email")
    password = self.driver.find_element_by_id("password")
    username.send_keys("******")
    password.send_keys("******")
    self.driver.find_element_by_name("login").click()
    # return [FormRequest.from_response]
    # Some pages cannot display without logging in.
    if self.start_urls:
        print self.start_urls
        print '------------Review crawl mode...----------------'
        self.driver.get(self.start_urls)
        review_url = BeautifulSoup(self.driver.page_source, 'lxml') \
            .find(id='comments-section').find('span', class_='pl').a['href']
        while (1):
            yield Request(review_url, self.parse_review, headers=self.headers,
                          errback=self.errback_review)
            self.driver.get(review_url)
            if BeautifulSoup(self.driver.page_source, 'lxml').find(id='paginator') \
                    .find_all('a')[-1].get_text() != u'\u540e\u9875 >':
                break
            review_url = self.start_urls + '/comments' + \
                BeautifulSoup(self.driver.page_source, 'lxml') \
                .find(id='paginator').find_all('a')[-1]['href']
    else:
        print '---------------Top250 crawl mode...---------------'
        for i in range(10):
            temp_url = 'https://movie.douban.com/top250?start=%s&filter=' % (i * 25)
            yield Request(temp_url, self.parse, headers=self.headers)
def getView():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'jw.hzau.edu.cn',
        'Origin': 'http://jw.hzau.edu.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://jw.hzau.edu.cn/xs_main.aspx?xh=2013307201006',
        'Upgrade-Insecure-Requests': 1,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }
    xkURL = 'http://jw.hzau.edu.cn/xf_xsqxxxk.aspx?xh=2013307201006&xm=%B3%CC%CA%E9%D2%E2&gnmkdm=N121111'
    req = urllib.request.Request(xkURL, headers=headers)
    f = urllib.request.urlopen(req)
    html = f.read().decode('gbk')
    print(html)
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find_all(attrs={'name': '__VIEWSTATE'})[0]
    viewstate = soup.get('value')
    return viewstate
def rawToFields(cls, raw={}):
    fields = super(SkypeCallMsg, cls).rawToFields(raw)
    listTag = BeautifulSoup(raw.get("content"), "html.parser").find("partlist")
    fields.update({"state": {"started": cls.State.Started,
                             "ended": cls.State.Ended}[listTag.get("type")],
                   "userIds": [], "userNames": []})
    for partTag in listTag.find_all("part"):
        fields["userIds"].append(partTag.get("identity"))
        fields["userNames"].append(partTag.find("name").text)
    return fields
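# Standalone illustration of the <partlist> parsing above, with a made-up content payload
# in the shape SkypeCallMsg expects (a type attribute plus one <part> per participant).
from bs4 import BeautifulSoup

content = ('<partlist type="started">'
           '<part identity="8:alice"><name>Alice</name></part>'
           '<part identity="8:bob"><name>Bob</name></part>'
           '</partlist>')
listTag = BeautifulSoup(content, "html.parser").find("partlist")
print(listTag.get("type"))                                             # 'started'
print([part.get("identity") for part in listTag.find_all("part")])     # ['8:alice', '8:bob']
print([part.find("name").text for part in listTag.find_all("part")])   # ['Alice', 'Bob']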
loginUrl = "https://www.facebook.com/login.php?login_attempt=1&lwv=110"
# req = urllib2.Request(hosturl)
soup = BeautifulSoup('<input class="boldest" Extremely bold >').input
print soup.name
logResult = urllib2.urlopen(hosturl, timeout=1000).read()
soup = BeautifulSoup(logResult)
# print soup.select("#login_form input")
logInfo = {
    "email": "*****@*****.**",
    "pass": "******"
}
for i in soup.select("#login_form input"):
    print i
    tag = BeautifulSoup(str(i)).input
    if tag.get("name") and tag.get("value"):
        logInfo[tag['name']] = tag.get("value")
        print tag['name'] + "\t" + str(tag.get("value"))
postData = urllib.urlencode(logInfo)
print postData
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36",
    "Host": "202.203.222.202",
    "content-type": "application/x-www-form-urlencoded",
    "referer": "https://www.facebook.com/?stype=lo&jlou=AfeSfaeiFvsh5xaSSLIuKbHS0F-1AdSjGCnU5GOP8CBizaJ4XrXli8EJH3r_Ws4vPMeDAHJhu_D63lhaUKVQke3GIbC-kcxT1244kgOo7mcROw&smuh=41755&lh=Ac_3mzPw-4ESY0H_",
    "Upgrade-Insecure-Requests": 1
}
def check_best_practices(domain, protocol):
    print '<h2>General SEO Best Practices</h2><ul>',

    # check for robots.txt
    url = protocol + '://' + domain + '/robots.txt'
    r = requests.get(url, allow_redirects=False, verify=False)
    if r.status_code == 200:
        print '<li class="good">Robots.txt: Found</li>',
    else:
        print '<li class="bad">Robots.txt: Not Found</li>',

    # check for sitemap.xml
    url = protocol + '://' + domain + '/sitemap.xml'
    r = requests.get(url, allow_redirects=False, verify=False)
    if r.status_code == 200:
        print '<li class="good">sitemap.xml: Found</li>',
    else:
        print '<li class="bad">sitemap.xml: Not Found</li>',

    # check for sitemap.xml.gz
    url = protocol + '://' + domain + '/sitemap.xml.gz'
    r = requests.get(url, allow_redirects=False, verify=False)
    if r.status_code == 200:
        print '<li class="good">sitemap.xml.gz: Found</li>',
    else:
        print '<li class="bad">sitemap.xml.gz: Not Found</li>',

    # check for sitemap.gz
    url = protocol + '://' + domain + '/sitemap.gz'
    r = requests.get(url, allow_redirects=False, verify=False)
    if r.status_code == 200:
        print '<li class="good">sitemap.gz: Found</li>',
    else:
        print '<li class="bad">sitemap.gz: Not Found</li>',

    # fetch the home page response for the rest of the analysis
    url = protocol + '://' + domain + '/'
    r = requests.get(url, allow_redirects=False, verify=False)
    soup = BeautifulSoup(r.content, 'lxml')

    # check for responsive setup in <meta> tag
    meta_viewport = False
    for meta in soup.find_all('meta'):
        if meta.get('name') == 'viewport':
            meta_viewport = True
            print '<li class="good">Meta-Viewport found: ' + meta.get('content') + '</li>',
    if meta_viewport == False:
        print '<li class="bad">Meta-Viewport is not defined</li>',

    # check for unicode content type
    meta_unicode = False
    if 'Content-Type' in r.headers and r.headers['Content-Type'].find('charset') > -1:
        print '<li class="good">Character Encoding detected in response headers.</li>',
        meta_unicode = True
    else:
        for meta in soup.find_all('meta'):
            if meta.get('charset'):
                meta_unicode = True
                print '<li class="good">Character Encoding detected in <meta> tags</li>',
    if meta_unicode == False:
        print '<li class="bad">Charset definition not found.</li>',

    # check for language setting on the <html> tag
    if soup.html and soup.html.get('lang'):
        print '<li class="good">Language details specified in <html> tag.</li>',
    else:
        print '<li class="bad">Language is unspecified in <html> tag.</li>',

    # check for Vary: User-Agent
    if 'Vary' in r.headers and r.headers['Vary'].find('User-Agent') > -1:
        print '<li class="good"><i>Vary: User-Agent</i> header detected.</li>',
    else:
        print '<li class="bad"><i>Vary: User-Agent</i> header not detected.</li>',

    print '</ul>',