def open(self, book_id=None):
    """Open the epub archive for the given book id and load its metadata.

    Sets self.size (human-readable size in MiB), self.f (open ZipFile),
    self.oebps_folder, self.chapters (list of (label, src) tuples) and
    self.cover_href.

    :param book_id: optional book id; falls back to self.book_id.
    :raises Exception: if no book id is available.
    """
    if book_id:
        self.book_id = book_id
    if not self.book_id:
        raise Exception('Book id not set')
    # File size in MiB with one decimal; anything that rounds to 0.0 is
    # displayed as "<0.1".
    self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
    sz_mult = 1.0 / (1024**2)
    result = u'%.1f' % (self.size * sz_mult)
    self.size = u'<0.1' if result == u'0.0' else result
    self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id), 'r')
    # container.xml names the OPF ("rootfile") entry inside the archive.
    soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))
    oebps = soup.findAll('rootfile')[0]['full-path']
    folder = oebps.rfind(os.sep)
    # Folder holding the OEBPS content ('' when the OPF sits at the root).
    # NOTE(review): os.sep is the host separator, but zip entry names use
    # '/' — confirm this works on Windows.
    self.oebps_folder = '' if folder == -1 else oebps[:folder + 1]
    oebps_content = self.f.read(oebps)
    self.read_doc_props(oebps_content)
    opf_bs = BeautifulStoneSoup(oebps_content)
    ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
    ncx = self.oebps_folder + ncx['href']  # full path of the NCX file
    ncx_bs = BeautifulStoneSoup(self.f.read(ncx))
    # One (navigation label, content src) pair per navpoint.
    self.chapters = [
        (nav.navlabel.text, nav.content['src'])
        for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')
    ]
    self.cover_href = self.chapters[0][1]  # cover path
def __call__(self, content: str) -> str:
    """Extract the link texts from the first table in *content*.

    :param content: raw markup to parse.
    :return: newline-joined text of every <a> inside the first <table>.
    """
    document = BeautifulStoneSoup(content)
    anchors = document.table.findAll('a')
    return '\n'.join(anchor.text for anchor in anchors)
def HTMLEntitiesToUnicode(text):
    """Convert HTML entities in *text* to unicode characters.

    For example '&amp;' becomes '&'.

    :param text: HTML-laden string.
    :return: the converted text as a plain string.
    """
    soup = BeautifulStoneSoup(text,
                              convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    # Bug fix: the soup object itself was returned before; callers expecting
    # text (cf. the str()-wrapping method variant of this helper elsewhere in
    # the codebase) received a parse tree instead.
    return str(soup)
def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
    if len(msg.args) == 0:
        self.bot.notice(msg.nick, "Please specify a search term")
        return
    # Wikipedia OpenSearch API; ask for two hits so a disambiguation page
    # ("may refer to:") can be skipped in favour of the second result.
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': ' '.join(msg.args)
    }
    url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    response = BeautifulStoneSoup(requests.post(url, data=params).text)
    # Damn BS4 is case sensitive, hence all the regex.
    if response.find(re.compile('text', re.I)):
        index = 0
        if "may refer to:" in response.find(re.compile('description', re.I)).string:
            index = 1  # first hit is a disambiguation page; take the next one
        info = response.find_all(re.compile('description', re.I))[index].string.strip()
        url = response.find_all(re.compile('url', re.I))[index].string
        short_url = self.shorten(url)
        # \002 toggles IRC bold formatting.
        message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
        self.bot.privmsg(msg.channel, message)
    else:
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
def fuseReferences(doc, ref):
    """Merge *ref* into the previously parsed reference of *doc*.

    Used when a single bibliography entry was split across two XML chunks:
    the previous reference is removed, its XML body is concatenated with
    *ref*'s, and the combined element is re-parsed. The replaced id is
    recorded in doc["metadata"]["ref_replace_list"] so citations pointing
    at it can be remapped later.

    :param doc: SciDoc-like dict with "references" and "metadata".
    :param ref: reference tag to fuse into the previous reference.
    """
    prevref = doc["references"][-1]
    doc["metadata"]["ref_replace_list"] = doc["metadata"].get(
        "ref_replace_list", {})
    # Renamed from "id": don't shadow the builtin.
    ref_id = ""
    try:
        ref_id = ref["id"]
        if not ref_id:
            ref_id = prevref["id"]
            if isinstance(ref_id, six.string_types):
                ref_id = "ref" + str(len(doc["references"]) + 1)
            elif isinstance(ref_id, int):
                ref_id = ref_id + 1
    except Exception:
        # Missing "id" attribute (or malformed tag): fabricate a fresh id.
        # (Narrowed from a bare except: so ^C etc. are no longer swallowed.)
        ref_id = "ref" + str(len(doc["references"]) + 1)
    doc["metadata"]["ref_replace_list"][ref_id] = prevref["id"]
    doc["references"].remove(prevref)
    # Drop the closing tag of the previous fragment and the opening tag of
    # the new one so the two bodies concatenate into a single element.
    fullstring = re.sub(r"</reference>", "", prevref["xml"], 0, re.IGNORECASE)
    fullstring += re.sub(r"<reference.+?>", "", ref.__repr__(), 0, re.IGNORECASE)
    ref = BeautifulStoneSoup(fullstring).find("reference")
    processReferenceXML(ref, doc, False)
def loadJATSSentence(self, s, newDocument, par_id, section_id):
    """
    Loads a JATS sentence (ready split)

    :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
    :param newDocument: SciDoc
    :param par_id: id of the paragraph containing this sentence
    :param section_id: id of the section containing the paragraph
    """
    newSent = newDocument.addSentence(par_id, "")
    s_soup = BeautifulStoneSoup(s)
    # Bibliographic cross-references become citations attached to this sentence.
    refs = s_soup.findAll("xref", {"ref-type": "bibr"})
    citations_found = []
    for r in refs:
        citations_found.extend(
            self.loadJATSCitation(r, newSent["id"], newDocument,
                                  section=section_id))
    # Non-bibliographic xrefs (figures, tables, ...) are renamed to <inref>
    # so downstream processing can tell them apart from citations.
    non_refs = s_soup.findAll(
        lambda tag: tag.name.lower() == "xref" and "ref-type" in tag and tag["ref-type"].lower() != "bibr")
    for nr in non_refs:
        nr.name = "inref"
    newSent["citations"] = [acit["id"] for acit in citations_found]
    # TODO replace <xref> tags with <cit> tags
    newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(
        s_soup, newSent["id"])
    ## print(newSent["text"])
    # deal with many citations within characters of each other: make them know they are a cluster
    # TODO cluster citations? Store them in some other way?
    newDocument.countMultiCitations(newSent)
def test_beautifulstonesoup_is_xml_parser(self):
    # Make sure that the deprecated BSS class uses an xml builder
    # if one is installed.
    with warnings.catch_warnings(record=True) as w:
        soup = BeautifulStoneSoup("<b />")
    # An XML tree builder renders the void tag as "<b/>"; an HTML builder
    # would emit "<b></b>".
    self.assertEqual("<b/>", str(soup.b))
    # Instantiating BSS must emit its deprecation warning.
    self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
def parse(self, content: str) -> str:
    """Parses web content.

    Collects the href attribute of every <a> inside the first table and
    returns them newline-separated.
    """
    document = BeautifulStoneSoup(content)
    hrefs = [anchor['href'] for anchor in document.table.findAll('a')]
    return '\n'.join(hrefs)
def __init__(self):
    try:
        # Fetch the XML feed as a string and parse it.
        with req.urlopen(self.URL) as open_url:
            soup = BeautifulStoneSoup(open_url.read())
        # Skip the first two entries.
        # NOTE(review): presumably header/stale items — confirm against the
        # feed layout.
        self.FEED = (series.series_from_xml(soup, 'item'))[2:]
    except Exception as e:
        # Best-effort: a failed fetch is logged, leaving FEED unset.
        logger.error("Cannot get a XML-file: %s" % e)
def response_soup(self):
    "Returns a BeautifulSoup object of the response."
    # Parse the response body lazily and keep the tree for repeated access.
    if not self._response_soup:
        decoded = str(self._response_content, encoding='utf-8')
        self._response_soup = BeautifulStoneSoup(decoded)
    return self._response_soup
def get_all_urls(self):
    """Return the list of URLs gathered from every configured sitemap page."""
    collected = []
    for page_url in self.urls:
        response = self.session.get(page_url, headers=self.headers)
        tree = BeautifulStoneSoup(response.content)
        # Each <loc> element's text node is one URL.
        collected.extend(loc.next_element for loc in tree.find_all('loc'))
    return collected
def render(self, context):
    """Return the fan/follower count for the configured social service.

    Dispatches on self.service ("facebook" or "twitter") with
    self.service_id, fetches the count from the service API, and caches it
    for 30 minutes keyed on the request URL. Returns '' for an unknown
    service; a failed fetch leaves the cached/empty value in place.
    """
    fancount = ''
    fb_api_url = 'http://api.facebook.com/restserver.php'
    tw_api_url = 'http://api.twitter.com'
    cache_time = 1800

    def _fetch_count(xml_path, node_name, extract):
        # Shared fetch-and-cache logic for both services (previously
        # duplicated inline). Best-effort by design: any network or parse
        # error is swallowed and the stale/None value returned.
        cache_key = md5(xml_path.encode()).hexdigest()
        count = cache.get(cache_key)
        if not count:
            try:
                xml = urlopen(xml_path)
                soup = BeautifulStoneSoup(xml.read())
                for node in soup.find_all(node_name):
                    count = extract(node)
                cache.set(cache_key, count, cache_time)
            except Exception:
                pass
        return count

    if self.service == "facebook":
        query = '%s?method=facebook.fql.query&query=SELECT%%20fan_count%%20FROM%%20page%%20WHERE%%20page_id=%s'
        fancount = _fetch_count(query % (fb_api_url, self.service_id),
                                'page', lambda node: node.fan_count.string)
    if self.service == "twitter":
        query = "%s/1/users/show/%s.xml"
        fancount = _fetch_count(query % (tw_api_url, self.service_id),
                                'user', lambda node: node.followers_count.string)
    return fancount
def parseCermineXML(self, xml_string):
    """Load the Cermine output contained in *xml_string*.

    Currently only the references are read.

    :param xml_string: full Cermine/ParsHed XML output.
    :return: the parsed references.
    """
    parsed = BeautifulStoneSoup(
        xml_string, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    # TODO implement reading the rest of the Cermine/ParsHed tagging
    return self.readReferences(parsed)
def read(self, xml, identifier): """ Load a JATS/NLM (PubMed) XML into a SciDoc. :param xml: full xml string :type xml: basestring :param identifier: an identifier for this document, e.g. file name If an actual full path, the path will be removed from it when stored :type identifier: basestring :returns: :class:`SciDoc <SciDoc>` object :rtype: SciDoc """ # this solves a "bug" in BeautifulStoneSoup with "sec" tags BeautifulStoneSoup.NESTABLE_TAGS["sec"] = [] #xml=fixNumberCitationsXML(xml) soup = BeautifulStoneSoup(xml) # Create a new SciDoc to store the paper newDocument = SciDoc() metadata = newDocument["metadata"] metadata["filename"] = os.path.basename(identifier) metadata["original_citation_style"] = detectCitationStyle(xml) body = soup.find("body") if not body: # TODO: Make the error handling less terrible debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: " + identifier) newDocument["metadata"]["guid"] = cp.Corpus.generateGUID() return newDocument # Load metadata, either from corpus or from file self.loadJATSMetadataFromPaper(newDocument, soup) metadata["guid"] = cp.Corpus.generateGUID(metadata) # Load all references from the XML back = soup.find("back") if back: ref_list = back.find("ref-list") # other things in <back> like appendices: ignore them for now if ref_list: for ref in ref_list.findAll("ref"): self.loadJATSReference(ref, newDocument) newDocument.updateReferences() # Load Abstract self.loadJATSAbstract(soup, newDocument) for sec in body.findChildren("sec", recursive=False): self.loadJATSSection(sec, newDocument, "root") newDocument.updateAuthorsAffiliations() return newDocument
def soup_maker(fh):
    """Build a soup from the file handle (or markup string) *fh*.

    Prefers bs4's BeautifulSoup with the lxml tree builder; falls back to
    the legacy BeautifulSoup 3 BeautifulStoneSoup when bs4 is unavailable.

    :param fh: file handle or markup to parse.
    :return: parsed soup; on the bs4 path all tag names are lowercased.
    """
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(fh, "lxml")
        # Normalise tag names so downstream lookups are case-insensitive.
        for tag in soup.find_all():
            tag.name = tag.name.lower()
    except ImportError:
        # Bug fix: the fallback previously re-imported from bs4 — the very
        # package whose import just failed — so it could never succeed.
        # Import the BeautifulSoup 3 class instead (matches the fallback
        # used by response_soup elsewhere in this codebase).
        from BeautifulSoup import BeautifulStoneSoup
        soup = BeautifulStoneSoup(fh)
    return soup
def login(self):
    """ Read greeting """
    # The server sends a greeting on connect; show its server id and version.
    greeting = self.read()
    soup = BeautifulStoneSoup(greeting, 'lxml')
    svid = soup.find('svid')
    version = soup.find('version')
    print("Connected to %s (v%s)\n" % (svid.text, version.text))
    """ Login """
    # Fill the login command template from the configuration and abort the
    # whole process if the server rejects it.
    xml = commands.login % self.config
    if not self.cmd(xml, silent=True):
        exit(1)
def getDetailsForSerieByID(self, serieName, serieID):
    # Fetch the banner list for a series (Python 2: urllib2/cookielib) and
    # record the first English banner URL of each type (fanart / poster /
    # season / series), plus its thumbnail when present, into
    # KNOWN_SHOWS[serieName]. Returns the updated entry, or None on error.
    url = SERIE_DETAILS_URL % (urllib.quote(serieID))
    try:
        # Change the User Agent
        USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        resp = opener.open(req)
        soup = BeautifulStoneSoup(resp.read())
        resp.close()
        # "not 'X' in ... .keys()" guards keep only the FIRST banner of each
        # type; later English banners of the same type are ignored.
        for banner in soup.banners.findAll('banner'):
            if banner.language.string == 'en':
                if not 'Fanart' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'fanart':
                    KNOWN_SHOWS[serieName]['Fanart'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['FanartThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Poster' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'poster':
                    KNOWN_SHOWS[serieName]['Poster'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['PosterThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Season' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'season':
                    KNOWN_SHOWS[serieName]['Season'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeasonThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Series' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'series':
                    KNOWN_SHOWS[serieName]['Series'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeriesThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
        return KNOWN_SHOWS[serieName]
    except:
        # Best-effort: any failure (network, parse, missing key) just logs
        # the URL and returns None.
        print 'Error: ' + url
        return None
def response_soup(self):
    "Returns a BeautifulSoup object of the response."
    try:
        from bs4 import BeautifulStoneSoup
    except ImportError:
        # BeautifulSoup 3 fallback — warn so the operator upgrades.
        from BeautifulSoup import BeautifulStoneSoup
        log.warn('DeprecationWarning: BeautifulSoup 3 or earlier is deprecated; install bs4 instead\n')
    # Already parsed once: reuse the cached tree.
    if self._response_soup:
        return self._response_soup
    self._response_soup = BeautifulStoneSoup(
        self._response_content.decode('utf-8'))
    return self._response_soup
def HTMLEntitiesToUnicode(self, text):
    """Converts HTML entities to unicode. For example '&amp;' becomes '&'.

    Args:
        text: HTML laden text to convert to unicode
    Returns:
        String converted to unicode, or None when conversion failed.
    """
    try:
        text = str(
            BeautifulStoneSoup(text,
                               convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        return text
    except Exception as e:
        # Bug fix: previously '"... %s ; Errors: %s" % text, e' — the %
        # operator binds before the comma, so the two-placeholder format
        # string got a single argument and raised TypeError inside the
        # handler. Interpolate both values as a tuple.
        print("error formatting string: %s ; Errors: %s" % (text, e))
        return None
def extract_info_from_html():
    # Scrape data/url.html for university links, codes, cities and
    # abbreviations, merge the codes into data/university.json, and write
    # the result to data/university_add_code.json.
    from bs4 import BeautifulStoneSoup
    import re
    html = open("data/url.html").read()
    soup = BeautifulStoneSoup(html)
    inputTag = soup.findAll("a")
    # Stringify the whole tag list and split on commas, then regex out each
    # href. NOTE(review): fragile — any comma inside a tag breaks the split.
    inputTag = str(inputTag).split(",")
    m = [re.search(" +href=\"(.*?)\"", i) for i in inputTag]
    urls = [i.group(1) for i in m]
    # University codes sit in <strong> tags; slice off the surrounding markup.
    code = [
        i[9:-9].replace("<", "") for i in str(soup.findAll('strong')).split(",")
    ]
    # City names follow each map-marker icon in the raw HTML.
    city = [
        i.split('<span class="uni-code">')[0].replace("\t", "").replace(
            "</span>", "").replace("\n", "") for i in html.split(
            '<i class="fa fa-map-marker" aria-hidden="true"></i>')[1:]
    ]
    # Abbreviations come from every other "name-group" div.
    abbr = [
        i.split('</div>')[0].replace("\t", "").replace("</span>",
                                                       "").replace("\n", "")
        for i in html.split('<div class="name-group">')[1::2]
    ]
    # NOTE(review): `urls` and `city` are computed but never used below.
    # ADD CODE TO UNI_INFO
    map_abbr_code = [{
        "abbr": m,
        "code": n
    } for m, n in zip(abbr, code) if m != ""]
    import json
    uni = json.load(open("data/university.json"))
    len(uni)
    # Copy each university record, attaching its code and dropping
    # duplicate abbreviations (first occurrence wins).
    new_uni = []
    abbrs = []
    for i in uni:
        if (i["abbr"] in abbrs):
            continue
        else:
            for j in map_abbr_code:
                if (j["abbr"] == i["abbr"]):
                    i["code"] = j["code"]
                    break
            new_uni.append(i)
            abbrs.append(i["abbr"])
    with open('data/university_add_code.json', 'w') as outfile:
        json.dump(new_uni, outfile, ensure_ascii=False, indent=4)
def parse_data(self, url):
    '''Collect the product data from *url* into a dict.

    Returns a dict with the product name, the regular and the
    registered-customer price, the article reference and the url itself.
    Raises Format_Exeption when the page is not a product page and
    Connection_Exception on a non-200 response.
    '''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soup = BeautifulStoneSoup(request.content)
        # Reject 404 pages, empty search results and catalogue listings —
        # only a single-product page has the fields we scrape below.
        if not (bool(soup.find('div', {"class": 'error404__text'})) or
                bool(soup.find('div', {"class": 'nothing-search'})) or
                bool(soup.find('div', {"id": 'productList'}))):
            try:
                name_of_product = soup.find('h1').next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                # Strip spaces/newlines from the displayed price.
                price_for_all = soup.find(
                    'span', {
                        "class": "item__price item__price--normal-left"
                    }).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                # Missing price element means the item is out of stock.
                price_for_all = "Нет в наличии"
            try:
                price_for_registered = soup.find(
                    'span', {
                        "class": "item__price item__price--red-bold"
                    }).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_registered = "Нет в наличии"
            try:
                # Second "articul" block carries the reference number; the
                # third whitespace-separated token is the number itself.
                reference = soup.findAll(
                    'div', {"class": "item__card-info-articul"})
                reference = reference[1].next_element
                reference = str(reference).split()[2].replace("-", '')
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_registered,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def parse_data(self, url):
    '''Collect the product data from *url* into a dict.

    Same contract as the sibling parser: returns name, prices, reference
    and url; raises Format_Exeption for non-product pages and
    Connection_Exception on a non-200 response.
    '''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soap = BeautifulStoneSoup(request.content)
        # Reject category maps and catalogue listings — only a product
        # page has the fields scraped below.
        if not (bool(soap.find('table', {"class": 'map-columns'})) or bool(
                soap.find('div', {"class": 'col-md-12 catalog-items'}))):
            try:
                name_of_product = soap.find('h1', {
                    'class': 'title'
                }).next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                # Strip spaces/newlines; drop the trailing currency char.
                price_for_all = soap.find('div', {
                    "class": "price"
                }).next_element.replace(" ", "").replace("\n", "")[:-1]
            except Exception:
                # Missing price element means the item is out of stock.
                price_for_all = "Нет в наличии"
            try:
                price_for_rozn = soap.find('div', {
                    "class": "rozn-price"
                }).next_element.replace(" ", "").replace("\n", "")[:-1]
                # Keep digits only (retail price may carry extra symbols).
                price_for_rozn = ''.join(
                    filter(str.isdigit, price_for_rozn))
            except Exception:
                price_for_rozn = "Нет в наличии"
            try:
                # Drop the "Артикул: " prefix and any dashes.
                reference = soap.find('div', {
                    'class': 'article'
                }).next_element.replace("-", '')[9:]
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_rozn,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def loadRefAuthorsFromSentence(sentence):
    """Convert <refauthor> tags in a sentence into citation dicts.

    :param sentence: dict with "text" (may contain <refauthor> markup)
        and "id" (recorded as each citation's parent sentence).
    :return: list of citation dicts, one per <refauthor> element found.
    """
    pattern = r"<refauthor.*?</refauthor>"
    skipped = {"links"}
    citations = []
    for fragment in re.findall(pattern, sentence["text"], re.IGNORECASE):
        tag_soup = BeautifulStoneSoup(fragment)
        citation = {"parent_s": sentence["id"]}
        # Copy every attribute of the parsed tag except the skipped ones.
        for attr_name in (attr[0] for attr in tag_soup.attrs
                          if attr[0] not in skipped):
            citation[attr_name] = tag_soup[attr_name]
        citations.append(citation)
    return citations
def login(self, account, password):
    """Submit the portal login form and classify the server's answer.

    :param account: user name for the UserName field.
    :param password: password for the UserPass field.
    :return: 1 when the captcha was wrong (cookies cleared on the first
        attempt), 2 for bad credentials (cookies cleared), 3 on success.
    """
    self.count += 1
    form_data = {
        '__LASTFOCUS': '',
        '__VIEWSTATE': '/wEPDwUKLTYyODEyMzMzMGRkL3e45wwAbXRMklziclTsgEdzyPEwTYlRK/82rSW9ia4=',
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__EVENTVALIDATION': '/wEdAAU54OdiNscYklAhFFRo5mKvR1LBKX1P1xh290RQyTesRe73C5Hghb+Z/bZTMreJjC5a26FEoUIR27AQFJNWWcL9lvD3Xdq7ldHy+JQ2tBNJGaOgZ5o+9oUn7QAVYx4o/XgeS3eF3mvkRXiWGnMfaCgO',
        'UserName': account,
        'UserPass': password,
        'CheckCode': self.get_capture(),
        'Btn_OK': '(unable to decode value)',
    }
    data = urllib.parse.urlencode(form_data).encode(encoding='utf-8')
    resquest = urllib.request.Request(self.login_url, data, self.headers)
    response = self.opener.open(resquest)
    login_content = response.read().decode(self.character).encode("utf-8")
    ss = str((BeautifulStoneSoup(login_content)))
    # The page reports errors via a JS alert; grep its text out of the soup.
    if re.findall('验证码输入错误.', ss):
        # Captcha was wrong.
        print('验证码输入错误')
        # Bug fix: 'self.count is 1' compared object identity on an int
        # (implementation-dependent); use value equality.
        if self.count == 1:
            # NOTE(review): clear() returns None, so this sets self.cookie
            # to None — preserved from the original; confirm intent.
            self.cookie = self.cookie.clear()
        return 1
    elif re.findall('用户名或密码错误.', ss):
        # Wrong user name or password.
        print('用户名或密码错误')
        self.cookie = self.cookie.clear()
        return 2
    else:
        return 3
def read(self, xml, filename):
    """Load a document from the Athar corpus.

    Args:
        xml: full xml string
        filename: source file name (kept for interface parity)
    Returns:
        (all_docs, all_contexts): the loaded documents and the citation
        contexts collected from them.
    """
    soup = BeautifulStoneSoup(xml)
    paper_data_node = soup.find("div", {"class": "dstPaperData"})
    paper_data = {
        "id": paper_data_node.text,
        "title": "",
        "authors": "",
    }
    title = paper_data_node.find("div", {"class": "dstPaperTitle"})
    if title:
        paper_data["title"] = title.text
    authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
    if authors:
        # Bug fix: previously split title.text here — the author list lives
        # in the dstPaperAuthors node, not in the title.
        author_chunks = authors.text.split(";")
        for author in author_chunks:
            chunks = author.split(",")
            author_dict = {"given": chunks[1], "family": chunks[0]}
            # NOTE(review): each iteration overwrites the previous value, so
            # only the last author survives; this probably should accumulate
            # a list, but callers may rely on the current shape — confirm.
            paper_data["authors"] = author_dict
    all_contexts = []
    all_docs = []
    document_nodes = soup.findAll("table", {"class": "srcPaper"})
    for index, document_node in enumerate(document_nodes):
        try:
            doc, contexts = self.loadDocumentNode(document_node, paper_data,
                                                  index)
            all_docs.append(doc)
            all_contexts.extend(contexts)
        except ValueError:
            # A malformed document aborts the remainder of the file.
            print("Error:", sys.exc_info()[1])
            break
    return all_docs, all_contexts
def run(self, file_name, user, **kwargs):
    """
    Parse the given xml file using BeautifulSoup.
    Save all Article, Redirect and Page objects.
    """
    f = open(file_name, 'r')
    xml = f.read()
    f.close()
    soup = BeautifulStoneSoup(xml)
    items = soup.find_all('item')
    # Dispatch each WordPress <item> by its post type/status.
    for item in items:
        post_type = item.find('wp:post_type').string
        post_status = item.find('wp:status').string
        if post_type == 'attachment':
            get_media(item, user)
            # Note! This script assumes all the attachments come before
            # posts and pages in the xml. If this ends up changing,
            # do two loops, one with attachments and the second with posts and pages.
        elif post_type == 'post' and post_status == 'publish':
            get_posts(item, user)
        elif post_type == 'page' and post_status == 'publish':
            get_pages(item, user)
    # Notify the importing user by email once the import finishes.
    if user.email:
        context = {
            'SITE_GLOBAL_SITEDISPLAYNAME': get_setting('site', 'global', 'sitedisplayname'),
            'SITE_GLOBAL_SITEURL': get_setting('site', 'global', 'siteurl'),
        }
        # Subject templates may contain newlines; collapse to one line.
        subject = ''.join(
            render_to_string(
                template_name=('notification/wp_import/short.txt'),
                context=context).splitlines())
        body = render_to_string(
            template_name=('notification/wp_import/full.html'),
            context=context)
        #send_mail(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email], fail_silently=False)
        email = EmailMessage(subject, body, settings.DEFAULT_FROM_EMAIL,
                             [user.email])
        email.content_subtype = 'html'
        email.send(fail_silently=True)
def parse_metadata(filename):
    # Parse a scanner metadata XML file.
    # Returns (maxvals, dims): per-channel "Max" values and, for each entry
    # of the module-level `labels` list, the matching ScannerSetting
    # "Variant" value (None when the label is absent).
    dims = [
        None,
    ] * len(labels)
    with open(filename, "r") as myfile:
        xml = BeautifulStoneSoup(myfile.read())
    maxvals = []
    # NOTE(review): iterating xml.Channels walks the children of the first
    # <Channels> tag — confirm this matches the file schema.
    for channel in xml.Channels:
        maxvals.append(float(channel["Max"]))
    # Match each ScannerSetting description against the known labels; the
    # last matching setting wins for a repeated description.
    for setting in xml.ScannerSetting:
        for i, l in enumerate(labels):
            if setting["Description"] == l:
                dims[i] = float(setting["Variant"])
    return maxvals, dims
def cmd(self, cmd, silent=False):
    """Send *cmd* to the server and return its <response> element.

    Prints the result line unless *silent* is set and the code is a routine
    success (1000/1300/1500). Exits the process when no result code can be
    read; returns False for result codes 2308 and 2502.
    """
    self.write(cmd)
    raw = self.read()
    soup = BeautifulStoneSoup(raw, 'lxml')
    response = soup.find('response')
    result = soup.find('result')
    try:
        code = int(result.get('code'))
    except AttributeError:
        # No <result> element at all — the session is unusable.
        print("\nERROR: Could not get result code, exiting.")
        exit(1)
    if not silent or code not in (1000, 1300, 1500):
        print("- [%d] %s" % (code, result.msg.text))
    if code in (2308, 2502):
        return False
    return response
def get_info(self, account):
    # Download the student's info page, save a raw copy to disk, then fetch
    # the student photo referenced by the page.
    request = urllib.request.Request(self.info_url)
    response = self.opener.open(request)
    content = response.read().decode(self.character).encode("utf-8")
    # Keep the raw page under new/<account>.html for later inspection.
    file = open('new/' + account + '.html', 'wb')
    file.write(content)
    file.close()
    detail_html = BeautifulStoneSoup(content)
    img_url = detail_html.find(id="Student11_Image1")
    link = img_url.get('src')
    link = link[2:]  # drop the leading two chars ("~/"-style prefix — TODO confirm)
    pto_url = 'http://szjy.swun.edu.cn/Sys/SystemForm' + link
    # Replace the literal CJK path segment with its percent-encoded form
    # (presumably the GBK encoding the server expects — confirm).
    pto_url = pto_url.replace('照片', '%D5%D5%C6%AC')
    # Reuse the logged-in opener so urlretrieve carries the session cookies.
    urllib.request.install_opener(opener=self.opener)
    img_name = 'photos/' + account + '.jpg'
    urllib.request.urlretrieve(pto_url, img_name)
    self.cookie = self.cookie.clear()
def check_updates(self):
    """Fetch the feed, merge new entries into self.FEED and return them.

    The returned list is reversed (oldest new entry first); any failure is
    logged and yields an empty list.
    """
    logger.info("Start checking updates")
    try:
        feed_xml = BeautifulStoneSoup(req.urlopen(self.URL).read())
        if not feed_xml:
            return []
        current = series.series_from_xml(feed_xml)
        fresh = [entry for entry in current if entry not in self.FEED]
        # Keep the rolling window once it has reached 12 entries.
        if len(self.FEED) == 12:
            self.FEED = fresh + self.FEED[:-len(fresh)]
        else:
            self.FEED = fresh + self.FEED
        fresh.reverse()
        return fresh
    except Exception as e:
        logger.error("Check failed : %s" % e)
        return []