def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
    # Guard clause: reject empty queries up front.
    if not msg.args:
        self.bot.notice(msg.nick, "Please specify a search term")
        return
    search_term = ' '.join(msg.args)
    query = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': search_term,
    }
    api_url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    soup = BeautifulStoneSoup(requests.post(api_url, data=query).text)
    # BS4 matches tag names case-sensitively, so search via case-insensitive regexes.
    if not soup.find(re.compile('text', re.I)):
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(search_term))
        return
    # Disambiguation pages contain "may refer to:"; use the second hit instead.
    first_description = soup.find(re.compile('description', re.I)).string
    idx = 1 if "may refer to:" in first_description else 0
    summary = soup.find_all(re.compile('description', re.I))[idx].string.strip()
    article_url = soup.find_all(re.compile('url', re.I))[idx].string
    reply = u"\002Wikipedia ::\002 {} \002::\002 {}".format(summary, self.shorten(article_url))
    self.bot.privmsg(msg.channel, reply)
def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
    # No search term given: tell the user privately and bail out.
    if len(msg.args) == 0:
        self.bot.notice(msg.nick, "Please specify a search term")
        return
    # OpenSearch API query; limit=2 so a second result is available when the
    # first hit is a disambiguation page.
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': ' '.join(msg.args)
    }
    url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    response = BeautifulStoneSoup(requests.post(url, data=params).text)
    # BS4 tag matching is case sensitive, hence the case-insensitive regexes.
    if response.find(re.compile('text', re.I)):
        index = 0
        # Disambiguation pages say "may refer to:"; fall back to the second result.
        if "may refer to:" in response.find(re.compile('description', re.I)).string:
            index = 1
        info = response.find_all(re.compile('description', re.I))[index].string.strip()
        url = response.find_all(re.compile('url', re.I))[index].string
        short_url = self.shorten(url)
        # \002 is the IRC bold control character.
        message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
        self.bot.privmsg(msg.channel, message)
    else:
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
def read(self, xml, identifier): """ Load a JATS/NLM (PubMed) XML into a SciDoc. :param xml: full xml string :type xml: basestring :param identifier: an identifier for this document, e.g. file name If an actual full path, the path will be removed from it when stored :type identifier: basestring :returns: :class:`SciDoc <SciDoc>` object :rtype: SciDoc """ # this solves a "bug" in BeautifulStoneSoup with "sec" tags BeautifulStoneSoup.NESTABLE_TAGS["sec"] = [] #xml=fixNumberCitationsXML(xml) soup = BeautifulStoneSoup(xml) # Create a new SciDoc to store the paper newDocument = SciDoc() metadata = newDocument["metadata"] metadata["filename"] = os.path.basename(identifier) metadata["original_citation_style"] = detectCitationStyle(xml) body = soup.find("body") if not body: # TODO: Make the error handling less terrible debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: " + identifier) newDocument["metadata"]["guid"] = cp.Corpus.generateGUID() return newDocument # Load metadata, either from corpus or from file self.loadJATSMetadataFromPaper(newDocument, soup) metadata["guid"] = cp.Corpus.generateGUID(metadata) # Load all references from the XML back = soup.find("back") if back: ref_list = back.find("ref-list") # other things in <back> like appendices: ignore them for now if ref_list: for ref in ref_list.findAll("ref"): self.loadJATSReference(ref, newDocument) newDocument.updateReferences() # Load Abstract self.loadJATSAbstract(soup, newDocument) for sec in body.findChildren("sec", recursive=False): self.loadJATSSection(sec, newDocument, "root") newDocument.updateAuthorsAffiliations() return newDocument
def read(self, xml, filename):
    """
    Load a document from the Athar corpus.

    Args:
        xml: full xml string
        filename: unused here; kept for interface compatibility

    Returns:
        (all_docs, all_contexts): the documents built from each srcPaper
        table plus the citation contexts collected from them.
    """
    ## # this solves a "bug" in BeautifulStoneSoup with "sec" tags
    ## BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]
    soup = BeautifulStoneSoup(xml)
    paper_data_node = soup.find("div", {"class": "dstPaperData"})
    paper_data = {
        "id": paper_data_node.text,
        "title": "",
        "authors": "",
    }
    title = paper_data_node.find("div", {"class": "dstPaperTitle"})
    if title:
        paper_data["title"] = title.text
    authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
    if authors:
        # BUGFIX: authors were previously split out of the *title* node
        # (title.text), and each parsed author overwrote the previous one in
        # paper_data["authors"]. Parse the authors node and collect them all.
        parsed_authors = []
        for author in authors.text.split(";"):
            chunks = author.split(",")
            if len(chunks) > 1:
                parsed_authors.append({"given": chunks[1], "family": chunks[0]})
            else:
                # No comma present: keep the whole chunk as the family name
                # instead of crashing with an IndexError.
                parsed_authors.append({"given": "", "family": chunks[0]})
        paper_data["authors"] = parsed_authors
    ## print(paper_data)
    all_contexts = []
    all_docs = []
    document_nodes = soup.findAll("table", {"class": "srcPaper"})
    for index, document_node in enumerate(document_nodes):
        try:
            doc, contexts = self.loadDocumentNode(document_node, paper_data, index)
            all_docs.append(doc)
            all_contexts.extend(contexts)
        except ValueError:
            # A malformed source-paper table aborts the rest of the file.
            print("Error:", sys.exc_info()[1])
            break
    return all_docs, all_contexts
def get_info(self, account):
    """
    Fetch the student info page, save it as new/<account>.html and download
    the student's photo to photos/<account>.jpg.

    :param account: student account id used to name the output files
    """
    request = urllib.request.Request(self.info_url)
    response = self.opener.open(request)
    content = response.read().decode(self.character).encode("utf-8")
    # Context manager guarantees the handle is closed (the original used
    # open/close and also shadowed the `file` builtin).
    with open('new/' + account + '.html', 'wb') as html_file:
        html_file.write(content)
    detail_html = BeautifulStoneSoup(content)
    img_url = detail_html.find(id="Student11_Image1")
    link = img_url.get('src')
    link = link[2:]  # drop the leading ".." of the relative src path
    pto_url = 'http://szjy.swun.edu.cn/Sys/SystemForm' + link
    # Replace the Chinese word for "photo" with the percent-encoded form the
    # server expects (presumably GBK-encoded — TODO confirm against the site).
    pto_url = pto_url.replace('照片', '%D5%D5%C6%AC')
    urllib.request.install_opener(opener=self.opener)
    img_name = 'photos/' + account + '.jpg'
    urllib.request.urlretrieve(pto_url, img_name)
    # BUGFIX: `self.cookie = self.cookie.clear()` replaced the cookie jar with
    # None, since clear() returns None. Clear it in place instead.
    self.cookie.clear()
def parse_data(self, url):
    """Collects the product data from *url* into a dictionary."""
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soup = BeautifulStoneSoup(request.content)
        # A real product page has none of: a 404 page, an empty search
        # result, or a category listing (productList).
        if not (bool(soup.find('div', {"class": 'error404__text'}))
                or bool(soup.find('div', {"class": 'nothing-search'}))
                or bool(soup.find('div', {"id": 'productList'}))):
            try:
                name_of_product = soup.find('h1').next_element
            except Exception:
                # A product page without a name is malformed.
                raise Format_Exeption('name', url)
            try:
                price_for_all = soup.find(
                    'span', {
                        "class": "item__price item__price--normal-left"
                    }).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_all = "Нет в наличии"  # "Out of stock"
            try:
                price_for_registered = soup.find(
                    'span', {
                        "class": "item__price item__price--red-bold"
                    }).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_registered = "Нет в наличии"  # "Out of stock"
            try:
                reference = soup.findAll(
                    'div', {"class": "item__card-info-articul"})
                # The second "articul" block holds the reference number;
                # take its third whitespace-separated token, dashes removed.
                reference = reference[1].next_element
                reference = str(reference).split()[2].replace("-", '')
            except Exception:
                reference = "Нет номера"  # "No number"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_registered,
                "reference": reference,
                "url": url
            }
            return final
        else:
            # Wrong page format: log the offending URL.
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def parse_data(self, url):
    """Collects the product data from *url* into a dictionary."""
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soap = BeautifulStoneSoup(request.content)
        # Map/category listing pages are not product pages.
        if not (bool(soap.find('table', {"class": 'map-columns'})) or bool(
                soap.find('div', {"class": 'col-md-12 catalog-items'}))):
            try:
                name_of_product = soap.find('h1', {
                    'class': 'title'
                }).next_element
            except Exception:
                # A product page without a name is malformed.
                raise Format_Exeption('name', url)
            try:
                # [:-1] drops the trailing currency symbol.
                price_for_all = soap.find('div', {
                    "class": "price"
                }).next_element.replace(" ", "").replace("\n", "")[:-1]
            except Exception:
                price_for_all = "Нет в наличии"  # "Out of stock"
            try:
                price_for_rozn = soap.find('div', {
                    "class": "rozn-price"
                }).next_element.replace(" ", "").replace("\n", "")[:-1]
                # Keep only the digits of the retail price.
                price_for_rozn = ''.join(
                    filter(str.isdigit, price_for_rozn))
            except Exception:
                price_for_rozn = "Нет в наличии"  # "Out of stock"
            try:
                # [9:] skips the fixed label prefix before the number.
                reference = soap.find('div', {
                    'class': 'article'
                }).next_element.replace("-", '')[9:]
            except Exception:
                reference = "Нет номера"  # "No number"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_rozn,
                "reference": reference,
                "url": url
            }
            return final
        else:
            # Wrong page format: log the offending URL.
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def processCitationXML(intext):
    """Extract the author list and date from an in-text citation <ref> XML node.

    Accepts either a raw XML string or an already-parsed DOM node; returns
    (authors, date), or (None, None) when either part is missing.
    """
    node = BeautifulStoneSoup(intext) if isinstance(intext, six.string_types) else intext
    if not node:
        return None, None
    author_names = [tag.text for tag in node.findAll("refauthor")]
    date_tag = node.find("date")
    date_text = cleanxml(date_tag.__repr__()) if date_tag else ""
    # Both pieces are required; a citation missing either is unusable.
    if author_names == [] or date_text == "":
        return None, None
    return author_names, date_text
def loadSciXML(filename):
    """
    Load a Cambridge-style SciXML file into a new SciDoc and return it.
    """

    def extractSentenceText(s, newSent_id, doc):
        """
        Returns a printable representation of the sentence where all references are now placeholders with numbers
        """
        global ref_rep_count
        ref_rep_count = 0
        newSent = doc.element_by_id[newSent_id]

        def repFunc(match):
            """Replace the n-th <ref> with a <CIT ID=.../> placeholder."""
            global ref_rep_count
            ref_rep_count += 1
            res = " <CIT ID=" + str(
                doc.citation_by_id[newSent["citations"][ref_rep_count - 1]]["id"]) + " />"
            return res

        text = s.renderContents()
        text = re.sub(r"<ref.*?</ref>", repFunc, text, 0, re.IGNORECASE | re.DOTALL)
        text = re.sub(r"</?refauthor>", "", text, 0, re.IGNORECASE | re.DOTALL)
        return text

    def loadStructureProcessPara(p, newDocument, parent):
        # One paragraph: add each <s> sentence, its citations and its text.
        newPar_id = newDocument.addParagraph(parent)
        for s in p.findChildren("s"):
            newSent_id = newDocument.addSentence(newPar_id, "")
            newSent = newDocument.element_by_id[newSent_id]
            loadAttributesIfPresent(s, ["ia", "az", "refid"], newSent)
            refs = s.findAll("ref")
            num = len(newDocument["citations"]) + 1
            ## for cit in citations:
            ##     r["citation_id"]=num
            ##     num+=1
            loaded_refs = [
                loadCitation(r, newSent_id, newDocument, parent) for r in refs
            ]
            newSent["citations"] = [aref["id"] for aref in loaded_refs]
            newSent["text"] = extractSentenceText(s, newSent_id, newDocument)
            newDocument.countMultiCitations(
                newSent
            )  # deal with many citations within characters of each other: make them know they are a cluster TODO cluster them
        return newPar_id

    def loadStructureProcessDiv(div, newDocument):
        # One <div> becomes a section under "root"; its <p>s become paragraphs.
        header = div.find("header")
        if not header:
            header_id = 0
            header_text = ""
        else:
            header_id = header["id"] or 0
            header_text = re.sub(r"</?header.*?>", "", header.__repr__())
        newSection_id = newDocument.addSection("root", header_text, header_id)
        for p in div.findAll("p"):
            newPar_id = loadStructureProcessPara(p, newDocument, newSection_id)

    def loadMetadataIfExists(branch, key, doc):
        # Copy <key>'s text into doc["metadata"][key] only if the tag exists.
        meta = branch.find(key)
        if meta:
            doc["metadata"][key] = meta.text

    def loadAttributesIfPresent(branch, attributes, sent):
        """
        For each element in attributes, if present in branch, it is added to sent
        """
        for a in attributes:
            if a in branch:
                sent[a] = branch[a]

    def loadMetadata(newDocument, paper, fileno):
        """
        Does all the painful stuff of trying to recover metadata from inside a badly converted SciXML file
        """
        title = paper.findChildren("title")
        newDocument["metadata"]["title"] = title[0].text if len(
            title) > 0 else "NO TITLE"
        if fileno == "":
            fileno = paper.find("fileno").text
        newDocument["metadata"]["fileno"] = fileno
        authors = []
        # NOTE: reads `soup` and `filename` from the enclosing loadSciXML scope.
        meta = soup.find("metadata")
        if not meta:
            debugAddMessage(newDocument, "error",
                            "NO METADATA IN DOCUMENT! file:" + filename)
            return newDocument
        for a in meta.findChildren("author"):
            authors.append(processPlainTextAuthor(a.text))
        if authors == []:
            # Fallback: try the <authorlist> element instead.
            authorlist = soup.find("authorlist")
            if authorlist:
                for author in authorlist.findChildren("refauthor"):
                    authors.append(author.text)
                if authors == []:
                    authors = extractAuthorsFromAuthorlist(authorlist)
        appeared = meta.find("appeared")
        if appeared:
            loadMetadataIfExists(appeared, "conference", newDocument)
            loadMetadataIfExists(appeared, "year", newDocument)
        newDocument["metadata"]["authors"] = authors
        newDocument["metadata"]["year"] = meta.find("year").text

    def sanitizeString(s, maxlen=200):
        # Tabs break the downstream tab-separated storage; also cap the length.
        s = s.replace("\t", " ")
        s = s[:maxlen]
        return s

    def makeSureValuesAreReadable(newDocument):
        # Sanitize every free-text metadata value in place.
        newDocument["metadata"]["title"] = sanitizeString(
            newDocument["metadata"]["title"])
        newAuthors = []
        for author in newDocument["metadata"]["authors"]:
            newAuthors.append(sanitizeString(author, 70))
        newDocument["metadata"]["authors"] = newAuthors
        newSurnames = []
        for surname in newDocument["metadata"]["surnames"]:
            newSurnames.append(sanitizeString(surname, 25))
        newDocument["metadata"]["surnames"] = newSurnames
        newDocument["metadata"]["year"] = sanitizeString(
            newDocument["metadata"]["year"])
        if "conference" in newDocument["metadata"]:
            newDocument["metadata"]["conference"] = sanitizeString(
                newDocument["metadata"]["conference"])

    def matchCitationsWithReferences(newDocument):
        """
        Match each citation with its reference
        """
        allcitations = []
        for s in newDocument.allsentences:
            for citation_id in s["citations"]:
                cit = newDocument.citation_by_id[citation_id]
                if cit["ref_id"] != 0:
                    # the citation already has a matching reference id in the original document, use it
                    match = findMatchingReferenceByOriginalId(
                        cit["ref_id"], newDocument)
                    if not match:
                        ## print cit
                        match = newDocument.matchReferenceById(cit["ref_id"])
                else:
                    # attempt to guess which reference the citation should point to
                    match = matchCitationWithReference(cit["original_text"],
                                                       newDocument)
                if match:
                    # whatever the previous case, make sure citation points to the ID of its reference
                    cit["ref_id"] = match["id"]
                    match["citations"].append(
                        cit["id"]
                    )  # add the citation ID to the reference's list of citations
                    cit.pop("authors", "")
                    cit.pop("date", "")
                    cit.pop("original_text", "")
                else:
                    debugAddMessage(
                        newDocument, "notes",
                        "NO MATCH for CITATION in REFERENCES: " +
                        cleanxml(cit["original_text"]) + ", ")
                    pass

    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)
    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""
    # Create a new SciDoc to store the paper
    newDocument = scidoc.SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename
    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument
    # Load metadata, either from corpus or from file
    key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    if key in cp.Corpus.metadata_index:
        metadata = cp.Corpus.metadata_index[key]
    else:
        metadata = None
    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno)
        debugAddMessage(newDocument, "error",
                        "PAPER NOT IN METADATA FILE! file: " + filename)
    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])
    # Clean up potential weird text in XML metadata
    makeSureValuesAreReadable(newDocument)
    # Load all references from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)
    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)
        for s in abstract.findChildren("a-s"):
            newSent_id = newDocument.addSentence(newPar_id, s.text)
            loadAttributesIfPresent(s, ["ia", "az", "refid"],
                                    newDocument.element_by_id[newSent_id])
        newDocument.abstract = newDocument.element_by_id[newSection_id]
    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)
    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)
    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Doesn't work because of: unicode
    ## for ref in newDocument["references"]:
    ##     k=ref.get("AZ",["NO AZ"])
    ##     print k, most_common(k)
    return newDocument
{% if sense.examples|length > 0 %} <div> <ul style="" list-style-type: disc; ""> {% for example in sense.examples %} <li style="" text-align: left; ""><font color="" #999999 "">{{ example }}</font></li> {% endfor %} </ul> </div> {% else %} <div style="" text-align: left; ""> </div> {% endif %} {% endfor %} {% endfor %} </div> </div> ''' def __call__(self, word: dict) -> str: html = Template(self._tmpl).render(word=word) return html.replace('\n', '') if __name__ == '__main__': converter = AnkiHtmlConverter() for filename in glob.glob('htmls/*.html'): soup = BeautifulStoneSoup(open(filename)) if not soup.find(class_='ldoceEntry Entry'): continue word = os.path.basename(filename).rsplit('.', 1)[0] print(word, f'"{converter(parse(word, soup))}"', sep='\t')
def loadAZSciXML(filename):
    """
    Load a Cambridge-style SciXML
    """
    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)
    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""
    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename
    paper = soup.find("paper")
    if not paper:
        # No <paper> element: record the error and return the stub document.
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument
    # Load metadata, either from corpus or from file
    ## key=cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    ## if cp.Corpus.metadata_index.has_key(key):
    ##     metadata=cp.Corpus.metadata_index[key]
    ## else:
    metadata = None
    # NOTE: metadata is always None here (the corpus lookup above is
    # commented out), so the else branch always runs.
    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno, soup)
        ## debugAddMessage(newDocument,"error","PAPER NOT IN METADATA FILE! file: "+filename)
    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])
    # Clean up potential weird text in XML metadata
    ## makeSureValuesAreReadable(newDocument)  # remove if not dealing with crap conversion stuff
    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)
    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)
        for s in abstract.findChildren("a-s"):
            addNewSentenceAndProcessRefs(
                s, newDocument, newPar_id,
                newSection_id)  # deals with all of the adding of a sentence
        newDocument.abstract = newDocument.element_by_id[newSection_id]
    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)
    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)
    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Doesn't work because of: unicode
    ## for ref in newDocument["references"]:
    ##     k=ref.get("AZ",["NO AZ"])
    ##     print k, most_common(k)
    return newDocument
def read(self, filename, identifier):
    """
    Load a Cambridge-style SciXML file into a new SciDoc and return it.

    :param filename: path of the SciXML file to load
    :param identifier: unused here — presumably kept for reader-interface
        compatibility; TODO confirm against sibling readers
    """
    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)
    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""
    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename
    paper = soup.find("paper")
    if not paper:
        # No <paper> element: record the error and return the stub document.
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument
    # Load metadata, either from corpus or from file
    key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    if key in cp.Corpus.metadata_index:
        metadata = cp.Corpus.metadata_index[key]
    else:
        metadata = None
    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        self.loadMetadata(newDocument, paper, fileno)
        debugAddMessage(newDocument, "error",
                        "PAPER NOT IN METADATA FILE! file: " + filename)
    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])
    # Clean up potential weird text in XML metadata
    self.makeSureValuesAreReadable(newDocument)
    # Load all references from the XML
    for ref in soup.findAll("reference"):
        self.processReferenceXML(ref, newDocument)
    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)
        for s in abstract.findChildren("a-s"):
            newSent_id = newDocument.addSentence(newPar_id, s.text)
            self.loadAttributesIfPresent(
                s, ["ia", "az", "refid"],
                newDocument.element_by_id[newSent_id])
        newDocument.abstract = newDocument.element_by_id[newSection_id]
    for div in soup.findAll("div"):
        self.loadStructureProcessDiv(div, newDocument)
    # try to match each citation with its reference
    self.matchCitationsWithReferences(newDocument)
    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Doesn't work because of: unicode
    ## for ref in newDocument["references"]:
    ##     k=ref.get("AZ",["NO AZ"])
    ##     print k, most_common(k)
    return newDocument
def loadAZannot(filename):
    """
    Load an AZ-annotated document from the Teufel corpus into a "scidoc" JSON file
    """

    def loadStructureProcessPara(p, glob):
        # One <p> paragraph: collect its <s> sentences with AZ/IA labels and refs.
        glob["p"] += 1
        newPar = {"type": "p", "id": glob["p"]}
        newPar["sentences"] = []
        for s in p.findChildren("s"):
            newSent = {
                "type": "s",
                "text": s.text,
                "ia": s.get("ia", ""),
                "az": s.get("az", ""),
                "id": glob["s"],
                "refs": []
            }
            newSent["refs"] = [{"text": r.text, "link": 0} for r in s.findAll("ref")]
            glob["s"] += 1
            newPar["sentences"].append(newSent)
        return newPar

    def loadStructureProcessDiv(div, doc, glob):
        # One <div> becomes a section holding its paragraphs.
        header = div.find("header")
        newSection = {"header": header, "paragraphs": [], "id": glob["sect"]}
        glob["sect"] += 1
        for p in div.findAll("p"):
            newPar = loadStructureProcessPara(p, glob)
            newSection["paragraphs"].append(newPar)
        doc["sections"].append(newSection)

    # Running id counters shared by the nested helpers.
    glob = {"sect": 0, "p": 0, "s": 0}
    # FIX: the file handle was never closed; use a context manager.
    with codecs.open(filename, "rb", "utf-8", errors="ignore") as f:
        lines = f.readlines()
    text = "".join(lines)
    soup = BeautifulStoneSoup(text)
    paper = soup.find("paper")
    title = paper.find("title").text
    newDocument = {"title": title}
    newDocument["sections"] = []
    newDocument["references"] = []
    newDocument["metadata"] = {"fileno": paper.find("fileno").text}
    authors = []
    meta = soup.find("metadata")
    for a in meta.findChildren("author"):
        authors.append(processPlainTextAuthor(a.text))
    newDocument["authors"] = authors
    newDocument["year"] = meta.find("year").text
    for ref in soup.findAll("reference"):
        processReference(ref, newDocument)
    # The abstract becomes the first section, with a single paragraph.
    newSection = {"header": "Abstract", "paragraphs": [], "id": glob["sect"]}
    glob["sect"] += 1
    newSection["paragraphs"].append({"type": "p", "sentences": [], "id": glob["p"]})
    glob["p"] += 1
    abstract = soup.find("abstract")
    for s in abstract.findChildren("a-s"):
        newSent = {
            "type": "s",
            "text": s.text,
            "ia": s["ia"],
            "az": s["az"],
            "id": glob["s"],
            "refs": []
        }
        newSection["paragraphs"][-1]["sentences"].append(newSent)
        glob["s"] += 1
    newDocument["sections"].append(newSection)
    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument, glob)
    # Propagate each citing sentence's AZ/IA label onto the matched reference.
    sentences = getListOfSentenceObjects(newDocument)
    for s in sentences:
        for ref in s["refs"]:
            match = matchInTextReference(ref["text"], newDocument)
            if match:
                ## print ref["text"]," -> ", match["authors"], match["year"]
                # NOTE: azs/ias are module-level accumulators defined elsewhere.
                azs.append(s.get("az", "NO AZ"))
                ias.append(s.get("ia", "NO IA"))
                match["AZ"] = match.get("AZ", [])
                match["AZ"].append(s.get("az", "OTH"))
                # BUGFIX: previously copied the "AZ" list into "IA" and
                # appended the sentence's az label; use the IA key/attribute.
                match["IA"] = match.get("IA", [])
                match["IA"].append(s.get("ia", ""))
            else:
                print("NO MATCH for CITATION in REFERENCES:", ref["text"])
                pass
    ## "in press", "forthcoming", "submitted", "to appear"
    # Doesn't work because of: unicode
    ## for ref in newDocument["references"]:
    ##     k=ref.get("AZ",["NO AZ"])
    ##     print k, most_common(k)
    return newDocument