def open(self, book_id=None):
    if book_id:
        self.book_id = book_id
    if not self.book_id:
        raise Exception('Book id not set')
    self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
    sz_mult = 1.0 / (1024 ** 2)
    result = u'%.1f' % (self.size * sz_mult)
    self.size = u'<0.1' if result == u'0.0' else result
    self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id), 'r')
    soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))
    oebps = soup.findAll('rootfile')[0]['full-path']
    folder = oebps.rfind(os.sep)
    self.oebps_folder = '' if folder == -1 else oebps[:folder + 1]  # name of the folder holding the OEBPS content
    oebps_content = self.f.read(oebps)
    self.read_doc_props(oebps_content)
    opf_bs = BeautifulStoneSoup(oebps_content)
    ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
    ncx = self.oebps_folder + ncx['href']  # full path of the NCX file
    ncx_bs = BeautifulStoneSoup(self.f.read(ncx))
    self.chapters = [(nav.navlabel.text, nav.content['src'])
                     for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')]
    self.cover_href = self.chapters[0][1]  # path of the cover
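# A minimal hosting sketch for the open() method above. The class name EpubBook,
# the _FILE path template and the read_doc_props() stub are assumptions made for
# illustration; the original snippet does not show the surrounding class.
import os
import zipfile


class EpubBook(object):
    _FILE = 'books/%s/%s.epub'  # hypothetical template: the book id fills both slots

    def __init__(self, book_id=None):
        self.book_id = book_id
        self.size = None
        self.chapters = []

    def read_doc_props(self, opf_content):
        # the original class presumably extracts title/author metadata from the OPF here
        pass

# Usage sketch:
#   book = EpubBook()
#   book.open('42')                  # reads books/42/42.epub
#   print(book.size)                 # size in MB, e.g. u'1.3' or u'<0.1'
#   print(book.chapters[0], book.cover_href)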
class FollowThatTag(SoupTest):
    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        ml = Display.write(self)
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>", "lxml")
        self.assertEqual(soup.findAll(text=re.compile('.*')), [u'\xbb'])

    def testTextNavigation(self):
        soup = BeautifulSoup(
            '<url>http://cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d</url>'
            '<title>Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)</title>',
            "lxml")
        baz = soup.find(
            text='Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)')
        # navigate from the text node up to the tag that contains it
        self.assertEqual(baz.findParent("title").name, "title")
def loadJATSSentence(self, s, newDocument, par_id, section_id):
    """
        Loads a JATS sentence (ready split)

        :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
        :param newDocument: SciDoc
        :param par_id: id of the paragraph containing this sentence
        :param section_id: id of the section containing the paragraph
    """
    newSent = newDocument.addSentence(par_id, "")
    s_soup = BeautifulStoneSoup(s)

    refs = s_soup.findAll("xref", {"ref-type": "bibr"})
    citations_found = []
    for r in refs:
        citations_found.extend(
            self.loadJATSCitation(r, newSent["id"], newDocument, section=section_id))

    # non-bibliographic cross-references (figures, tables, ...) get renamed to <inref>
    non_refs = s_soup.findAll(
        lambda tag: tag.name.lower() == "xref"
        and tag.get("ref-type", "").lower() not in ("", "bibr"))
    for nr in non_refs:
        nr.name = "inref"

    newSent["citations"] = [acit["id"] for acit in citations_found]
    # TODO replace <xref> tags with <cit> tags
    newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(
        s_soup, newSent["id"])
##    print(newSent["text"])

    # deal with many citations within characters of each other: make them know they are a cluster
    # TODO cluster citations? Store them in some other way?
    newDocument.countMultiCitations(newSent)
def extract_info_from_html():
    from bs4 import BeautifulStoneSoup
    import re
    import json

    html = open("data/url.html").read()
    soup = BeautifulStoneSoup(html)

    # collect the href of every <a> tag
    inputTag = soup.findAll("a")
    inputTag = str(inputTag).split(",")
    m = [re.search(" +href=\"(.*?)\"", i) for i in inputTag]
    urls = [i.group(1) for i in m]

    code = [i[9:-9].replace("<", "")
            for i in str(soup.findAll('strong')).split(",")]
    city = [i.split('<span class="uni-code">')[0]
             .replace("\t", "").replace("</span>", "").replace("\n", "")
            for i in html.split('<i class="fa fa-map-marker" aria-hidden="true"></i>')[1:]]
    abbr = [i.split('</div>')[0]
             .replace("\t", "").replace("</span>", "").replace("\n", "")
            for i in html.split('<div class="name-group">')[1::2]]

    # ADD CODE TO UNI_INFO
    map_abbr_code = [{"abbr": m, "code": n} for m, n in zip(abbr, code) if m != ""]

    uni = json.load(open("data/university.json"))
    new_uni = []
    abbrs = []
    for i in uni:
        if i["abbr"] in abbrs:
            continue
        else:
            for j in map_abbr_code:
                if j["abbr"] == i["abbr"]:
                    i["code"] = j["code"]
                    break
            new_uni.append(i)
            abbrs.append(i["abbr"])

    with open('data/university_add_code.json', 'w') as outfile:
        json.dump(new_uni, outfile, ensure_ascii=False, indent=4)
def _parse_request(self):
    """
    Parses various parameters from _request_xml into _request_params.

    We need to override parse here as Microsoft Azure doesn't send
    AssertionConsumerServiceURL (ACS_URL).
    """
    # Minimal test to verify that it's not binarily encoded still:
    if not self._request_xml.strip().startswith('<'):
        raise Exception('RequestXML is not valid XML; '
                        'it may need to be decoded or decompressed.')

    soup = BeautifulStoneSoup(self._request_xml)
    request = soup.findAll()[0]
    if request.get('AssertionConsumerServiceURL', None):
        raise Exception(
            'Invalid Azure request. AssertionConsumerServiceURL exists!')

    params = {}
    params['ACS_URL'] = AZURE_ACS_URL
    params['REQUEST_ID'] = request.get('id', request.get('ID'))
    params['REQUEST_ISSUER'] = self._get_request_issuer(request)
    params['DESTINATION'] = request.get('Destination', '')
    params['PROVIDER_NAME'] = request.get('ProviderName', '')
    self._request_params = params

    # Set subject format - overrides the value set in _reset()
    self._subject_format = AZURE_SUBJECT_FORMAT
def extract_tags_bs4(self):
    """
    Using: BeautifulSoup's XML parser
    Returns XML data in dict format
    """
    soup = Soup(self.query_xml)              # XML as a string
    self.entries = soup.findAll('entry')     # list of <entry>'s

    find_authors = lambda x: x.find('name').string

    for entry in self.entries:
        # strip down entry ID in url to (say) -> 'abs/math/0507289v1'
        entry_id = urlparse(entry.find('id').string).path.lstrip('/')
        title = entry.find('title').string
        summary = entry.find('summary').string

        # findAll() for multiple entries
        authors = entry.findAll('author')  # returns list of data-type: BeautifulSoup.Tag
        # PYLINT chatters: authors = map(self.find_authors, authors)
        # using list comprehension instead
        authors = [find_authors(i) for i in authors]

        published = entry.find('published').string
        meta = {'title': title, 'summary': summary,
                'authors': authors, 'published': published}
        self.data[entry_id] = meta

    return self.data  # python dict
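# A sketch of how query_xml might be populated before calling extract_tags_bs4().
# The arXiv API endpoint below is the public one; the example search query and the
# idea of assigning the result to self.query_xml are assumptions for illustration.
try:
    from urllib.request import urlopen   # Python 3
except ImportError:
    from urllib2 import urlopen          # Python 2


def fetch_arxiv_feed(search_query, max_results=5):
    """Fetch an Atom feed from the arXiv API and return it as a string."""
    url = ('http://export.arxiv.org/api/query?search_query={0}&max_results={1}'
           .format(search_query, max_results))
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

# client.query_xml = fetch_arxiv_feed('all:electron')
# print(client.extract_tags_bs4())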
def analyze_site_map():
    r = requests.get('{}{}sitemap.xml'.format(app.config['WEB_PROTOCOL'],
                                              app.config['DOMAIN']))
    soup = Soup(r.content)
    locs = soup.findAll('loc')
    return [loc.string for loc in locs]
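# The same idea as analyze_site_map(), but standalone (no Flask app config), in
# case the logic needs to run outside the web app; the example domain is a
# placeholder. Requires requests and, for the 'xml' parser, lxml.
import requests
from bs4 import BeautifulSoup


def fetch_sitemap_urls(sitemap_url):
    """Return every <loc> URL listed in an XML sitemap."""
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.content, 'xml')
    return [loc.string for loc in soup.findAll('loc')]

# print(fetch_sitemap_urls('https://example.com/sitemap.xml'))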
def getSeriesDetailsByName(self, serieName):
    if serieName in IGNORE_SHOWS:
        return None
    print 'checking: ' + serieName
    if serieName in KNOWN_SHOWS.keys():
        url = GET_SERIES_URL % (urllib.quote(KNOWN_SHOWS[serieName]['TVDBName']))
    else:
        url = GET_SERIES_URL % (urllib.quote(serieName))
    try:
        # Change the User Agent
        USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        resp = opener.open(req)
        soup = BeautifulStoneSoup(resp.read())
        resp.close()

        if len(soup.findAll('series')) == 1:
            self.saveSerieDetail(serieName, soup.series)
        else:
            for serie in soup.findAll('series'):
                if serie.seriesname.string == serieName:
                    self.saveSerieDetail(serieName, serie)

        if serieName in KNOWN_SHOWS.keys():
            return KNOWN_SHOWS[serieName]
        return None
    except:
        print 'Error: ' + url
        return None
def parse_data(self, url):
    '''Collects the product data into a dictionary'''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soup = BeautifulStoneSoup(request.content)
        if not (bool(soup.find('div', {"class": 'error404__text'}))
                or bool(soup.find('div', {"class": 'nothing-search'}))
                or bool(soup.find('div', {"id": 'productList'}))):
            try:
                name_of_product = soup.find('h1').next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                price_for_all = soup.find(
                    'span',
                    {"class": "item__price item__price--normal-left"}
                ).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_all = "Нет в наличии"
            try:
                price_for_registered = soup.find(
                    'span',
                    {"class": "item__price item__price--red-bold"}
                ).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_registered = "Нет в наличии"
            try:
                reference = soup.findAll(
                    'div', {"class": "item__card-info-articul"})
                reference = reference[1].next_element
                reference = str(reference).split()[2].replace("-", '')
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_registered,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Unexpected page format, URL: {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def read(self, xml, filename):
    """
        Load a document from the Athar corpus

        Args:
            xml: full xml string
            filename: source file name, kept for reference
    """
##    # this solves a "bug" in BeautifulStoneSoup with "sec" tags
##    BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

    soup = BeautifulStoneSoup(xml)

    paper_data_node = soup.find("div", {"class": "dstPaperData"})
    paper_data = {
        "id": paper_data_node.text,
        "title": "",
        "authors": [],
    }

    title = paper_data_node.find("div", {"class": "dstPaperTitle"})
    if title:
        paper_data["title"] = title.text

    authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
    if authors:
        # author list is of the form "Family, Given; Family, Given; ..."
        author_chunks = authors.text.split(";")
        for author in author_chunks:
            chunks = author.split(",")
            author_dict = {"given": chunks[1], "family": chunks[0]}
            paper_data["authors"].append(author_dict)

##    print(paper_data)

    all_contexts = []
    all_docs = []
    document_nodes = soup.findAll("table", {"class": "srcPaper"})
    for index, document_node in enumerate(document_nodes):
        try:
            doc, contexts = self.loadDocumentNode(document_node, paper_data, index)
            all_docs.append(doc)
            all_contexts.extend(contexts)
        except ValueError:
            print("Error:", sys.exc_info()[1])
            break

    return all_docs, all_contexts
def parse(self):
    '''
    Builds a JSON list of recipe URLs from the site and saves it to
    MEDIA_ROOT/parser/source.js. Depending on the settings it either analyses
    the sitemap or parses the HTML pages.
    '''
    # Parsing via the sitemap
    if hasattr(settings, 'PARSER__URL_SOURCE') and settings.PARSER__URL_SOURCE == 'sitemap':
        xml = None
        if not hasattr(settings, 'PARSER__SITEMAP_URL') or not settings.PARSER__SITEMAP_URL:
            print('PARSER__SITEMAP_URL is not defined')
        else:
            try:
                with urllib.request.urlopen(settings.PARSER__SITEMAP_URL) as response:
                    xml = response.read()
            except Exception:
                xml = None
        if xml:
            sitemap = Soup(xml)
            urls = sitemap.findAll('url')
            for u in urls:
                loc = u.find('loc').string
                self._add_location(loc)
    else:
        # Parsing via the tags of the HTML pages
        if not hasattr(settings, 'PARSER__CELL_HOMEPAGE') or not settings.PARSER__CELL_HOMEPAGE:
            print('PARSER__CELL_HOMEPAGE is not defined')
            return False
        # Counter for recursive calls of _parse_html
        self._recursion_counter = 0
        self._parse_html(settings.PARSER__CELL_HOMEPAGE)

    self._save()
    return self.json_file_path
def _parse_request(self):
    """
    Parses various parameters from _request_xml into _request_params.
    """
    # Minimal test to verify that it's not binarily encoded still:
    if isinstance(self._request_xml, bytes):
        request_xml = self._request_xml.decode('utf-8')
    else:
        request_xml = self._request_xml
    if not request_xml.strip().startswith('<'):
        raise Exception('RequestXML is not valid XML; '
                        'it may need to be decoded or decompressed.')

    soup = BeautifulStoneSoup(request_xml)
    request = soup.findAll()[0]
    params = {}
    params['ACS_URL'] = request.get('AssertionConsumerServiceURL')
    params['REQUEST_ID'] = request.get('id', request.get('ID'))
    params['DESTINATION'] = request.get('Destination', '')
    params['PROVIDER_NAME'] = request.get('ProviderName', '')
    self._request_params = params
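# The guard above rejects payloads that are still encoded. With the SAML
# HTTP-Redirect binding the AuthnRequest arrives base64-encoded and raw-deflated,
# so a pre-processing step along these lines (a sketch, not part of the original
# class) could produce the XML that _parse_request() expects:
import base64
import zlib


def decode_saml_request(encoded_request):
    """Base64-decode and, if necessary, raw-inflate a SAMLRequest parameter."""
    data = base64.b64decode(encoded_request)
    try:
        # wbits=-15 selects raw DEFLATE (no zlib header), as used by the Redirect binding
        return zlib.decompress(data, -15).decode('utf-8')
    except zlib.error:
        # POST-binding payloads are only base64-encoded, not deflated
        return data.decode('utf-8')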
def processCitationXML(intext):
    """
        Extract the authors and date of an in-text citation <ref> from the XML DOM
    """
    if isinstance(intext, six.string_types):
        xml = BeautifulStoneSoup(intext)
    else:
        xml = intext

    if not xml:
        return None, None

    authors = []
    for a in xml.findAll("refauthor"):
        authors.append(a.text)

    date = xml.find("date")
    if date:
        date = cleanxml(date.__repr__())
    else:
        date = ""

    if authors == [] or date == "":
        return None, None
    else:
        return authors, date
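# A small check of processCitationXML() on an inline snippet. The tag names
# (<ref>, <refauthor>, <date>) mirror what the function looks for; the author
# names and year are made up for the example.
sample_ref = ("<ref><refauthor>Smith</refauthor> and <refauthor>Jones</refauthor>"
              " (<date>1998</date>)</ref>")
authors, date = processCitationXML(sample_ref)
# authors -> [u'Smith', u'Jones']; date -> the cleaned text of the <date> element.
# Both come back as None if either the author list or the date is missing.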
# -*- coding: utf-8 -*-
import MySQLdb
from bs4 import BeautifulStoneSoup

db = MySQLdb.connect('localhost', 'root', '80671551192', 'test')
cursor = db.cursor()

xml_cinema = open('dumps/cinema.xml')
soup = BeautifulStoneSoup(xml_cinema)

for i in soup.findAll('cinema'):
    id = int(i['id'])
    cinema = i['name'].encode('utf-8')
    city_id = int(i['id'])
    cinema_circuit_id = ''
    street_type_id = ''
    street_name = ''
    number_housing = ''
    number_hous = ''
    letter_housing = ''
    try:
        zip = int(i.zip['value'])
    except ValueError:
        zip = 0
    opening = ''
    note = ''
    code = ''
    coding = "SET NAMES 'utf8'"
    cursor.execute(coding)
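# The dump-loading script above stops before writing anything back to MySQL. A
# minimal sketch of the missing write step, assuming a `cinema` table whose
# columns match the variables collected in the loop (the table layout is an
# assumption, not taken from the original dump). insert_cinema(...) would be
# called from inside the `for i in soup.findAll('cinema'):` loop.
def insert_cinema(cursor, id, cinema, city_id, zip_code, note):
    """Insert one <cinema> row using parameterized SQL to avoid escaping issues."""
    insert_sql = ("INSERT INTO cinema (id, name, city_id, zip, note) "
                  "VALUES (%s, %s, %s, %s, %s)")
    cursor.execute(insert_sql, (id, cinema, city_id, zip_code, note))

# after the loop:
# db.commit()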
def loadAZannot(filename):
    """
        Load an AZ-annotated document from the Teufel corpus into a "scidoc" JSON file
    """

    def loadStructureProcessPara(p, glob):
        glob["p"] += 1
        newPar = {"type": "p", "id": glob["p"]}
        newPar["sentences"] = []

        for s in p.findChildren("s"):
            newSent = {"type": "s", "text": s.text, "ia": s.get("ia", ""),
                       "az": s.get("az", ""), "id": glob["s"], "refs": []}
            newSent["refs"] = [{"text": r.text, "link": 0} for r in s.findAll("ref")]
            glob["s"] += 1
            newPar["sentences"].append(newSent)

        return newPar

    def loadStructureProcessDiv(div, doc, glob):
        header = div.find("header")
        newSection = {"header": header, "paragraphs": [], "id": glob["sect"]}
        glob["sect"] += 1
        for p in div.findAll("p"):
            newPar = loadStructureProcessPara(p, glob)
            newSection["paragraphs"].append(newPar)
        doc["sections"].append(newSection)

    glob = {"sect": 0, "p": 0, "s": 0}

    f = codecs.open(filename, "rb", "utf-8", errors="ignore")
    lines = f.readlines()
    text = "".join(lines)
    soup = BeautifulStoneSoup(text)

    paper = soup.find("paper")
    title = paper.find("title").text

    newDocument = {"title": title}
    newDocument["sections"] = []
    newDocument["references"] = []
    newDocument["metadata"] = {"fileno": paper.find("fileno").text}

    authors = []
    meta = soup.find("metadata")
    for a in meta.findChildren("author"):
        authors.append(processPlainTextAuthor(a.text))
    newDocument["authors"] = authors
    newDocument["year"] = meta.find("year").text

    for ref in soup.findAll("reference"):
        processReference(ref, newDocument)

    newSection = {"header": "Abstract", "paragraphs": [], "id": glob["sect"]}
    glob["sect"] += 1
    newSection["paragraphs"].append({"type": "p", "sentences": [], "id": glob["p"]})
    glob["p"] += 1

    abstract = soup.find("abstract")
    for s in abstract.findChildren("a-s"):
        newSent = {"type": "s", "text": s.text, "ia": s["ia"], "az": s["az"],
                   "id": glob["s"], "refs": []}
        newSection["paragraphs"][-1]["sentences"].append(newSent)
        glob["s"] += 1

    newDocument["sections"].append(newSection)

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument, glob)

    sentences = getListOfSentenceObjects(newDocument)
    for s in sentences:
        for ref in s["refs"]:
            match = matchInTextReference(ref["text"], newDocument)
            if match:
##                print ref["text"]," -> ", match["authors"], match["year"]
##                print s.get("az","NO AZ")
##                print s.get("ia","NO IA")
                azs.append(s.get("az", "NO AZ"))
                ias.append(s.get("ia", "NO IA"))
                match["AZ"] = match.get("AZ", [])
                match["AZ"].append(s.get("az", "OTH"))
                match["IA"] = match.get("IA", [])
                match["IA"].append(s.get("ia", ""))
            else:
                print("NO MATCH for CITATION in REFERENCES:", ref["text"])
                pass

    # "in press", "forthcoming", "submitted", "to appear"
    # Doesn't work because of: unicode
##    for ref in newDocument["references"]:
##        k=ref.get("AZ",["NO AZ"])
##        print k, most_common(k)

    return newDocument
def read(self, filename, identifier):
    """
    """
    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
    key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    if key in cp.Corpus.metadata_index:
        metadata = cp.Corpus.metadata_index[key]
    else:
        metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        self.loadMetadata(newDocument, paper, fileno)
        debugAddMessage(newDocument, "error",
                        "PAPER NOT IN METADATA FILE! file: " + filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(newDocument["metadata"])

    # Clean up potential weird text in XML metadata
    self.makeSureValuesAreReadable(newDocument)

    # Load all references from the XML
    for ref in soup.findAll("reference"):
        self.processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            newSent_id = newDocument.addSentence(newPar_id, s.text)
            self.loadAttributesIfPresent(s, ["ia", "az", "refid"],
                                         newDocument.element_by_id[newSent_id])

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        self.loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    self.matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Doesn't work because of: unicode
##    for ref in newDocument["references"]:
##        k=ref.get("AZ",["NO AZ"])
##        print k, most_common(k)

    return newDocument
def loadSciXML(filename):
    """
        Load a Cambridge-style SciXML
    """

    def extractSentenceText(s, newSent_id, doc):
        """
            Returns a printable representation of the sentence where all references
            are now placeholders with numbers
        """
        global ref_rep_count
        ref_rep_count = 0
        newSent = doc.element_by_id[newSent_id]

        def repFunc(match):
            """
            """
            global ref_rep_count
            ref_rep_count += 1
            res = " <CIT ID=" + str(
                doc.citation_by_id[newSent["citations"][ref_rep_count - 1]]["id"]) + " />"
            return res

        text = s.renderContents()
        text = re.sub(r"<ref.*?</ref>", repFunc, text, 0, re.IGNORECASE | re.DOTALL)
        text = re.sub(r"</?refauthor>", "", text, 0, re.IGNORECASE | re.DOTALL)
        return text

    def loadStructureProcessPara(p, newDocument, parent):
        newPar_id = newDocument.addParagraph(parent)

        for s in p.findChildren("s"):
            newSent_id = newDocument.addSentence(newPar_id, "")
            newSent = newDocument.element_by_id[newSent_id]
            loadAttributesIfPresent(s, ["ia", "az", "refid"], newSent)
            refs = s.findAll("ref")
            num = len(newDocument["citations"]) + 1
##            for cit in citations:
##                r["citation_id"]=num
##                num+=1
            loaded_refs = [loadCitation(r, newSent_id, newDocument, parent) for r in refs]
            newSent["citations"] = [aref["id"] for aref in loaded_refs]
            newSent["text"] = extractSentenceText(s, newSent_id, newDocument)
            # deal with many citations within characters of each other: make them
            # know they are a cluster. TODO cluster them
            newDocument.countMultiCitations(newSent)

        return newPar_id

    def loadStructureProcessDiv(div, newDocument):
        header = div.find("header")
        if not header:
            header_id = 0
            header_text = ""
        else:
            header_id = header["id"] or 0
            header_text = re.sub(r"</?header.*?>", "", header.__repr__())

        newSection_id = newDocument.addSection("root", header_text, header_id)

        for p in div.findAll("p"):
            newPar_id = loadStructureProcessPara(p, newDocument, newSection_id)

    def loadMetadataIfExists(branch, key, doc):
        meta = branch.find(key)
        if meta:
            doc["metadata"][key] = meta.text

    def loadAttributesIfPresent(branch, attributes, sent):
        """
            For each element in attributes, if present in branch, it is added to sent
        """
        for a in attributes:
            if a in branch:
                sent[a] = branch[a]

    def loadMetadata(newDocument, paper, fileno):
        """
            Does all the painful stuff of trying to recover metadata from inside a
            badly converted SciXML file
        """
        title = paper.findChildren("title")
        newDocument["metadata"]["title"] = title[0].text if len(title) > 0 else "NO TITLE"

        if fileno == "":
            fileno = paper.find("fileno").text
        newDocument["metadata"]["fileno"] = fileno

        authors = []
        meta = soup.find("metadata")
        if not meta:
            debugAddMessage(newDocument, "error",
                            "NO METADATA IN DOCUMENT! file: " + filename)
            return newDocument

        for a in meta.findChildren("author"):
            authors.append(processPlainTextAuthor(a.text))

        if authors == []:
            authorlist = soup.find("authorlist")
            if authorlist:
                for author in authorlist.findChildren("refauthor"):
                    authors.append(author.text)
                if authors == []:
                    authors = extractAuthorsFromAuthorlist(authorlist)

        appeared = meta.find("appeared")
        if appeared:
            loadMetadataIfExists(appeared, "conference", newDocument)
            loadMetadataIfExists(appeared, "year", newDocument)

        newDocument["metadata"]["authors"] = authors
        newDocument["metadata"]["year"] = meta.find("year").text

    def sanitizeString(s, maxlen=200):
        s = s.replace("\t", " ")
        s = s[:maxlen]
        return s

    def makeSureValuesAreReadable(newDocument):
        newDocument["metadata"]["title"] = sanitizeString(newDocument["metadata"]["title"])

        newAuthors = []
        for author in newDocument["metadata"]["authors"]:
            newAuthors.append(sanitizeString(author, 70))
        newDocument["metadata"]["authors"] = newAuthors

        newSurnames = []
        for surname in newDocument["metadata"]["surnames"]:
            newSurnames.append(sanitizeString(surname, 25))
        newDocument["metadata"]["surnames"] = newSurnames

        newDocument["metadata"]["year"] = sanitizeString(newDocument["metadata"]["year"])
        if "conference" in newDocument["metadata"]:
            newDocument["metadata"]["conference"] = sanitizeString(
                newDocument["metadata"]["conference"])

    def matchCitationsWithReferences(newDocument):
        """
            Match each citation with its reference
        """
        allcitations = []
        for s in newDocument.allsentences:
            for citation_id in s["citations"]:
                cit = newDocument.citation_by_id[citation_id]

                if cit["ref_id"] != 0:
                    # the citation already has a matching reference id in the
                    # original document, use it
                    match = findMatchingReferenceByOriginalId(cit["ref_id"], newDocument)
                    if not match:
##                        print cit
                        match = newDocument.matchReferenceById(cit["ref_id"])
                else:
                    # attempt to guess which reference the citation should point to
                    match = matchCitationWithReference(cit["original_text"], newDocument)

                if match:
                    # whatever the previous case, make sure citation points to the
                    # ID of its reference
                    cit["ref_id"] = match["id"]
                    # add the citation ID to the reference's list of citations
                    match["citations"].append(cit["id"])
                    cit.pop("authors", "")
                    cit.pop("date", "")
                    cit.pop("original_text", "")
                else:
                    debugAddMessage(newDocument, "notes",
                                    "NO MATCH for CITATION in REFERENCES: "
                                    + cleanxml(cit["original_text"]) + ", ")
                    pass

    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = scidoc.SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
    key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    if key in cp.Corpus.metadata_index:
        metadata = cp.Corpus.metadata_index[key]
    else:
        metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno)
        debugAddMessage(newDocument, "error",
                        "PAPER NOT IN METADATA FILE! file: " + filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(newDocument["metadata"])

    # Clean up potential weird text in XML metadata
    makeSureValuesAreReadable(newDocument)

    # Load all references from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            newSent_id = newDocument.addSentence(newPar_id, s.text)
            loadAttributesIfPresent(s, ["ia", "az", "refid"],
                                    newDocument.element_by_id[newSent_id])

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Doesn't work because of: unicode
##    for ref in newDocument["references"]:
##        k=ref.get("AZ",["NO AZ"])
##        print k, most_common(k)

    return newDocument
def loadAZSciXML(filename):
    """
        Load a Cambridge-style SciXML
    """
    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
##    key=cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
##    if cp.Corpus.metadata_index.has_key(key):
##        metadata=cp.Corpus.metadata_index[key]
##    else:
    metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno, soup)
##        debugAddMessage(newDocument,"error","PAPER NOT IN METADATA FILE! file: "+filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(newDocument["metadata"])

    # Clean up potential weird text in XML metadata
##    makeSureValuesAreReadable(newDocument) # remove if not dealing with crap conversion stuff

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            # deals with all of the adding of a sentence
            addNewSentenceAndProcessRefs(s, newDocument, newPar_id, newSection_id)

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Doesn't work because of: unicode
##    for ref in newDocument["references"]:
##        k=ref.get("AZ",["NO AZ"])
##        print k, most_common(k)

    return newDocument