Example #1
File: books.py  Project: bindx/EE-Book
    def open(self, book_id=None):
        if book_id:
            self.book_id = book_id
        if not self.book_id:
            raise Exception('Book id not set')

        self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
        sz_mult = 1.0 / (1024**2)
        result = u'%.1f' % (self.size * sz_mult)
        self.size = u'<0.1' if result == u'0.0' else result

        self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id),
                                 'r')
        soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))

        oebps = soup.findAll('rootfile')[0]['full-path']
        folder = oebps.rfind(os.sep)
        self.oebps_folder = '' if folder == -1 else oebps[:folder + 1]  # find the OEBPS folder name

        oebps_content = self.f.read(oebps)
        self.read_doc_props(oebps_content)

        opf_bs = BeautifulStoneSoup(oebps_content)
        ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
        ncx = self.oebps_folder + ncx['href']  # full path to the ncx file

        ncx_bs = BeautifulStoneSoup(self.f.read(ncx))

        self.chapters = [
            (nav.navlabel.text, nav.content['src'])
            for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')
        ]
        self.cover_href = self.chapters[0][1]  # cover path
Example #2
class FollowThatTag(SoupTest):
    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        ml = Display.write(self)
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>", "lxml")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testTextNavigation(self):
        soup = BeautifulSoup('<url>http://cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d</url><title>Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)</title>', "lxml")
        baz = soup.find(text='Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)')
        self.assertEqual(baz.findParent("title").name, "title")
Example #3
    def loadJATSSentence(self, s, newDocument, par_id, section_id):
        """
            Loads a JATS sentence (already split)

            :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
            :param newDocument: SciDoc
            :param par_id: id of the paragraph containing this sentence
            :param section_id: id of the section containing the paragraph
        """
        newSent = newDocument.addSentence(par_id, "")
        s_soup = BeautifulStoneSoup(s)

        refs = s_soup.findAll("xref", {"ref-type": "bibr"})
        citations_found = []
        for r in refs:
            citations_found.extend(
                self.loadJATSCitation(r,
                                      newSent["id"],
                                      newDocument,
                                      section=section_id))

        # xref tags whose ref-type is not 'bibr' (i.e. not bibliographic citations)
        non_refs = s_soup.findAll(
            lambda tag: tag.name.lower() == "xref" and tag.get("ref-type") and
            tag["ref-type"].lower() != "bibr")
        for nr in non_refs:
            nr.name = "inref"

        newSent["citations"] = [acit["id"] for acit in citations_found]
        # TODO replace <xref> tags with <cit> tags
        newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(
            s_soup, newSent["id"])
        ##            print(newSent["text"])
        # deal with many citations within characters of each other: make them know they are a cluster
        # TODO cluster citations? Store them in some other way?
        newDocument.countMultiCitations(newSent)
Example #4
def extract_info_from_html():
    from bs4 import BeautifulStoneSoup
    import re
    html = open("data/url.html").read()
    soup = BeautifulStoneSoup(html)
    # Collect the href attribute of every <a> tag (regex over the stringified tag list)
    inputTag = soup.findAll("a")
    inputTag = str(inputTag).split(",")
    m = [re.search(" +href=\"(.*?)\"", i) for i in inputTag]
    urls = [i.group(1) for i in m]

    code = [
        i[9:-9].replace("<", "")
        for i in str(soup.findAll('strong')).split(",")
    ]
    city = [
        i.split('<span class="uni-code">')[0].replace("\t", "").replace(
            "</span>", "").replace("\n", "") for i in html.split(
                '<i class="fa fa-map-marker" aria-hidden="true"></i>')[1:]
    ]
    abbr = [
        i.split('</div>')[0].replace("\t", "").replace("</span>",
                                                       "").replace("\n", "")
        for i in html.split('<div class="name-group">')[1::2]
    ]

    # ADD CODE TO UNI_INFO
    map_abbr_code = [{
        "abbr": m,
        "code": n
    } for m, n in zip(abbr, code) if m != ""]
    import json
    uni = json.load(open("data/university.json"))

    len(uni)
    new_uni = []
    abbrs = []
    for i in uni:
        if (i["abbr"] in abbrs):
            continue
        else:
            for j in map_abbr_code:
                if (j["abbr"] == i["abbr"]):
                    i["code"] = j["code"]
                    break
            new_uni.append(i)
            abbrs.append(i["abbr"])

    with open('data/university_add_code.json', 'w') as outfile:
        json.dump(new_uni, outfile, ensure_ascii=False, indent=4)
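
The href extraction above round-trips the tags through str() and a regex; a minimal sketch of the more direct route (same idea, assuming bs4 is available; not part of the original script):

from bs4 import BeautifulSoup

def extract_hrefs(html):
    # Parse once and read the href attribute off each <a> tag directly,
    # skipping anchors that carry no href.
    soup = BeautifulSoup(html, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]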
Example #5
    def _parse_request(self):
        """
        Parses various parameters from _request_xml into _request_params.
        We need to override parse here as Microsoft Azure doesn't send
        AssertionConsumerServiceURL (ACS_URL)
        """
        # Minimal test to verify that it isn't still binary-encoded:
        if not self._request_xml.strip().startswith('<'):
            raise Exception('RequestXML is not valid XML; '
                            'it may need to be decoded or decompressed.')

        soup = BeautifulStoneSoup(self._request_xml)
        request = soup.findAll()[0]

        if request.get('AssertionConsumerServiceURL', None):
            raise Exception(
                'Invalid Azure request. AssertionConsumerServiceURL exists!')

        params = {}
        params['ACS_URL'] = AZURE_ACS_URL
        params['REQUEST_ID'] = request.get('id', request.get('ID'))

        params['REQUEST_ISSUER'] = self._get_request_issuer(request)

        params['DESTINATION'] = request.get('Destination', '')
        params['PROVIDER_NAME'] = request.get('ProviderName', '')

        self._request_params = params

        # Set subject format - overrides the value set in _reset()
        self._subject_format = AZURE_SUBJECT_FORMAT
Example #6
    def extract_tags_bs4(self):
        """
        Using: BeatifulSoup's XML parser
        Returns XML data in dict format 
        """
        soup = Soup(self.query_xml) # XML as a string
        self.entries = soup.findAll('entry') # list of <entry>'s
        find_authors = lambda x: x.find('name').string
        for entry in self.entries:
            # strip down entry ID in url to (say) -> 'abs/math/0507289v1'
            entry_id = urlparse(entry.find('id').string).path.lstrip('/') 
            title = entry.find('title').string
            summary = entry.find('summary').string
            # findAll() for multiple entries 
            authors = entry.findAll('author') 
            # returns list of data-type: BeautifulSoup.Tag
            # PYLINT chatters: authors = map(self.find_authors, authors)
            # using list comprehension instead
            authors = [find_authors(i) for i in authors]

            published = entry.find('published').string
            meta = { 'title': title, 'summary': summary, \
                  'authors': authors, 'published': published }
            self.data[entry_id] = meta

        return self.data # python dict
Example #7
def analyze_site_map():
    r = requests.get('{}{}sitemap.xml'.format(app.config['WEB_PROTOCOL'],
                                              app.config['DOMAIN']))

    soup = Soup(r.content)
    locs = soup.findAll('loc')
    return [loc.string for loc in locs]
Example #8
    def getSeriesDetailsByName(self, serieName):

        if serieName in IGNORE_SHOWS:
            return None

        print 'checking: ' + serieName

        if serieName in KNOWN_SHOWS.keys():
            url = GET_SERIES_URL % (urllib.quote(
                KNOWN_SHOWS[serieName]['TVDBName']))
        else:
            url = GET_SERIES_URL % (urllib.quote(serieName))

        try:
            # Change the User Agent
            USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'

            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

            req = urllib2.Request(url)
            req.add_header('User-Agent', USER_AGENT)

            resp = opener.open(req)

            soup = BeautifulStoneSoup(resp.read())
            resp.close()

            if len(soup.findAll('series')) == 1:
                self.saveSerieDetail(serieName, soup.series)
            else:
                for serie in soup.findAll('series'):
                    if serie.seriesname.string == serieName:
                        self.saveSerieDetail(serieName, serie)

            if serieName in KNOWN_SHOWS.keys():
                return KNOWN_SHOWS[serieName]
            return None
        except:
            print 'Error: ' + url
            return None
Example #9
    def parse_data(self, url):
        '''Collects the data into a dictionary'''
        request = self.session.get(url, headers=self.headers)
        if request.status_code == 200:
            soup = BeautifulStoneSoup(request.content)
            if not (bool(soup.find('div', {"class": 'error404__text'}))
                    or bool(soup.find('div', {"class": 'nothing-search'}))
                    or bool(soup.find('div', {"id": 'productList'}))):

                try:
                    name_of_product = soup.find('h1').next_element
                except Exception:
                    raise Format_Exeption('name', url)

                try:
                    price_for_all = soup.find(
                        'span', {
                            "class": "item__price item__price--normal-left"
                        }).next_element.replace(" ", "").replace("\n", "")
                except Exception:
                    price_for_all = "Нет в наличии"
                try:
                    price_for_registered = soup.find(
                        'span', {
                            "class": "item__price item__price--red-bold"
                        }).next_element.replace(" ", "").replace("\n", "")
                except Exception:
                    price_for_registered = "Нет в наличии"

                try:
                    reference = soup.findAll(
                        'div', {"class": "item__card-info-articul"})
                    reference = reference[1].next_element
                    reference = str(reference).split()[2].replace("-", '')
                except Exception:
                    reference = "Нет номера"
                final = {
                    "name_of_product": name_of_product,
                    "price_for_all": price_for_all,
                    "price_for_registered": price_for_registered,
                    "reference": reference,
                    "url": url
                }
                return final
            else:
                print("Не тот формат, вот ссылка {0}".format(url))
                raise Format_Exeption
        else:
            raise Connection_Exception
Example #10
    def read(self, xml, filename):
        """
            Load a document from the Athar corpus

            Args:
                xml: full xml string
        """
        ##        # this solves a "bug" in BeautifulStoneSoup with "sec" tags
        ##        BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

        soup = BeautifulStoneSoup(xml)

        paper_data_node = soup.find("div", {"class": "dstPaperData"})
        paper_data = {
            "id": paper_data_node.text,
            "title": "",
            "authors": "",
        }
        title = paper_data_node.find("div", {"class": "dstPaperTitle"})
        if title:
            paper_data["title"] = title.text

        authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
        if authors:
            author_chunks = authors.text.split(";")
            author_list = []
            for author in author_chunks:
                chunks = author.split(",")
                author_list.append({"given": chunks[1], "family": chunks[0]})
            paper_data["authors"] = author_list


##        print(paper_data)

        all_contexts = []
        all_docs = []
        document_nodes = soup.findAll("table", {"class": "srcPaper"})
        for index, document_node in enumerate(document_nodes):
            try:
                doc, contexts = self.loadDocumentNode(document_node,
                                                      paper_data, index)
                all_docs.append(doc)
                all_contexts.extend(contexts)
            except ValueError:
                print("Error:", sys.exc_info()[1])
                break
        return all_docs, all_contexts
Example #11
    def parse(self):
        '''
        The method builds a JSON list of the site's recipe URLs and
        saves it to MEDIA_ROOT/parser/source.js.
        Depending on the settings it either analyses the sitemap
        or parses the HTML pages.

        '''

        # Parsing via the sitemap
        if hasattr(settings, 'PARSER__URL_SOURCE') and settings.PARSER__URL_SOURCE == 'sitemap':

            xml = None

            if not hasattr(settings, 'PARSER__SITEMAP_URL') or not settings.PARSER__SITEMAP_URL:
                print('PARSER__SITEMAP_URL is not defined')
            else:
                try:
                    with urllib.request.urlopen(settings.PARSER__SITEMAP_URL) as response:
                        xml = response.read()
                except Exception:
                    xml = None

            if xml:
                sitemap = Soup(xml)
                urls = sitemap.findAll('url')
                for u in urls:
                    loc = u.find('loc').string
                    self._add_location(loc)
        else:
            # Parsing the HTML pages by tags
            if not hasattr(settings, 'PARSER__CELL_HOMEPAGE') or not settings.PARSER__CELL_HOMEPAGE:
                print('PARSER__CELL_HOMEPAGE is not defined')
                return False

            # Counter of recursive calls to _parse_html
            self._recursion_counter = 0

            self._parse_html(settings.PARSER__CELL_HOMEPAGE)

        self._save()

        return self.json_file_path
Example #12
    def _parse_request(self):
        """
        Parses various parameters from _request_xml into _request_params.
        """
        # Minimal test to verify that it isn't still binary-encoded:
        if isinstance(self._request_xml, bytes):
            request_xml = self._request_xml.decode('utf-8')
        else:
            request_xml = self._request_xml
        if not request_xml.strip().startswith('<'):
            raise Exception('RequestXML is not valid XML; '
                            'it may need to be decoded or decompressed.')

        soup = BeautifulStoneSoup(request_xml)
        request = soup.findAll()[0]
        params = {}
        params['ACS_URL'] = request.get('AssertionConsumerServiceURL')
        params['REQUEST_ID'] = request.get('id', request.get('ID'))
        params['DESTINATION'] = request.get('Destination', '')
        params['PROVIDER_NAME'] = request.get('ProviderName', '')
        self._request_params = params
Example #13
def processCitationXML(intext):
    """
        Extract the authors and date of an in-text citation <ref> from the XML DOM
    """
    if isinstance(intext, six.string_types):
        xml = BeautifulStoneSoup(intext)
    else:
        xml = intext

    if not xml:
        return None, None
    authors = []
    for a in xml.findAll("refauthor"):
        authors.append(a.text)
    date = xml.find("date")
    if date:
        date = cleanxml(date.__repr__())
    else:
        date = ""

    if authors == [] or date == "":
        return None, None
    else:
        return authors, date
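
A minimal usage sketch for processCitationXML (the XML fragment below is purely illustrative, and it assumes cleanxml simply strips the tags around the date):

sample = ('<ref><refauthor>Smith, J.</refauthor> and '
          '<refauthor>Jones, A.</refauthor> (<date>1999</date>)</ref>')
authors, date = processCitationXML(sample)
# Expected, roughly: authors == ['Smith, J.', 'Jones, A.'] and date == '1999'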
Example #14
#-*- coding: utf-8 -*-

import MySQLdb

from bs4 import BeautifulStoneSoup

db = MySQLdb.connect('localhost', 'root', '80671551192', 'test')
cursor = db.cursor()
xml_cinema = open('dumps/cinema.xml')
soup = BeautifulStoneSoup(xml_cinema)

for i in soup.findAll('cinema'):
	id = int(i['id'])
	cinema = i['name'].encode('utf-8')
	city_id = int(i['id'])
	cinema_circuit_id = ''
	street_type_id = ''
	street_name = ''
	number_housing = ''
	number_hous = ''
	letter_housing = ''
	try:
		zip = int(i.zip['value'])
	except ValueError:
		zip = 0
	opening = ''
	note = ''
	code = ''

	coding = "SET NAMES 'utf8'"
	cursor.execute(coding)
Example #15
def loadAZannot(filename):
	"""
		Load an AZ-annotated document from the Teufel corpus into a "scidoc" JSON file
	"""

	def loadStructureProcessPara(p, glob):
		glob["p"]+=1
		newPar={"type":"p", "id":glob["p"]}
		newPar["sentences"]=[]

		for s in p.findChildren("s"):
			newSent={"type":"s","text":s.text,"ia":s.get("ia",""),"az":s.get("az",""),"id":glob["s"],"refs":[]}
			newSent["refs"]=[{"text":r.text, "link":0} for r in s.findAll("ref")]
			glob["s"]+=1
			newPar["sentences"].append(newSent)

		return newPar

	def loadStructureProcessDiv(div, doc, glob):
		header=div.find("header")

		newSection={"header":header, "paragraphs":[], "id":glob["sect"]}
		glob["sect"]+=1
		for p in div.findAll("p"):
			newPar=loadStructureProcessPara(p,glob)
			newSection["paragraphs"].append(newPar)

		doc["sections"].append(newSection)

	glob={"sect":0,"p":0,"s":0}


	f=codecs.open(filename,"rb","utf-8", errors="ignore")
	lines=f.readlines()
	text="".join(lines)
	soup=BeautifulStoneSoup(text)

	paper=soup.find("paper")
	title=paper.find("title").text

	newDocument={"title":title}
	newDocument["sections"]=[]
	newDocument["references"]=[]
	newDocument["metadata"]={"fileno":paper.find("fileno").text}

	authors=[]
	meta=soup.find("metadata")
	for a in meta.findChildren("author"):
		authors.append(processPlainTextAuthor(a.text))

	newDocument["authors"]=authors
	newDocument["year"]=meta.find("year").text

	for ref in soup.findAll("reference"):
		processReference(ref, newDocument)

	newSection={"header":"Abstract", "paragraphs":[], "id":glob["sect"]}
	glob["sect"]+=1
	newSection["paragraphs"].append({"type":"p", "sentences":[], "id":glob["p"]})
	glob["p"]+=1

	abstract=soup.find("abstract")
	for s in abstract.findChildren("a-s"):
		newSent={"type":"s","text":s.text,"ia":s["ia"],"az":s["az"],"id":glob["s"], "refs":[]}
		newSection["paragraphs"][-1]["sentences"].append(newSent)
		glob["s"]+=1

	newDocument["sections"].append(newSection)

	for div in soup.findAll("div"):
		loadStructureProcessDiv(div, newDocument, glob)

	sentences=getListOfSentenceObjects(newDocument)
	for s in sentences:
		for ref in s["refs"]:
			match=matchInTextReference(ref["text"],newDocument)
			if match:
##				print ref["text"]," -> ", match["authors"], match["year"]
##				print s.get("az","NO AZ")
##				print s.get("ia","NO IA")
				azs.append(s.get("az","NO AZ"))
				ias.append(s.get("ia","NO IA"))
				match["AZ"]=match.get("AZ",[])
				match["AZ"].append(s.get("az","OTH"))
				match["IA"]=match.get("AZ",[])
				match["IA"].append(s.get("az",""))
			else:
				print("NO MATCH for CITATION in REFERENCES:", ref["text"])
				pass

## "in press", "forthcoming", "submitted", "to appear"
# Does not work because of: unicode
##	for ref in newDocument["references"]:
##		k=ref.get("AZ",["NO AZ"])
##		print k, most_common(k)

	return newDocument
Example #16
    def read(self, filename, identifier):
        """
        """
        # main loadSciXML
        text = loadFileText(filename)
        soup = BeautifulStoneSoup(text)

        fileno = soup.find("docno")
        fileno = fileno.text if fileno else ""

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        newDocument["metadata"]["filename"] = os.path.basename(filename)
        newDocument["metadata"]["filepath"] = filename

        paper = soup.find("paper")
        if not paper:
            debugAddMessage(newDocument, "error",
                            "NO <PAPER> IN THIS PAPER! file: " + filename)
            return newDocument

        # Load metadata, either from corpus or from file
        key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
        if key in cp.Corpus.metadata_index:
            metadata = cp.Corpus.metadata_index[key]
        else:
            metadata = None

        if metadata:
            newDocument["metadata"]["conference"] = ""
            for field in metadata:
                newDocument["metadata"][field] = metadata[field]
        else:
            self.loadMetadata(newDocument, paper, fileno)
            debugAddMessage(newDocument, "error",
                            "PAPER NOT IN METADATA FILE! file: " + filename)

        newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
            newDocument["metadata"])

        # Clean up potential weird text in XML metadata
        self.makeSureValuesAreReadable(newDocument)

        # Load all references from the XML
        for ref in soup.findAll("reference"):
            self.processReferenceXML(ref, newDocument)

        # Load Abstract
        abstract = soup.find("abstract")
        if not abstract:
            debugAddMessage(newDocument, "error",
                            "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
            # TODO: LOAD first paragraph as abstract
        else:
            newSection_id = newDocument.addSection("root", "Abstract")
            newPar_id = newDocument.addParagraph(newSection_id)

            for s in abstract.findChildren("a-s"):
                newSent_id = newDocument.addSentence(newPar_id, s.text)
                self.loadAttributesIfPresent(
                    s, ["ia", "az", "refid"],
                    newDocument.element_by_id[newSent_id])

            newDocument.abstract = newDocument.element_by_id[newSection_id]

        for div in soup.findAll("div"):
            self.loadStructureProcessDiv(div, newDocument)

        # try to match each citation with its reference
        self.matchCitationsWithReferences(newDocument)

        # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Does not work because of: unicode
        ##    for ref in newDocument["references"]:
        ##        k=ref.get("AZ",["NO AZ"])
        ##        print k, most_common(k)

        return newDocument
Example #17
def loadSciXML(filename):
    """
        Load a Cambridge-style SciXML

    """
    def extractSentenceText(s, newSent_id, doc):
        """
            Returns a printable representation of the sentence where all references are now placeholders with numbers
        """
        global ref_rep_count
        ref_rep_count = 0

        newSent = doc.element_by_id[newSent_id]

        def repFunc(match):
            """
            """
            global ref_rep_count
            ref_rep_count += 1

            res = " <CIT ID=" + str(
                doc.citation_by_id[newSent["citations"][ref_rep_count -
                                                        1]]["id"]) + " />"
            return res

        text = s.renderContents()
        text = re.sub(r"<ref.*?</ref>", repFunc, text, 0,
                      re.IGNORECASE | re.DOTALL)
        text = re.sub(r"</?refauthor>", "", text, 0, re.IGNORECASE | re.DOTALL)
        return text

    def loadStructureProcessPara(p, newDocument, parent):
        newPar_id = newDocument.addParagraph(parent)

        for s in p.findChildren("s"):
            newSent_id = newDocument.addSentence(newPar_id, "")
            newSent = newDocument.element_by_id[newSent_id]
            loadAttributesIfPresent(s, ["ia", "az", "refid"], newSent)
            refs = s.findAll("ref")
            num = len(newDocument["citations"]) + 1
            ##            for cit in citations:
            ##                r["citation_id"]=num
            ##                num+=1
            loaded_refs = [
                loadCitation(r, newSent_id, newDocument, parent) for r in refs
            ]

            newSent["citations"] = [aref["id"] for aref in loaded_refs]
            newSent["text"] = extractSentenceText(s, newSent_id, newDocument)
            newDocument.countMultiCitations(
                newSent
            )  # deal with many citations within characters of each other: make them know they are a cluster TODO cluster them

        return newPar_id

    def loadStructureProcessDiv(div, newDocument):
        header = div.find("header")
        if not header:
            header_id = 0
            header_text = ""
        else:
            header_id = header["id"] or 0
            header_text = re.sub(r"</?header.*?>", "", header.__repr__())

        newSection_id = newDocument.addSection("root", header_text, header_id)

        for p in div.findAll("p"):
            newPar_id = loadStructureProcessPara(p, newDocument, newSection_id)

    def loadMetadataIfExists(branch, key, doc):
        meta = branch.find(key)
        if meta:
            doc["metadata"][key] = meta.text

    def loadAttributesIfPresent(branch, attributes, sent):
        """
            For each element in attributes, if present in branch, it is added to sent
        """
        for a in attributes:
            if branch.get(a) is not None:
                sent[a] = branch[a]

    def loadMetadata(newDocument, paper, fileno):
        """
            Does all the painful stuff of trying to recover metadata from inside a badly converted
            SciXML file
        """
        title = paper.findChildren("title")
        newDocument["metadata"]["title"] = title[0].text if len(
            title) > 0 else "NO TITLE"

        if fileno == "":
            fileno = paper.find("fileno").text

        newDocument["metadata"]["fileno"] = fileno

        authors = []
        meta = soup.find("metadata")
        if not meta:
            debugAddMessage(newDocument, "error",
                            "NO METADATA IN DOCUMENT! file:" + filename)
            return newDocument

        for a in meta.findChildren("author"):
            authors.append(processPlainTextAuthor(a.text))

        if authors == []:
            authorlist = soup.find("authorlist")

            if authorlist:
                for author in authorlist.findChildren("refauthor"):
                    authors.append(author.text)

                if authors == []:
                    authors = extractAuthorsFromAuthorlist(authorlist)

        appeared = meta.find("appeared")
        if appeared:
            loadMetadataIfExists(appeared, "conference", newDocument)
            loadMetadataIfExists(appeared, "year", newDocument)

        newDocument["metadata"]["authors"] = authors
        newDocument["metadata"]["year"] = meta.find("year").text

    def sanitizeString(s, maxlen=200):
        s = s.replace("\t", " ")
        s = s[:maxlen]
        return s

    def makeSureValuesAreReadable(newDocument):
        newDocument["metadata"]["title"] = sanitizeString(
            newDocument["metadata"]["title"])
        newAuthors = []
        for author in newDocument["metadata"]["authors"]:
            newAuthors.append(sanitizeString(author, 70))
        newDocument["metadata"]["authors"] = newAuthors

        newSurnames = []
        for surname in newDocument["metadata"]["surnames"]:
            newSurnames.append(sanitizeString(surname, 25))
        newDocument["metadata"]["surnames"] = newSurnames

        newDocument["metadata"]["year"] = sanitizeString(
            newDocument["metadata"]["year"])
        if "conference" in newDocument["metadata"]:
            newDocument["metadata"]["conference"] = sanitizeString(
                newDocument["metadata"]["conference"])

    def matchCitationsWithReferences(newDocument):
        """
            Match each citation with its reference
        """
        allcitations = []
        for s in newDocument.allsentences:
            for citation_id in s["citations"]:
                cit = newDocument.citation_by_id[citation_id]

                if cit["ref_id"] != 0:  # the citation already has a matching reference id in the original document, use it
                    match = findMatchingReferenceByOriginalId(
                        cit["ref_id"], newDocument)
                    if not match:
                        ##                        print cit
                        match = newDocument.matchReferenceById(cit["ref_id"])
                else:
                    # attempt to guess which reference the citation should point to
                    match = matchCitationWithReference(cit["original_text"],
                                                       newDocument)

                if match:
                    # whatever the previous case, make sure citation points to the ID of its reference
                    cit["ref_id"] = match["id"]
                    match["citations"].append(
                        cit["id"]
                    )  # add the citation ID to the reference's list of citations
                    cit.pop("authors", "")
                    cit.pop("date", "")
                    cit.pop("original_text", "")
                else:
                    debugAddMessage(
                        newDocument, "notes",
                        "NO MATCH for CITATION in REFERENCES: " +
                        cleanxml(cit["original_text"]) + ", ")
                    pass

    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = scidoc.SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
    key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    if key in cp.Corpus.metadata_index:
        metadata = cp.Corpus.metadata_index[key]
    else:
        metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno)
        debugAddMessage(newDocument, "error",
                        "PAPER NOT IN METADATA FILE! file: " + filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])

    # Clean up potential weird text in XML metadata
    makeSureValuesAreReadable(newDocument)

    # Load all references from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            newSent_id = newDocument.addSentence(newPar_id, s.text)
            loadAttributesIfPresent(s, ["ia", "az", "refid"],
                                    newDocument.element_by_id[newSent_id])

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Does not work because of: unicode
    ##    for ref in newDocument["references"]:
    ##        k=ref.get("AZ",["NO AZ"])
    ##        print k, most_common(k)

    return newDocument
Example #18
def loadAZSciXML(filename):
    """
        Load a Cambridge-style SciXML

    """

    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
##    key=cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
##    if cp.Corpus.metadata_index.has_key(key):
##        metadata=cp.Corpus.metadata_index[key]
##    else:
    metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno, soup)


##        debugAddMessage(newDocument,"error","PAPER NOT IN METADATA FILE! file: "+filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])

    # Clean up potential weird text in XML metadata
    ##    makeSureValuesAreReadable(newDocument) # remove if not dealing with crap conversion stuff

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            addNewSentenceAndProcessRefs(
                s, newDocument, newPar_id,
                newSection_id)  # deals with all of the adding of a sentence

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Does not work because of: unicode
    ##    for ref in newDocument["references"]:
    ##        k=ref.get("AZ",["NO AZ"])
    ##        print k, most_common(k)

    return newDocument