示例#1
0
文件: pdfparser.py 项目: rafed/showme
def _split_reference_entries(section):
    """Split reference text into parallel lists of citation keys and entries.

    The text is cut at every '['; a piece is kept only when it also contains
    a ']'. The text before the first ']' becomes the key and the text between
    the first and second ']' becomes the entry, with newlines turned into
    spaces.
    """
    keys = []
    entries = []
    for chunk in section.strip().split('['):
        parts = chunk.split(']')
        if len(parts) < 2:
            # No closing bracket: leading text before the first "[key]".
            continue
        keys.append(parts[0])
        entries.append(parts[1].replace('\n', ' '))
    return keys, entries


def extractReferences(extracted_text, paras):
    """Parse the bibliography of a document into a list of citation dicts.

    Everything after the first case-insensitive occurrence of "references"
    is split into "[key] entry" items. The entries are posted to the FreeCite
    web service and each returned <citation> element is converted with
    Util.getCite. A 'snippets' item is attached to every citation, looked up
    by the entry's key in the result of extractCitationSnippets(citeKey,
    paras); it is an empty list when there is no match.

    Args:
        extracted_text: Full text of the document as a string.
        paras: Passed through unchanged to extractCitationSnippets
            (presumably the document's paragraphs -- confirm with caller).

    Returns:
        A list with one dict per <citation> element in the response, or an
        empty list when no "[key] entry" items were found (no request is
        sent in that case).

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    heading = "references"
    heading_at = extracted_text.lower().find(heading)
    if heading_at != -1:
        extracted_text = extracted_text[heading_at + len(heading):]
    # When the heading is missing the whole text is scanned; adding the
    # heading length to find()'s -1 would silently drop the first characters.

    citeKey, bibs = _split_reference_entries(extracted_text)
    if not bibs:
        # Nothing to parse, so skip the network round trip entirely.
        return []

    citeSnippets = extractCitationSnippets(citeKey, paras)

    print("[*] Sending request to freecite API...")

    HOST = 'http://freecite.library.brown.edu/citations/create'
    data = {"citation[]": bibs}
    # Without a timeout a stalled server would block this call forever.
    r = requests.post(HOST,
                      data=data,
                      headers={"Accept": "text/xml"},
                      timeout=60)

    print("[*] Received xml. Parsing xml...")

    etree = ET.fromstring(r.text.encode('utf-8'))

    cites = []
    for index, citation in enumerate(etree.findall("citation")):
        cite = Util.getCite(citation)
        # The service is expected to answer in request order, but guard the
        # index in case it returns more citations than entries were sent.
        key = citeKey[index] if index < len(citeKey) else None
        cite['snippets'] = citeSnippets.get(key, [])
        cites.append(cite)

    return cites
示例#2
0
    def testGetCites(self):
        """Util.getCite should turn a <citation> element into the expected dict."""
        # Sample response: a <citations> root holding one <citation> child and
        # a namespaced context-objects sibling that findall("citation") skips.
        xml = '''<citations><citation valid='true'><authors><author>I S Udvarhelyi</author><author>C A Gatsonis</author><author>A M Epstein</author><author>C L Pashos</author><author>J P Newhouse</author><author>B J McNeil</author></authors><title>Acute Myocardial Infarction in the Medicare population: process of care and clinical outcomes</title><journal>Journal of the American Medical Association</journal><volume>18</volume><pages>2530--2536</pages><year>1992</year><raw_string>Udvarhelyi, I.S., Gatsonis, C.A., Epstein, A.M., Pashos, C.L., Newhouse, J.P. and McNeil, B.J. Acute Myocardial Infarction in the Medicare population: process of care and clinical outcomes. Journal of the American Medical Association, 1992; 18:2530-2536.</raw_string></citation><ctx:context-objects xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='info:ofi/fmt:xml:xsd:ctx http://www.openurl.info/registry/docs/info:ofi/fmt:xml:xsd:ctx' xmlns:ctx='info:ofi/fmt:xml:xsd:ctx'><ctx:context-object timestamp='2018-03-21T17:40:19-04:00' encoding='info:ofi/enc:UTF-8' version='Z39.88-2004' identifier=''><ctx:referent><ctx:metadata-by-val><ctx:format>info:ofi/fmt:xml:xsd:journal</ctx:format><ctx:metadata><journal xmlns:rft='info:ofi/fmt:xml:xsd:journal' xsi:schemaLocation='info:ofi/fmt:xml:xsd:journal http://www.openurl.info/registry/docs/info:ofi/fmt:xml:xsd:journal'><rft:atitle>Acute Myocardial Infarction in the Medicare population: process of care and clinical outcomes</rft:atitle><rft:spage>2530</rft:spage><rft:date>1992</rft:date><rft:stitle>Journal of the American Medical Association</rft:stitle><rft:genre>article</rft:genre><rft:volume>18</rft:volume><rft:epage>2536</rft:epage><rft:au>I S Udvarhelyi</rft:au><rft:au>C A Gatsonis</rft:au><rft:au>A M Epstein</rft:au><rft:au>C L Pashos</rft:au><rft:au>J P Newhouse</rft:au><rft:au>B J McNeil</rft:au></journal></ctx:metadata></ctx:metadata-by-val></ctx:referent></ctx:context-object></ctx:context-objects></citations>'''
        # Expected result: authors reduced to surnames, raw_string omitted.
        expected = {
            'authors': [
                "Udvarhelyi", "Gatsonis", "Epstein", "Pashos", "Newhouse",
                "McNeil"
            ],
            "title":
            "Acute Myocardial Infarction in the Medicare population: process of care and clinical outcomes",
            "journal":
            "Journal of the American Medical Association",
            "volume":
            "18",
            "pages":
            "2530--2536",
            "year":
            "1992"
        }

        root = ET.fromstring(xml)
        citations = root.findall("citation")
        # Without this check the loop below would pass vacuously if no
        # <citation> element were matched.
        self.assertEqual(len(citations), 1)
        for citation in citations:
            self.assertEqual(Util.getCite(citation), expected)