예제 #1
0
    def _new_nexson_with_crossref_metadata(doi, ref_string, include_cc0=False):
        """Build a fresh NexSON document seeded with CrossRef publication metadata.

        The CrossRef.org search API is queried with ``doi`` when it is truthy,
        otherwise with ``ref_string``. The first matching record supplies the
        citation text (HTML flattened to plain text), the publication URL and
        the year.

        doi -- DOI to search for; takes precedence over ``ref_string``.
        ref_string -- reference text to search for when no DOI is given.
        include_cc0 -- passed through unchanged to ``get_empty_nexson``.

        Returns the NexSON dict produced by ``get_empty_nexson`` with
        ``^ot:studyPublicationReference`` always set, and
        ``^ot:studyPublication`` / ``^ot:studyYear`` set only when non-empty.
        When nothing matches, or when neither ``doi`` nor ``ref_string`` is
        supplied (in which case no lookup is made), the reference field holds
        a placeholder message instead.
        """
        # prefer the DOI; fall back to the reference text
        search_term = doi or ref_string

        # With no search term there is nothing to look up; leaving the result
        # list empty routes us to the "no match" placeholder below instead of
        # failing on an unbound search_term.
        matching_records = []
        if search_term:
            # look for matching studies via CrossRef.org API
            doi_lookup_response = fetch(
                'http://search.crossref.org/dois?%s' %
                urlencode({'q': search_term})
            )
            doi_lookup_response = unicode(doi_lookup_response, 'utf-8')   # make sure it's Unicode!
            matching_records = anyjson.loads(doi_lookup_response)

        # if we got a match, grab the first (probably only) record
        if matching_records:
            match = matching_records[0]

            # Convert HTML reference string to plain text
            raw_publication_reference = match.get('fullCitation', '')
            ref_element_tree = web2pyHTMLParser(raw_publication_reference).tree
            # root of this tree is the complete mini-DOM
            ref_root = ref_element_tree.elements()[0]
            # reduce this root to plain text (strip any tags)
            meta_publication_reference = ref_root.flatten().decode('utf-8')
            meta_publication_url = match.get('doi', u'')  # already in URL form
            meta_year = match.get('year', u'')
        else:
            # Add a bogus reference string to signal the lack of results
            if doi:
                meta_publication_reference = u'No matching publication found for this DOI!'
            else:
                meta_publication_reference = u'No matching publication found for this reference string'
            meta_publication_url = u''
            meta_year = u''

        # add any found values to a fresh NexSON template
        nexson = get_empty_nexson(BY_ID_HONEY_BADGERFISH, include_cc0=include_cc0)
        nexml_el = nexson['nexml']
        nexml_el[u'^ot:studyPublicationReference'] = meta_publication_reference
        if meta_publication_url:
            nexml_el[u'^ot:studyPublication'] = {'@href': meta_publication_url}
        if meta_year:
            nexml_el[u'^ot:studyYear'] = meta_year
        return nexson
예제 #2
0
    def test_web2pyHTMLParser(self):
        """Check web2pyHTMLParser output for tag names, tag repair,
        attribute edits, self-closing tags and numeric character references.
        """
        # the parsed tag name is expected to compare equal to a plain string
        first_component = web2pyHTMLParser("<div></div>").tree.components[0]
        self.assertEqual(first_component.tag, 'div')

        # an unterminated <span> is expected to be closed in the output
        repaired = str(web2pyHTMLParser('<div>a<span>b</div>c').tree)
        self.assertEqual(repaired, "<div>a<span>b</span></div>c")

        # an attribute assigned on a looked-up element appears when serialized
        doc = web2pyHTMLParser('hello<div a="b">world</div>').tree
        target = doc.element(_a='b')
        target['_c'] = 5
        self.assertEqual(str(doc), 'hello<div a="b" c="5">world</div>')

        # tags named in `closed` are rendered in self-closing form
        with_img = str(
            web2pyHTMLParser('<div><img class="img"/></div>', closed=['img']).tree
        )
        self.assertEqual(with_img, '<div><img class="img" /></div>')

        # '<' is &#60; / &#x3C; and '>' is &#62; / &#x3E;; the decimal form is
        # checked first, then the hexadecimal form
        for markup in ('<div>&#60; &#62;</div>', '<div>&#x3C; &#x3E;</div>'):
            rendered = str(web2pyHTMLParser(markup).tree)
            self.assertEqual(rendered, '<div>&lt; &gt;</div>')
예제 #3
0
    def test_web2pyHTMLParser(self):
        """Verify how web2pyHTMLParser parses and re-serializes small HTML
        fragments: tag names, missing close tags, attribute updates,
        self-closing tags and numeric entities.
        """
        # tag should be a text string rather than bytes
        empty_div = web2pyHTMLParser("<div></div>").tree
        self.assertEqual(empty_div.components[0].tag, 'div')

        # the missing </span> is expected in the serialized output
        unbalanced_source = '<div>a<span>b</div>c'
        balanced_expected = "<div>a<span>b</span></div>c"
        self.assertEqual(
            str(web2pyHTMLParser(unbalanced_source).tree), balanced_expected
        )

        # setting '_c' on the element matched by a="b" adds attribute c="5"
        parsed = web2pyHTMLParser('hello<div a="b">world</div>').tree
        parsed.element(_a='b')['_c'] = 5
        self.assertEqual(str(parsed), 'hello<div a="b" c="5">world</div>')

        # <img> is declared via `closed`, so it serializes as '<img ... />'
        img_source = '<div><img class="img"/></div>'
        img_tree = web2pyHTMLParser(img_source, closed=['img']).tree
        self.assertEqual(str(img_tree), '<div><img class="img" /></div>')

        # greater-than sign ( > ): decimal &#62;, hexadecimal &#x3E;
        # less-than sign    ( < ): decimal &#60;, hexadecimal &#x3C;
        escaped_expected = '<div>&lt; &gt;</div>'
        # decimal references
        decimal_tree = web2pyHTMLParser('<div>&#60; &#62;</div>').tree
        self.assertEqual(str(decimal_tree), escaped_expected)
        # hexadecimal references
        hex_tree = web2pyHTMLParser('<div>&#x3C; &#x3E;</div>').tree
        self.assertEqual(str(hex_tree), escaped_expected)