class TestBibtexParsing2(unittest.TestCase):
    """More BibtexParser tests: custom field names and encoded characters.

    Every test reads one fixture file named by the ``setup`` module, runs it
    through ``BibtexParser.getEntries`` and checks selected fields of the
    returned entries.
    """

    def setUp(self):
        """Attach a new BibtexParser instance to ``self.parser``."""
        self.parser = BibtexParser()

    @staticmethod
    def _read_source(path):
        """Return the content of the file at *path* and close the handle."""
        handle = open(path, 'r')
        try:
            return handle.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            handle.close()

    def testBibtexWithCustomFieldnames(self):
        """Check standard and custom fields of the first two entries."""
        source = self._read_source(setup.BIBTEX_TEST_BIB2)
        results = self.parser.getEntries(source)

        r1 = results[0]
        self.assertEqual(r1['month'], 'Mar')
        self.assertEqual(r1['doi'], '10.1002/(ISSN)1097-0231')
        self.assertEqual(r1['date-added'], '2008-08-06 17:48:48 +0200')
        self.assertEqual(r1['rating'], '0')
        # 'keywords' is expected as a list, not as one string.
        self.assertEqual(r1['keywords'], ['biology', 'chemistry'])

        r2 = results[1]
        self.assertEqual(r2['keywords'], ['something strange'])

    def testBibtexEncodedChars(self):
        """Check that fields with non-ASCII characters come back UTF-8 encoded."""
        source = self._read_source(setup.BIBTEX_TEST_BIB3)
        results = self.parser.getEntries(source)
        self.assertEqual(len(results), 2)

        # NOTE(review): the literals are decoded as ISO-8859-15, which
        # presumably matches the encoding of this test module - confirm.
        r = results[0]
        self.assertEqual(
            r['title'],
            unicode('Der Fürst', 'iso-8859-15').encode('utf-8'))
        self.assertEqual(
            r['publisher'],
            unicode('Alfred Körner Verlag', 'iso-8859-15').encode('utf-8'))

        r = results[1]
        self.assertEqual(
            r['address'],
            unicode('Göttingen', 'iso-8859-15').encode('utf-8'))
class TestBibtexParsing2(unittest.TestCase):
    """More BibtexParser tests: custom field names and encoded characters.

    Every test reads one fixture file named by the ``setup`` module, runs it
    through ``BibtexParser.getEntries`` and checks selected fields of the
    returned entries.
    """

    def setUp(self):
        """Attach a new BibtexParser instance to ``self.parser``."""
        self.parser = BibtexParser()

    @staticmethod
    def _read_source(path):
        """Return the content of the file at *path* and close the handle."""
        handle = open(path, 'r')
        try:
            return handle.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            handle.close()

    def testBibtexWithCustomFieldnames(self):
        """Check standard and custom fields of the first two entries."""
        source = self._read_source(setup.BIBTEX_TEST_BIB2)
        results = self.parser.getEntries(source)

        r1 = results[0]
        self.assertEqual(r1['month'], 'Mar')
        self.assertEqual(r1['doi'], '10.1002/(ISSN)1097-0231')
        self.assertEqual(r1['date-added'], '2008-08-06 17:48:48 +0200')
        self.assertEqual(r1['rating'], '0')
        # 'keywords' is expected as a list, not as one string.
        self.assertEqual(r1['keywords'], ['biology', 'chemistry'])

        r2 = results[1]
        self.assertEqual(r2['keywords'], ['something strange'])

    def testBibtexEncodedChars(self):
        """Check that fields with non-ASCII characters come back UTF-8 encoded."""
        source = self._read_source(setup.BIBTEX_TEST_BIB3)
        results = self.parser.getEntries(source)
        self.assertEqual(len(results), 2)

        # NOTE(review): the literals are decoded as ISO-8859-15, which
        # presumably matches the encoding of this test module - confirm.
        r = results[0]
        self.assertEqual(
            r['title'],
            unicode('Der Fürst', 'iso-8859-15').encode('utf-8'))
        self.assertEqual(
            r['publisher'],
            unicode('Alfred Körner Verlag', 'iso-8859-15').encode('utf-8'))

        r = results[1]
        self.assertEqual(
            r['address'],
            unicode('Göttingen', 'iso-8859-15').encode('utf-8'))
def extract_all_bibtex(html, start, limit, delay=30):
    """Look up the BibTeX links of a results page and parse what they return.

    Every ``div`` of class ``gs_ri`` in the page is treated as one record.
    For each record the BibTeX link is fetched and the response is parsed
    with ``BibtexParser.parseEntry``. Records with a non-empty
    ``'publication_year'`` are collected.

    Args:
        html: Page source; decoded as ASCII, undecodable bytes are dropped.
        start: Offset added to the record index in the progress output.
        limit: Value the record count of the page is compared with to
            compute the returned flag.
        delay: Seconds to sleep after each fetched record so the requests
            are spaced out. Defaults to 30, the value used previously.

    Returns:
        A tuple ``(count, results, flag)``:

        * ``(irec, results, False)`` if fetching a BibTeX link failed,
        * ``(irec, results, True)`` if a parsed record has no printable
          ``'pid'``,
        * otherwise ``(nrec, results, nrec == limit)``, where ``nrec`` is the
          number of ``<p>`` elements in the page minus two.
    """
    from BeautifulSoup import BeautifulSoup
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    html = html.decode('ascii', 'ignore')
    soup = BeautifulSoup(html)
    results = []
    for irec, record in enumerate(soup('div', attrs={'class': 'gs_ri'})):
        print(start + irec)
        # Skip records that are just citations, as they are often erroneous
        if 'CITATION' in str(record.contents[0]):
            continue
        # If there's no BibTeX link, we're at the end:
        if 'Import' not in str(record.contents[-1]):
            break
        # Bibtex links are tagged gs_fl
        # NOTE(review): this still raises when the record has no 'gs_fl' div
        # or no 'bib?' link - confirm whether such records should be skipped.
        links = record.find('div', {'class': 'gs_fl'})
        biblink = [link for link in links('a') if 'bib?' in str(link)][0]
        # Takes the value of the first attribute of the link.
        url_end = str(biblink.attrs[0][1])
        url = SEARCH_HOST + url_end
        print(url)
        req = Request(url, None, headers)
        try:
            handle = urlopen(req)
        except Exception:
            # Best effort: hand back what was collected so far. Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            print('Search did not finish -- GScholar blocked you!')
            print(' '.join(['restart at ', str(start + irec)]))
            return irec, results, False
        try:
            bibtex_entry = handle.read()
        finally:
            # Release the connection even when reading fails.
            handle.close()
        bibrec = bp.parseEntry(bibtex_entry)
        try:
            print(bibrec['pid'])
        except Exception:
            # Covers a parser result that cannot be indexed by 'pid'.
            print('something weird happened!!!!')
            return irec, results, True
        # Try to ignore junk entries. Compare by value: identity checks
        # against a string literal are not reliable.
        if 'publication_year' in bibrec and bibrec['publication_year'] != '':
            results.append(bibrec)
            print('accepted')
        else:
            print('rejected')
        sleep(delay)  # Go slowly so we aren't flagged as a bot
    nrec = len(soup('p')) - 2
    return nrec, results, nrec == limit
def bibfile2dictlist(fname, do_postprocess=True, scopus=False,
                     printupdates=False):
    """Read a *.bib file and return its entries as a list of dictionaries.

    The file is scanned line by line. A line starting with ``@`` begins a new
    entry; the text collected up to that point is handed to
    ``BibtexParser.parseEntry``. Chunks for which the parser does not return
    a dictionary are reported on stdout and dropped.

    This should really be rewritten as a proper parser.

    Issues:
    - Chokes on blank lines in the middle of bibtex entries

    Args:
        fname: Name of the *.bib file.
        do_postprocess: If true, the finished list is passed through
            ``postprocess`` and the result of that call is returned.
        scopus: If true, rewrite lines starting with ``author=`` to undo the
            Scopus author formatting.
        printupdates: If true, print the number of entries collected so far
            whenever a new entry starts.

    Returns:
        The list of entry dictionaries (post-processed when requested).
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    biblist = []

    def collect(text):
        # Parse one accumulated chunk and keep it only if a dict came back.
        if not text.strip():
            # Nothing collected yet (e.g. before the first '@').
            return
        bibrec = bp.parseEntry(text)
        if isinstance(bibrec, dict):
            biblist.append(bibrec)
        else:
            print('Not a bibtex entry: ' + text)

    entry = ''
    # The context manager closes the file on every exit path. All lines are
    # processed, including the first one, so an entry that starts on line 1
    # keeps its header.
    with open(fname) as f:
        for line in f:
            if line.startswith('@'):
                collect(entry)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue
            if scopus and line.strip().startswith('author='):
                # Scopus messes up the author format
                for marker in ('a ', 'b ', 'c ', 'd ', 'e '):
                    line = line.replace(marker, ' ')
                line = line.replace(' , ', ' and ')
                line = line.replace('., ', '. and ')
            entry = entry + line
    # End of file: flush the entry that was still being collected.
    collect(entry)
    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
def __init__(self, id='endnote', title="EndNote's text format parser"):
    """
    Initialize the parser by forwarding ``id`` and ``title`` to
    ``BaseParser.__init__``.
    """
    BaseParser.__init__(
        self,
        title=title,
        id=id,
    )
def __init__(self, id='xml_mods', title="XML(MODS) parser"):
    """
    Initialize the parser by forwarding ``id`` and ``title`` to
    ``BaseParser.__init__``.
    """
    BaseParser.__init__(
        self,
        title=title,
        id=id,
    )
def __init__(self, id='ris', title="RIS format parser"):
    """
    Initialize the parser by forwarding ``id`` and ``title`` to
    ``BaseParser.__init__``.
    """
    BaseParser.__init__(
        self,
        title=title,
        id=id,
    )
def extract_all_bibtex(html, start, limit, delay=30):
    """Look up the BibTeX links of a results page and parse what they return.

    Every ``div`` of class ``gs_ri`` in the page is treated as one record.
    For each record the BibTeX link is fetched and the response is parsed
    with ``BibtexParser.parseEntry``. Records with a non-empty
    ``'publication_year'`` are collected.

    Args:
        html: Page source; decoded as ASCII, undecodable bytes are dropped.
        start: Offset added to the record index in the progress output.
        limit: Value the record count of the page is compared with to
            compute the returned flag.
        delay: Seconds to sleep after each fetched record so the requests
            are spaced out. Defaults to 30, the value used previously.

    Returns:
        A tuple ``(count, results, flag)``:

        * ``(irec, results, False)`` if fetching a BibTeX link failed,
        * ``(irec, results, True)`` if a parsed record has no printable
          ``'pid'``,
        * otherwise ``(nrec, results, nrec == limit)``, where ``nrec`` is the
          number of ``<p>`` elements in the page minus two.
    """
    from BeautifulSoup import BeautifulSoup
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    html = html.decode('ascii', 'ignore')
    soup = BeautifulSoup(html)
    results = []
    for irec, record in enumerate(soup('div', attrs={'class': 'gs_ri'})):
        print(start + irec)
        # Skip records that are just citations, as they are often erroneous
        if 'CITATION' in str(record.contents[0]):
            continue
        # If there's no BibTeX link, we're at the end:
        if 'Import' not in str(record.contents[-1]):
            break
        # Bibtex links are tagged gs_fl
        # NOTE(review): this still raises when the record has no 'gs_fl' div
        # or no 'bib?' link - confirm whether such records should be skipped.
        links = record.find('div', {'class': 'gs_fl'})
        biblink = [link for link in links('a') if 'bib?' in str(link)][0]
        # Takes the value of the first attribute of the link.
        url_end = str(biblink.attrs[0][1])
        url = SEARCH_HOST + url_end
        print(url)
        req = Request(url, None, headers)
        try:
            handle = urlopen(req)
        except Exception:
            # Best effort: hand back what was collected so far. Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            print('Search did not finish -- GScholar blocked you!')
            print(' '.join(['restart at ', str(start + irec)]))
            return irec, results, False
        try:
            bibtex_entry = handle.read()
        finally:
            # Release the connection even when reading fails.
            handle.close()
        bibrec = bp.parseEntry(bibtex_entry)
        try:
            print(bibrec['pid'])
        except Exception:
            # Covers a parser result that cannot be indexed by 'pid'.
            print('something weird happened!!!!')
            return irec, results, True
        # Try to ignore junk entries. Compare by value: identity checks
        # against a string literal are not reliable.
        if 'publication_year' in bibrec and bibrec['publication_year'] != '':
            results.append(bibrec)
            print('accepted')
        else:
            print('rejected')
        sleep(delay)  # Go slowly so we aren't flagged as a bot
    nrec = len(soup('p')) - 2
    return nrec, results, nrec == limit
def bibfile2dictlist(fname, do_postprocess=True, scopus=False,
                     printupdates=False):
    """Read a *.bib file and return its entries as a list of dictionaries.

    The file is scanned line by line. A line starting with ``@`` begins a new
    entry; the text collected up to that point is handed to
    ``BibtexParser.parseEntry``. Chunks for which the parser does not return
    a dictionary are reported on stdout and dropped.

    This should really be rewritten as a proper parser.

    Issues:
    - Chokes on blank lines in the middle of bibtex entries

    Args:
        fname: Name of the *.bib file.
        do_postprocess: If true, the finished list is passed through
            ``postprocess`` and the result of that call is returned.
        scopus: If true, rewrite lines starting with ``author=`` to undo the
            Scopus author formatting.
        printupdates: If true, print the number of entries collected so far
            whenever a new entry starts.

    Returns:
        The list of entry dictionaries (post-processed when requested).
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    biblist = []

    def collect(text):
        # Parse one accumulated chunk and keep it only if a dict came back.
        if not text.strip():
            # Nothing collected yet (e.g. before the first '@').
            return
        bibrec = bp.parseEntry(text)
        if isinstance(bibrec, dict):
            biblist.append(bibrec)
        else:
            print('Not a bibtex entry: ' + text)

    entry = ''
    # The context manager closes the file on every exit path. All lines are
    # processed, including the first one, so an entry that starts on line 1
    # keeps its header.
    with open(fname) as f:
        for line in f:
            if line.startswith('@'):
                collect(entry)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue
            if scopus and line.strip().startswith('author='):
                # Scopus messes up the author format
                for marker in ('a ', 'b ', 'c ', 'd ', 'e '):
                    line = line.replace(marker, ' ')
                line = line.replace(' , ', ' and ')
                line = line.replace('., ', '. and ')
            entry = entry + line
    # End of file: flush the entry that was still being collected.
    collect(entry)
    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
def parsefile(filename):
    """Parse a BibTeX file into a list of entry dictionaries.

    Uses the bibliograph.parsing package: the file content is split with
    ``BibtexParser.splitSource`` and each piece is parsed with
    ``BibtexParser.parseEntry``.

    Args:
        filename: File name (string, including path).

    Returns:
        A list of dictionaries, one for each piece the parser turned into a
        dictionary. Pieces with any other parse result are dropped.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    # Read first and parse afterwards, so the handle is closed even when
    # reading or parsing raises.
    with open(filename) as f:
        source = f.read()
    ents = [bp.parseEntry(x) for x in bp.splitSource(source)]
    # Parsing errors give strings, so keep only dicts:
    return [x for x in ents if isinstance(x, dict)]
def parsefile(filename):
    """Parse a BibTeX file into a list of entry dictionaries.

    Uses the bibliograph.parsing package: the file content is split with
    ``BibtexParser.splitSource`` and each piece is parsed with
    ``BibtexParser.parseEntry``.

    Args:
        filename: File name (string, including path).

    Returns:
        A list of dictionaries, one for each piece the parser turned into a
        dictionary. Pieces with any other parse result are dropped.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    # Read first and parse afterwards, so the handle is closed even when
    # reading or parsing raises.
    with open(filename) as f:
        source = f.read()
    ents = [bp.parseEntry(x) for x in bp.splitSource(source)]
    # Parsing errors give strings, so keep only dicts:
    return [x for x in ents if isinstance(x, dict)]
def __init__(self, id='ris', title="RIS format parser"):
    """
    Initialize the parser by forwarding ``id`` and ``title`` to
    ``BaseParser.__init__``.
    """
    BaseParser.__init__(
        self,
        title=title,
        id=id,
    )
def setUp(self):
    """Attach a new BibtexParser instance to ``self.parser``."""
    parser = BibtexParser()
    self.parser = parser
class TestBibtexParsing(unittest.TestCase):
    """BibtexParser tests: format detection and parsing of individual fields.

    Every test reads fixture files named by the ``setup`` module and checks
    what the parser returns for them.
    """

    def setUp(self):
        """Attach a new BibtexParser instance to ``self.parser``."""
        self.parser = BibtexParser()

    @staticmethod
    def _read_source(path):
        """Return the content of the file at *path* and close the handle."""
        handle = open(path, 'r')
        try:
            return handle.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            handle.close()

    def testFormatDetection(self):
        """checkFormat accepts the listed fixtures and rejects MEDLINE_TEST_MED."""
        source_files = (setup.MEDLINE_TEST_BIB,
                        setup.BIBTEX_TEST_BIB,
                        setup.IDCOOKING_TEST_BIB,
                        setup.PDFFOLDER_TEST_BIB,
                        setup.BIBTEX_TEST_BIB_DUP,
                        setup.BIBTEX_TEST_MULTI_AUTHORS,
                        setup.BIBTEX_TEST_INBOOKREFERENCES,
                        setup.BIBTEX_TEST_LASTFIELDKOMMA,
                        setup.BIBTEX_TEST_TYPEFIELD,
                        setup.BIBTEX_TEST_CITE_KEY)

        for source_file in source_files:
            source = self._read_source(source_file)
            self.assertTrue(
                self.parser.checkFormat(source),
                'BibTeX parser failed to detect BibTeX format in file %s'
                % source_file)

        # check negative detection (check properly rejects non-bibtex format files)
        source = self._read_source(setup.MEDLINE_TEST_MED)
        self.assertFalse(
            self.parser.checkFormat(source),
            'BibTeX parser incorrectly detected BibTeX format in file %s'
            % setup.MEDLINE_TEST_MED)

    def testBibtexAuthorParsing(self):
        """Both authors are split into first, middle and last name."""
        source = self._read_source(setup.BIBTEX_TEST_MULTI_AUTHORS)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        heckman = {'middlename': 'J.',
                   'firstname': 'James',
                   'lastname': 'Heckman'}
        carneiro = {'middlename': '',
                    'firstname': 'Pedro',
                    'lastname': 'Carneiro'}

        self.assertEqual(len(result['authors']), 2)
        # Carneiro is expected first, Heckman second.
        for author, expected in zip(result['authors'], (carneiro, heckman)):
            for field in ('middlename', 'firstname', 'lastname'):
                self.assertEqual(author[field], expected[field])

    def testBibtexInbookReferenceParsing(self):
        """The reference fields of the in-book fixture are parsed as expected."""
        source = self._read_source(setup.BIBTEX_TEST_INBOOKREFERENCES)
        ref = {
            'booktitle': 'In einem fiktiven Buch vor unserer Zeit',
            'title': 'Die Tage der Ankunft',
            'chapter': 'Die Tage der Ankunft',
            'publication_url': 'http://www.sunweavers.net/',
        }

        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            # The key is passed as failure message to name the broken field.
            self.assertTrue(key in result and ref[key] == result[key], key)

    def testAnnoteParsing(self):
        """The last entry of BIBTEX_TEST_BIB carries the expected annote."""
        source = self._read_source(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        self.assertEqual(results[-1]['annote'], 'I really like it.')

    def testIdentifierParsing(self):
        """The third entry of BIBTEX_TEST_BIB yields ISBN and DOI identifiers."""
        source = self._read_source(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        result = results[2]
        self.assertEqual(result['identifiers'],
                         [{'label': 'ISBN', 'value': '3874402436'},
                          {'label': 'DOI', 'value': '1-23-345'}])

    def testBibtexTypeFieldParsing(self):
        """The type-field fixture is parsed into the expected reference fields."""
        source = self._read_source(setup.BIBTEX_TEST_TYPEFIELD)
        ref = {
            'publication_type': 'Doktorarbeit',
            'title': 'Mein Herr Doktor',
            'school': 'CAU Kiel',
            'institution': 'Ökologie-Zentrum',
        }

        source = self.parser.checkEncoding(source)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            self.assertTrue(key in result and ref[key] == result[key])

    def testBibtexTypeLastFieldTrailingKomma(self):
        """Both entries of the trailing-comma fixture parse to the same values."""
        source = self._read_source(setup.BIBTEX_TEST_LASTFIELDKOMMA)
        results = self.parser.getEntries(source)

        # the last field in a bibtex entry always had a trailing ","
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['institution'],
                         results[1]['institution'])
        self.assertEqual(results[0]['publication_type'],
                         results[1]['publication_type'])
        self.assertEqual(results[0]['publication_type'], 'Doktorarbeit,,,')
def parseEntry(self, entry):
    """
    Parse *entry* with ``BibtexParser.parseEntry`` and return the result
    after passing it through ``fixupresult``.
    """
    parsed = BibtexParser.parseEntry(self, entry)
    return fixupresult(parsed)
class TestBibtexParsing(unittest.TestCase):
    """BibtexParser tests: format detection and parsing of individual fields.

    Every test reads fixture files named by the ``setup`` module and checks
    what the parser returns for them.
    """

    def setUp(self):
        """Attach a new BibtexParser instance to ``self.parser``."""
        self.parser = BibtexParser()

    @staticmethod
    def _read_source(path):
        """Return the content of the file at *path* and close the handle."""
        handle = open(path, 'r')
        try:
            return handle.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            handle.close()

    def testFormatDetection(self):
        """checkFormat accepts the listed fixtures and rejects MEDLINE_TEST_MED."""
        source_files = (setup.MEDLINE_TEST_BIB,
                        setup.BIBTEX_TEST_BIB,
                        setup.IDCOOKING_TEST_BIB,
                        setup.PDFFOLDER_TEST_BIB,
                        setup.BIBTEX_TEST_BIB_DUP,
                        setup.BIBTEX_TEST_MULTI_AUTHORS,
                        setup.BIBTEX_TEST_INBOOKREFERENCES,
                        setup.BIBTEX_TEST_LASTFIELDKOMMA,
                        setup.BIBTEX_TEST_TYPEFIELD,
                        setup.BIBTEX_TEST_CITE_KEY)

        for source_file in source_files:
            source = self._read_source(source_file)
            self.assertTrue(
                self.parser.checkFormat(source),
                'BibTeX parser failed to detect BibTeX format in file %s'
                % source_file)

        # check negative detection (check properly rejects non-bibtex format files)
        source = self._read_source(setup.MEDLINE_TEST_MED)
        self.assertFalse(
            self.parser.checkFormat(source),
            'BibTeX parser incorrectly detected BibTeX format in file %s'
            % setup.MEDLINE_TEST_MED)

    def testBibtexAuthorParsing(self):
        """Both authors are split into first, middle and last name."""
        source = self._read_source(setup.BIBTEX_TEST_MULTI_AUTHORS)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        heckman = {'middlename': 'J.',
                   'firstname': 'James',
                   'lastname': 'Heckman'}
        carneiro = {'middlename': '',
                    'firstname': 'Pedro',
                    'lastname': 'Carneiro'}

        self.assertEqual(len(result['authors']), 2)
        # Carneiro is expected first, Heckman second.
        for author, expected in zip(result['authors'], (carneiro, heckman)):
            for field in ('middlename', 'firstname', 'lastname'):
                self.assertEqual(author[field], expected[field])

    def testBibtexInbookReferenceParsing(self):
        """The reference fields of the in-book fixture are parsed as expected."""
        source = self._read_source(setup.BIBTEX_TEST_INBOOKREFERENCES)
        ref = {
            'booktitle': 'In einem fiktiven Buch vor unserer Zeit',
            'title': 'Die Tage der Ankunft',
            'chapter': 'Die Tage der Ankunft',
            'publication_url': 'http://www.sunweavers.net/',
        }

        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            # The key is passed as failure message to name the broken field.
            self.assertTrue(key in result and ref[key] == result[key], key)

    def testAnnoteParsing(self):
        """The last entry of BIBTEX_TEST_BIB carries the expected annote."""
        source = self._read_source(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        self.assertEqual(results[-1]['annote'], 'I really like it.')

    def testIdentifierParsing(self):
        """The third entry of BIBTEX_TEST_BIB yields ISBN and DOI identifiers."""
        source = self._read_source(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        result = results[2]
        self.assertEqual(result['identifiers'],
                         [{'label': 'ISBN', 'value': '3874402436'},
                          {'label': 'DOI', 'value': '1-23-345'}])

    def testBibtexTypeFieldParsing(self):
        """The type-field fixture is parsed into the expected reference fields."""
        source = self._read_source(setup.BIBTEX_TEST_TYPEFIELD)
        ref = {
            'publication_type': 'Doktorarbeit',
            'title': 'Mein Herr Doktor',
            'school': 'CAU Kiel',
            'institution': 'Ökologie-Zentrum',
        }

        source = self.parser.checkEncoding(source)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            self.assertTrue(key in result and ref[key] == result[key])

    def testBibtexTypeLastFieldTrailingKomma(self):
        """Both entries of the trailing-comma fixture parse to the same values."""
        source = self._read_source(setup.BIBTEX_TEST_LASTFIELDKOMMA)
        results = self.parser.getEntries(source)

        # the last field in a bibtex entry always had a trailing ","
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['institution'],
                         results[1]['institution'])
        self.assertEqual(results[0]['publication_type'],
                         results[1]['publication_type'])
        self.assertEqual(results[0]['publication_type'], 'Doktorarbeit,,,')