def _processResults(self, data):
    """ Get bibtex data from zbMATH website. """
    bibs = re.findall(r"(?si)bibtex/.*?\d{3,}\.bib", data)
    data = []
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    parser = BibTexParser()
    parser.customization = customizations
    if self.otherID:
        # setup for MRef fetching
        from msn import MRef
        mr = MRef()
    for bib in bibs:
        bibtext = urllib.request.urlopen("https://zbmath.org/" + bib).read().decode("utf-8")
        zbl = bibtexparser.loads(bibtext, parser=parser)
        if self.otherID and mr.fetch(bibtext):
            # found MRef match for zbMATH record
            msn = bibtexparser.loads(mr.refs)
            # use MSN bibtex entry with zbl number added
            # and doi transferred if missing
            msn.entries[0]['zbl'] = zbl.entries[0]['zbl']
            if 'doi' not in msn.entries[0] and 'doi' in zbl.entries[0]:
                msn.entries[0]['doi'] = zbl.entries[0]['doi']
            zbl = msn
        data.append(bibtexparser.dumps(zbl))
    self.refs = "\n".join(data)
def prototype():
    md5 = request.forms.md5
    bib = request.forms.bib
    notes = request.forms.notes
    tags = request.forms.tags.split(',')
    if md5:
        doc = Document.select().where(Document.md5 == md5).get()
        if doc:
            if notes:
                doc.notes = notes
            if bib:
                try:
                    bibtexparser.loads(bib)
                    doc.bib = bib.strip()
                except Exception:
                    session()['msg'] = "Invalid bibtex."
                    return redirect('/annotate/' + md5)
            if tags:
                with db.atomic():
                    Tag.delete().where(Tag.document == doc).execute()
                    for tag in tags:
                        try:
                            Tag.insert(document=doc, value=tag).execute()
                        except Exception:
                            pass
            doc.save()
            session()['msg'] = "Success"
            return redirect('/annotate/' + md5)
        else:
            session()['msg'] = "Invalid request. No document specified."
            return redirect('/annotate/' + md5)
    session()['msg'] = "You missed a field, or something went wrong."
    return redirect('/annotate/' + md5)
def get_all_sources(): config = get_credentials() start = 0 limit = 100 chunks = [get_sources(config['user'], config['key'], start, limit)] while len(bibtexparser.loads(chunks[-1]).entries) == limit: start += limit # print('Chunk {0} full, getting another one with items {1}-{2}'.format(len(chunks), start, start+limit)) chunks.append(get_sources(config['user'], config['key'], start, limit)) print('Successfully fetched {0} items.'.format((len(chunks)-1) * limit + len(bibtexparser.loads(chunks[-1]).entries))) return '\n'.join(chunks)
def get_publications(path):
    """
    Get a list of all publications, sorted by year and month (newest first).

    Parameters
    ----------
    path : str
        Path to a BibTeX file.
    """
    with open(path) as bibtex_file:
        bibtex_str = bibtex_file.read()
    bib_database = bibtexparser.loads(bibtex_str)
    months = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
              'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    return sorted(bib_database.entries,
                  key=lambda n: (n['year'], months[n['month']]),
                  reverse=True)
def parseBibtexFile(fileString):
    """Opens a BibTeX file and returns a database of dictionaries for the reference entries."""
    with open(fileString) as bibtex_file:
        bibtex_str = bibtex_file.read()
    bib_database = bibtexparser.loads(bibtex_str)
    return bib_database
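# Possible usage of parseBibtexFile above; "references.bib" is only an
# illustrative placeholder path. The returned BibDatabase keeps the parsed
# records as a list of dicts in its `entries` attribute.
db = parseBibtexFile("references.bib")
for entry in db.entries:
    # each entry carries its citation key in 'ID' and its type in 'ENTRYTYPE'
    print(entry["ID"], entry.get("title", ""))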
def run(self): sort_type = self.options.get('sort', 'date') # Load the publications template if 'template' in self.options: template_path = self.options['template'] template_dir, template_name = os.path.split(template_path) env = Environment(loader=FileSystemLoader(template_dir)) template = env.get_template(template_name) else: # Use template from the Pelican theme template = pelican_generator.get_template('publications') parser = BibTexParser(common_strings=True) parser.customization = customize if self.arguments: bibtex_path = self.arguments[0].strip() with open(bibtex_path, 'r') as bibtex_file: bib = bibtexparser.load(bibtex_file, parser=parser) else: bib = bibtexparser.loads('\n'.join(self.content), parser=parser) entries_to_select = self.options.get('entries', []) if entries_to_select: d = bib.entries_dict entries = [d[e] for e in entries_to_select] else: entries = bib.entries entries = sort_entries(entries, sort_type) rendered_template = template.render(publications=entries) return [nodes.raw('', rendered_template, format='html')]
def main(argv=None):
    if argv is None:
        argv = sys.argv
        # etc., replacing sys.argv with argv in the getopt() call.
    filename = ""
    parser = BibTexParser()
    parser.customization = customizations
    if len(argv) > 1:
        filename = argv[1]
    else:
        filename = "example.bib"
    with open(filename) as bibtex_file:
        bibtex_str = bibtex_file.read()
    bib_database = bibtexparser.loads(bibtex_str, parser=parser)
    # print_books(bib_database.entries)
    print_summary(bib_database.entries)
    print_journals(bib_database.entries)
    print_conferences(bib_database.entries)
    return 0
def read_bib_file(filename, homogenize=False): """ Read bibtex file. Args: filename (str): path of the bibtex file. homogenize (bool): whether to homogenize the entries upon reading. Returns: A BibDatabase object. """ # Read input bibtex file bibtex_str = " " if os.path.exists(filename): with open(filename, 'r', encoding='utf-8') as bibfile: bibtex_str = bibfile.read() # Choose parser parser = None if homogenize: parser = BibTexParser() parser.customization = nomenclature.homogenize_latex_encoding # Create database from string return bibtexparser.loads(bibtex_str, parser=parser)
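# Sketch of how read_bib_file above might be called; "library.bib" is a
# placeholder path. With homogenize=True the homogenize_latex_encoding
# customization is applied to every entry while parsing.
db = read_bib_file("library.bib", homogenize=True)
print("{} entries loaded".format(len(db.entries)))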
def main(bibtexfilepath, out_fh, output_type): with open(bibtexfilepath) as bibtex_file: bibtex_str = bibtex_file.read() bib_database = bibtexparser.loads(bibtex_str) #print(bib_database.entries) (topics_to_titles_with_id, id_to_entry) = build_topics_to_titles_with_id(bib_database) ignore_topics = ['', 'misc'] out_fh.write(codecs.open('header.html',encoding="utf-8").read()) # a) create hyperlinks to topics create_hyperlinks_to_topics(topics_to_titles_with_id, ignore_topics, out_fh, output_type=HTML) # b) create list of titles per topic create_list_of_titles_per_topic(topics_to_titles_with_id, ignore_topics, out_fh, output_type=HTML) # c) create bibtex list at the end, that get pointed to by 2 #for pubid in sorted(id_to_entry): # print '''<a name="%s"></a>''' % (pubid) #parser = BibTexParser() #parser.customization = customizations #bib_database = bibtexparser.loads(bibtex_str, parser=parser) #print(bib_database.entries) out_fh.write("<h1>BIBLIOGRAPHY</h1>") out_fh.write("<pre>\n") create_bibtex_bibliography(id_to_entry,out_fh=out_fh,output_type=HTML) out_fh.write("</pre>\n") out_fh.write("</ul>")
def import_bibtex(request):
    review_id = request.POST['review-id']
    source_id = request.POST['source-id']
    review = Review.objects.get(pk=review_id)
    source = Source.objects.get(pk=source_id)
    bibtex_file = request.FILES['bibtex']
    list_bibtex_file = fix_bibtex_file(bibtex_file.readlines())
    str_bibtex_file = '\r\n'.join(list_bibtex_file)
    ext = os.path.splitext(bibtex_file.name)[1]
    valid_extensions = ['.bib', '.bibtex']
    if ext in valid_extensions or bibtex_file.content_type == 'application/x-bibtex':
        parser = BibTexParser()
        parser.customization = convert_to_unicode
        bib_database = bibtexparser.loads(str_bibtex_file, parser=parser)
        articles = bibtex_to_article_object(bib_database, review, source)
        _import_articles(request, source, articles)
    else:
        messages.error(request, u'Invalid file type. Only .bib or .bibtex files are accepted.')
    return redirect(r('import_studies', args=(review.author.username, review.name)))
def get_latest_version(arxiv_id): """ Find the latest version of a given arXiv eprint. :param arxiv_id: The (canonical) arXiv ID to query. :returns: The latest version on eprint as a string, or ``None``. >>> get_latest_version('1401.2910') '1401.2910v1' >>> get_latest_version('1401.2910v1') '1401.2910v1' >>> get_latest_version('1506.06690v1') '1506.06690v2' >>> get_latest_version('1506.06690') '1506.06690v2' """ # Get updated bibtex # Trick: strip the version from the arXiv id, to query updated BibTeX for # the preprint and not the specific version arxiv_preprint_id = strip_version(arxiv_id) updated_bibtex = bibtexparser.loads(get_bibtex(arxiv_preprint_id)) updated_bibtex = next(iter(updated_bibtex.entries_dict.values())) try: return updated_bibtex["eprint"] except KeyError: return None
def process_bibtex(self, item):
    '''
    Process one bibtex link and update the shared self.args.bibtex_database.

    :param tuple item: (identifier, bibtex_url, bibtex_pdf)
    '''
    identifier, bibtex_url, bibtex_pdf = item
    response = requests.get(bibtex_url)
    if response.status_code == 200:
        # load bibtex as dict
        bibtex_string = response.text[1:]
        bibtex = bibtexparser.loads(bibtex_string)
        bibtex.entries_dict[identifier]['Keyword'] = self.args.date
        # add pdf link to bibtex if it is missing
        if 'url' not in bibtex.entries_dict[identifier]:
            bibtex.entries_dict[identifier]['url'] = bibtex_pdf
        self.total_found += 1
        if identifier not in self.args.bibtex_database.entries_dict:
            self.total_added += 1
            self.args.bibtex_database.entries.append(bibtex.entries[0])
def _bibtexQuery(self, query): """ Turn query into bibtex dictionary. """ import bibtexparser from bibtexparser.bparser import BibTexParser parser = BibTexParser() parser.customization = homogeneize_latex_encoding bib = bibtexparser.loads(query, parser=parser) if bib.entries: # only the first record record = bib.entries[0] # clean up entries if "author" in record: # just last name record["author"] = re.sub(r',.*?(and\s*|$)', ' ', record['author']) if "title" in record: record["title"] = self._citationQuery(record["title"])[0][1] if "journal" in record: record["journal"] = self._citationQuery(record["journal"])[0][1] if "year" in record: record["date"] = record["year"] # only use a few fields # TODO add numbers return [(k, v) for k, v in record.items() if k in {"author", "title", "journal", "mrnumber", "date", "arxiv", "zbl"}] else: return []
def normalize_keyword_case():
    for d in review.documents:
        bib = bibtexparser.loads(d.bib)
        if 'keyword' in bib.entries[0]:
            bib.entries[0]['keyword'] = bib.entries[0]['keyword'].lower()
            d.bib = bibtexparser.dumps(bib)
            d.save()
def parse_bibtex(bib):
    '''Parses the BibTeX returned by the DOI resolver

    Args:
        bib (str): a BibTeX record

    Returns:
        Dict containing reference data
    '''
    for entity, repl in ENTITIES.items():
        bib = bib.replace(entity, repl)
    # Parse BibTeX using the handy dandy bibtexparser module
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
    parser = BibTexParser()
    parser.customization = convert_to_unicode
    parsed = bibtexparser.loads(bib, parser=parser).entries[0]
    # Miscellaneous clean up
    braces = re.compile(u'\{([A-Z_ \-]+|[\u0020-\uD7FF])\}', re.U)
    for key, val in parsed.items():
        val = braces.sub(r'\1', val)
        if '{' in val:
            raise Exception('Unhandled LaTeX: {}'.format(val))
        parsed[key] = val
    parsed['pages'] = parsed.get('pages', '').replace('--', '-')
    if parsed.get('publisher', '').endswith(')'):
        parsed['publisher'] = parsed['publisher'].rsplit('(', 1)[0].rstrip()
    return parsed
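# Small illustrative call of parse_bibtex above, assuming ENTITIES and the
# module-level imports used by the function are available; the record below
# is made up for the example.
sample = """@article{Smith2010,
  title = {An Example},
  author = {Smith, Jane},
  pages = {10--20},
  year = {2010}
}"""
ref = parse_bibtex(sample)
print(ref['pages'])  # '10-20' after the '--' clean-up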
def bib2jekyllcol(inputFile, outputDir):
    """Writes the BibTeX file to the output directory as Jekyll collection folder(s)."""
    # read and parse bib file
    with open(inputFile) as bibtex_file:
        bibtex_str = bibtex_file.read()
    parser = BibTexParser()
    parser.customization = convert_to_unicode
    bib_database = bibtexparser.loads(bibtex_str, parser=parser)
    # list for transforming a month abbreviation to a number (by index)
    month_list = ["jan", "feb", "mar", "apr", "may", "june",
                  "july", "aug", "sept", "oct", "nov", "dec"]
    # field names:
    type_list = ["title", "author", "journal", "volume", "number", "year",
                 "month", "doi", "pages", "publisher", "booktitle", "note"]
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)
    else:
        print("Deleting existing collection file...\n")
        for file in os.listdir(outputDir):
            file_path = os.path.join(outputDir, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
def __init__(self, path, ads_cache=None): super(BibTexDB, self).__init__() self._filepath = path with open(path) as bibtex_file: bibtex_str = bibtex_file.read() self._db = bibtexparser.loads(bibtex_str) self._ads_cache = ads_cache
def normalize_keyword_delimitter():
    for d in review.documents:
        bib = bibtexparser.loads(d.bib)
        if 'keyword' in bib.entries[0]:
            bib.entries[0]['keyword'] = bib.entries[0]['keyword'].replace(';', ',')
            d.bib = bibtexparser.dumps(bib)
            d.save()
def normalize_keyword_visualization():
    for d in review.documents:
        bib = bibtexparser.loads(d.bib)
        if 'keyword' in bib.entries[0]:
            bib.entries[0]['keyword'] = bib.entries[0]['keyword'].replace('visualis', 'visualiz')
            d.bib = bibtexparser.dumps(bib)
            d.save()
def parse_urlfile(url_file): """ take a file of the form category: ads url and get the bibtex from the URL and return a list of Paper objects with the category stored as the subject """ papers = [] with open(url_file) as f: parser = BibTexParser() parser.customization = customizations for line in f: if line.startswith("#") or line.strip() == "": continue subject, url = line.split(": ") # for the ADS bibtex URL, lop off the paper_id paper_id = url.strip().split("/")[-1] bibtex_url = "http://adsabs.harvard.edu/cgi-bin/nph-bib_query?bibcode={}&data_type=BIBTEX".format(paper_id) # get the bibtex in html -- this is a little tricky, since # urlopen gives us a byte object that we need to decode # into unicode before we can play with it. print(bibtex_url) with urllib.request.urlopen(bibtex_url) as response: bibtex_html = response.read() raw_bibtex_html = bibtex_html.splitlines() bibtex_string = "" for line in raw_bibtex_html: bibtex_string += "{}\n".format(line.decode("utf8")) # strip off any header and just leave the bibtex found_start = False bibtex = "" for line in bibtex_string: if line.startswith("@"): found_start = True if found_start: bibtex += line # parse the bibtex string bib_database = bibtexparser.loads(bibtex, parser=parser) for e in bib_database.entries: p = extract_paper_info(e) if not e is None: p.subject = subject papers.append(p) papers.sort(reverse=True) return papers
def test_multiple_string_parse(self): bibtex_str = '@string{name1 = "value1"}\n\n@string{name2 = "value2"}\n\n' bib_database = bibtexparser.loads(bibtex_str) expected = OrderedDict() expected['name1'] = 'value1' expected['name2'] = 'value2' self.assertEqual(bib_database.strings, expected)
def normalize(input_file, output_file): """ read a *.bib file, change every 'title' and 'booktitle' field to only use uppercase for the first letter and write the changes to the output file. Parameters ---------- input_file : file the *.bib file to normalized output_file : file the *.bib output file """ bibtex_str = input_file.read() bib_database = bibtexparser.loads(bibtex_str) for entry in bib_database.entries: for field in ('title', 'booktitle'): if field in entry: field_str = entry[field] # don't touch titles that are (partially) enclosed in brackets if (not FIXED_TITLE_RE.match(field_str) and not BRACKETS_RE.search(field_str)): if ':' in field_str: # split no more than once title, subtitle = field_str.split(':', 1) entry[field] = u'{}: {}'.format(title, subtitle.lower()) else: new_field_str = field_str.capitalize() entry[field] = new_field_str new_bibstr = bibtexparser.dumps(bib_database) output_file.write(new_bibstr.encode('utf-8'))
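# Possible invocation of the normalize function above, assuming the module-level
# regexes it uses (FIXED_TITLE_RE, BRACKETS_RE) are defined as in the original
# file; the file names are placeholders. The output file is opened in binary
# mode because normalize writes UTF-8 encoded bytes.
with open("refs.bib") as src, open("refs_normalized.bib", "wb") as dst:
    normalize(src, dst)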
def merge_data(paper_list, scholar_data_list):
    # Merge yml data with google scholar data
    assert len(paper_list) == len(scholar_data_list)
    papers = []
    for yaml_paper_info, scholar_data in zip(paper_list, scholar_data_list):
        paper = dict()
        # see __getitem__ of ScholarArticle
        attrs = dict([(key, scholar_data.attrs[key][0])
                      for key in scholar_data.attrs.keys()])
        paper.update(attrs)
        if scholar_data.citation_data:
            paper['citation_data'] = scholar_data.citation_data
            print('citation data %s' % scholar_data.citation_data)
            bibdata = bibtexparser.loads(scholar_data.citation_data)
            bibinfo = bibdata.entries[0]
            paper.update(bibdata.entries[0])
        else:
            print('Warning: %s does not have citation_data' % yaml_paper_info['title'])
        paper.update(yaml_paper_info)  # This should have the highest priority and overwrite others
        # if len(papers) == 0:
        #     # Only do it once
        #     print('Scholar data field %s' % attrs.keys())
        #     print('Bib data fields %s' % bibinfo.keys())
        if paper.get('author'):
            paper['first_author'] = paper['author'].split('and')[0].strip()
        papers.append(paper)
    print('Available data fields %s' % papers[0].keys())
    return papers
def save_citation(citation_record): cite_anchor = citation_record.find('a', {'class': 'gs_nph', 'href': '#', "role": "button"}) if not cite_anchor or not cite_anchor['onclick']: logging.warn("No Cite anchor for citation: %s" % citation_record) return citation_id = cite_anchor['onclick'].split(',')[1][1:-1] logging.info("Getting formated cite from citation id: " + citation_id) params = {"q": "info:%s:scholar.google.com/" % citation_id, "output": "cite"} soup = create_soup_by_url("https://scholar.google.com/scholar", params) bib_anchor = soup.find('a', {"class": "gs_citi"}) if not bib_anchor: logging.debug("BibTex page soup is: %s" % soup.getText()) logging.warn("No BibTex citation provided for citation: %s" % citation_id) return soup = create_soup_by_url(bib_anchor['href']) global citation_num citation_num += 1 # Adding a tag to the bib entry about google scholar citation ID citation_entry = bibtexparser.loads(soup.getText()).entries[0] citationID = citation_entry['ID'] # e.g., melville2004review citation_entry["gscholar_id"] = citation_id db_entry=[] db_entry.append(citation_entry) db = BibDatabase() db.entries = db_entry g_bib_entry = bibtexparser.dumps(db) bib_entry = "%% [%d]\n%s" % (citation_num, g_bib_entry) logging.info(bib_entry.strip()) with open(opts.citation_name, "a+") as f: f.write(bib_entry.encode('utf-8')) if opts.should_download: pdf_div = citation_record.find('div', {"class": "gs_ggs gs_fl"}) if pdf_div: download_pdf(pdf_div.a['href'], citationID)
def parseEntry(s): # normalize unicode by reparsing parser = BibTexParser() parser.customization = convert_to_unicode db1 = bibtexparser.loads(s, parser=parser) es = db1.entries return es[0]
def getsource(material): print('Grabbing MP BIB info for ' + material + '...') key = '0cVziFePTUfsawW8' url = 'https://www.materialsproject.org/materials/' + material + '/bibtex?API_KEY=' + key t=0 rbib = [] while t<4: try: r = requests.get(url) if r.status_code == 200: rbib = bibtexparser.loads(r.text).entries break else: print('error' + str(t)) t = t+1 except requests.ConnectionError: print('error' + str(t)) t = t+1 source = [] for entry in rbib: if entry['ID'] != 'MaterialsProject' and entry['ID'] != 'Bergerhoff1983' and entry['ID'] != 'Karlsruhe': try: source.append(entry) except KeyError: pass return source
def get_bibtex_dict(bib_fpath): r""" Args: bib_fpath (str): Returns: dict: bibtex_dict CommandLine: python -m utool.util_latex --test-get_bibtex_dict pip install bibtexparser Example: >>> # DISABLE_DOCTEST >>> from utool.util_latex import * # NOQA >>> import utool as ut >>> bib_fpath = ut.truepath('~/latex/crall-candidacy-2015/My_Library_clean.bib') >>> bibtex_dict = get_bibtex_dict(bib_fpath) >>> result = ('bibtex_dict = %s' % (str(bibtex_dict),)) >>> print(result) """ import bibtexparser import utool as ut bibtex_str = ut.readfrom(bib_fpath, verbose=False) bib_database = bibtexparser.loads(bibtex_str) bibtex_dict = bib_database.get_entry_dict() return bibtex_dict
def parse_volume_bib(self, response): """ Parses the volume bib page. :param category: in which the volume bib should be stored. :return: nothing but the bib page is stored. """ category = response.meta["category"] txt = response.body.decode("utf-8") volume = response.meta["volume_url"].split("/")[-1][: -4] # extract volume name from url if txt.startswith(u'\ufeff'): txt = txt[1:] if len(txt) == 0: logger.warning("empty volume bib on %s", response.url) request = scrapy.Request(response.meta["event_url"], callback=self.parse_event_page_precisely, dont_filter=True) request.meta["volume_url"] = response.meta["volume_url"] request.meta["category"] = category yield request else: bib_tex = bibtexparser.loads(txt) entries = bib_tex.entries # print file_name, len(entries) for bib in entries: bib["event"] = category bib["volume"] = volume self.insert(bib) self.num_volume_crawled += 1 self.num_paper_crawled += len(entries) self.db_mark_volume(response.url)
def updateArXiv(entry):
    """Look for new versions of arXiv entry `entry`

    Returns False if there are no new versions or it is not an arXiv entry,
    returns the new bibtex otherwise.
    """
    bibtex = getBibtex(entry)
    # Check arXiv
    if 'archiveprefix' not in bibtex or 'arXiv' not in bibtex['archiveprefix']:
        return False
    arxiv_id = bibtex['eprint']
    arxiv_id_no_v = re.sub(r'v\d+\Z', '', arxiv_id)
    # collect the eprint ids of all existing arXiv entries
    ids = {arxiv_id}
    for other in getEntries():
        other_bibtex = getBibtex(other)
        if ('archiveprefix' not in other_bibtex or
                'arXiv' not in other_bibtex['archiveprefix']):
            continue
        ids.add(other_bibtex['eprint'])
    last_bibtex = bibtexparser.loads(fetcher.arXiv2Bib(arxiv_id_no_v))
    last_bibtex = last_bibtex.entries_dict
    last_bibtex = last_bibtex[list(last_bibtex.keys())[0]]
    if last_bibtex['eprint'] not in ids:
        return last_bibtex
    return False
def load_database(self):
    '''Load the bibtex file if needed, otherwise start from an empty database.'''
    if any([not os.path.exists(self.args.output_path), self.args.overwrite == 'y']):
        self.args.bibtex_database = bibtexparser.loads('')
    else:
        with open(self.args.output_path) as bibtex_file:
            self.args.bibtex_database = bibtexparser.load(bibtex_file)
def fill(self, publication: Publication) -> Publication: """Populate the Publication with information from its profile :param publication: Scholar or Citation publication container object that is not filled :type publication: PublicationCitation or PublicationScholar """ if publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: url = _CITATIONPUB.format(publication['author_pub_id']) soup = self.nav._get_soup(url) publication['bib']['title'] = soup.find('div', id='gsc_oci_title').text if publication['bib']['title'][-1] == '\u2026': merged_snippet = soup.find('div', class_='gsc_oci_merged_snippet') if merged_snippet: title_div = merged_snippet.find('div') if title_div: publication['bib']['title'] = title_div.text if soup.find('a', class_='gsc_oci_title_link'): publication['pub_url'] = soup.find( 'a', class_='gsc_oci_title_link')['href'] for item in soup.find_all('div', class_='gs_scl'): key = item.find(class_='gsc_oci_field').text.strip().lower() val = item.find(class_='gsc_oci_value') if key == 'authors' or key == 'inventors': publication['bib']['author'] = ' and '.join( [i.strip() for i in val.text.split(',')]) elif key == 'journal': publication['bib']['journal'] = val.text elif key == 'conference': publication['bib']['conference'] = val.text elif key == 'volume': publication['bib']['volume'] = val.text elif key == 'issue': publication['bib']['number'] = val.text elif key == 'pages': publication['bib']['pages'] = val.text elif key == 'publisher': publication['bib']['publisher'] = val.text elif key == 'publication date': patterns = [ 'YYYY/M', 'YYYY/MM/DD', 'YYYY', 'YYYY/M/DD', 'YYYY/M/D', 'YYYY/MM/D' ] publication['bib']['pub_year'] = arrow.get( val.text, patterns).year elif key == 'description': # try to find all the gsh_csp if they exist abstract = val.find_all(class_='gsh_csp') result = "" # append all gsh_csp together as there can be multiple in certain scenarios for item in abstract: if item.text[0:8].lower() == 'abstract': result += item.text[9:].strip() else: result += item.text if len(abstract) == 0: # if no gsh_csp were found abstract = val.find(class_='gsh_small') if abstract: if abstract.text[0:8].lower() == 'abstract': result = abstract.text[9:].strip() else: result = abstract.text else: result = ' '.join( [description_part for description_part in val]) publication['bib']['abstract'] = result elif key == 'total citations': publication['cites_id'] = re.findall( _SCHOLARPUBRE, val.a['href'])[0].split(',') publication['citedby_url'] = _CITEDBYLINK.format(','.join( publication['cites_id'])) elif key == 'scholar articles': for entry in val.find_all('a'): if entry.text.lower() == 'related articles': publication['url_related_articles'] = entry.get( 'href')[26:] # number of citation per year years = [int(y.text) for y in soup.find_all(class_='gsc_oci_g_t')] cites = [int(c.text) for c in soup.find_all(class_='gsc_oci_g_al')] cites_year = [ int(c.get('href')[-4:]) for c in soup.find_all(class_='gsc_oci_g_a') ] nonzero_cites_per_year = dict(zip(cites_year, cites)) res_dict = {} for year in years: res_dict[year] = (nonzero_cites_per_year[year] if year in nonzero_cites_per_year else 0) publication['cites_per_year'] = res_dict if soup.find('div', class_='gsc_vcd_title_ggi'): publication['eprint_url'] = soup.find( 'div', class_='gsc_vcd_title_ggi').a['href'] publication['filled'] = True elif publication[ 'source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: bibtex_url = self._get_bibtex(publication['url_scholarbib']) bibtex = self.nav._get_page(bibtex_url) parser = 
bibtexparser.bparser.BibTexParser(common_strings=True) parsed_bib = remap_bib( bibtexparser.loads(bibtex, parser).entries[-1], _BIB_MAPPING, _BIB_DATATYPES) publication['bib'].update(parsed_bib) publication['filled'] = True return publication
def fetch_content(self): doi_url = self.doi_base_url + self.identifier page = self.get_page(doi_url, headers=self.headers) bibtex = bibtexparser.loads(page) return bibtex.entries[0]
# get list of immediate child subdirs SO:973473 : subdirs = sorted(next(os.walk(dir_data))[1]) #ok # 02_SMC Conference 2015:044/74: orig 'G. Presti and D.A. Mauro and G. Haus' -> _DATA_/02_SMC\ Conference\ 2015/smc_2015_044.pdf numcommas = 0 # homogenize_fields: Sanitize BibTeX field names, for example change `url` to `link` etc. tbparser = BibTexParser() tbparser.homogenize_fields = False # no dice tbparser.alt_dict[ 'url'] = 'url' # this finally prevents change 'url' to 'link' for subdir in subdirs: bibfile = os.path.join(dir_data, subdir, "%s.bib" % (subdir)) print((bibfile, os.path.isfile(bibfile))) with open(bibfile) as bibtex_file: bibtex_str = bibtex_file.read() bib_database = bibtexparser.loads(bibtex_str, tbparser) #pprint.pprint(bib_database.entries) # already here,replaces 'url' with 'link' confbiblen = len(bib_database.entries) for icpbe, confpaperbibentry in enumerate(bib_database.entries): authstr = confpaperbibentry['author'] if ("," in authstr): numcommas += 1 report = "%d/%d: Comma present: '%s'" % (icpbe + 1, confbiblen, authstr) authstrauthors = authstr.split(" and ") for ia, author in enumerate(authstrauthors): if ("," in author): authorparts = author.split(", ") # the first part [0] is last name, needs to become last # get and remove the first part, then append it as last lastname = authorparts.pop(0)
from utils import generate_md_file import bibtexparser import os file_name = str(os.path.join(os.getcwd(),'bibtex.bib')) with open(file_name) as bibtex_file: bibtex_str = bibtex_file.read() bib_db = bibtexparser.loads(bibtex_str, parser=bibtexparser.bparser.BibTexParser(ignore_nonstandard_types=False)) ################################### Create Readme #################################### def plot_titles(titles): return '\n' + "## " + titles[0] + '\n' list_types = [["Classics", "Classic"], ["Empirical Study", "Empirical"], ["Surveys", "Survey", "survey"], ["Influentials", "Influential"], ["New Settings or Metrics", "Setting", "Metric"], ["Regularization Methods", "Regularization"], ["Distillation Methods", "Distillation"], ["Rehearsal Methods", "Rehearsal"], ["Generative Replay Methods", "Generative Replay"], ["Dynamic Architectures or Routing Methods", "Architectures", "Dynamic Architecture"], ["Hybrid Methods", "Hybrid"], ["Continual Few-Shot Learning", "Continual-Meta Learning"], ["Meta-Continual Learning"], ["Lifelong Reinforcement Learning", "Reinforcement"],
@STRING{ aug = "aug"} @STRING{ sep = "sep"} @STRING{ oct = "oct"} @STRING{ nov = "nov"} @STRING{ dec = "dec"} """ print 'Parsing files in ' + folder + '/' for file in os.listdir(folder): if file.endswith(".bib"): print(os.path.join(folder, file)) with open(os.path.join(folder, file)) as bibtex_file: content = Months + bibtex_file.read() parser = BibTexParser() parser.common_strings = True bib_database = bibtexparser.loads(content, parser) for entry in bib_database.entries: #print(entry['ID']) entry['keywords'] = entry.get('keywords', '') if (entry['keywords'] != ''): entry['keywords'] = 'cleBib/' + entry[ 'ID'] + ', article/' + os.path.splitext( file)[0] + ', ' + entry['keywords'] else: entry['keywords'] = 'cleBib/' + entry[ 'ID'] + ', article/' + os.path.splitext(file)[0] with open(os.path.join(folder + '-clean', file), 'w') as bibtex_export: bibtex_export_str = bibtexparser.dumps(bib_database, writer) bibtex_export.write(bibtex_export_str.encode('utf8'))
def test_multiple_string_parse_count(self): bibtex_str = '@string{name1 = "value1"}\n\n@string{name2 = "value2"}\n\n' bib_database = bibtexparser.loads(bibtex_str) self.assertEqual(len(bib_database.strings), 2)
def fill(self): """Populate the Publication with information from its profile""" if self.source == 'citations': url = _CITATIONPUB.format(self.id_citations) soup = self.nav._get_soup(url) self.bib['title'] = soup.find('div', id='gsc_vcd_title').text if soup.find('a', class_='gsc_vcd_title_link'): self.bib['url'] = soup.find( 'a', class_='gsc_vcd_title_link')['href'] for item in soup.find_all('div', class_='gs_scl'): key = item.find(class_='gsc_vcd_field').text.strip().lower() val = item.find(class_='gsc_vcd_value') if key == 'authors': self.bib['author'] = ' and '.join( [i.strip() for i in val.text.split(',')]) elif key == 'journal': self.bib['journal'] = val.text elif key == 'volume': self.bib['volume'] = val.text elif key == 'issue': self.bib['number'] = val.text elif key == 'pages': self.bib['pages'] = val.text elif key == 'publisher': self.bib['publisher'] = val.text elif key == 'Publication date': patterns = [ 'YYYY/M', 'YYYY/MM/DD', 'YYYY', 'YYYY/M/DD', 'YYYY/M/D', 'YYYY/MM/D' ] self.bib['year'] = arrow.get(val.text, patterns).year elif key == 'description': # try to find all the gsh_csp if they exist abstract = val.find_all(class_='gsh_csp') result = "" # append all gsh_csp together as there can be multiple in certain scenarios for item in abstract: if item.text[0:8].lower() == 'abstract': result += item.text[9:].strip() else: result += item.text if len(abstract) == 0: # if no gsh_csp were found abstract = val.find(class_='gsh_small') if abstract: if abstract.text[0:8].lower() == 'abstract': result = abstract.text[9:].strip() else: result = abstract.text else: result = ' '.join( [description_part for description_part in val]) self.bib['abstract'] = result elif key == 'total citations': self.bib['cites_id'] = re.findall(_SCHOLARPUBRE, val.a['href'])[0] self.citations_link = _CITEDBYLINK.format( self.bib['cites_id']) # number of citation per year years = [int(y.text) for y in soup.find_all(class_='gsc_vcd_g_t')] cites = [int(c.text) for c in soup.find_all(class_='gsc_vcd_g_al')] self.cites_per_year = dict(zip(years, cites)) if soup.find('div', class_='gsc_vcd_title_ggi'): self.bib['eprint'] = soup.find( 'div', class_='gsc_vcd_title_ggi').a['href'] self._filled = True elif self.source == 'scholar': bibtex_url = self._get_bibtex(self.url_scholarbib) bibtex = self.nav._get_page(bibtex_url) parser = bibtexparser.bparser.BibTexParser(common_strings=True) self.bib.update(bibtexparser.loads(bibtex, parser).entries[-1]) self._filled = True return self
def main():
    local_dir = os.getcwd()  # path of the tex source files
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--tex', help='the path of tex file')
    parser.add_argument('-o', '--output',
                        help='the path of bib file you are using for latex. By default the current path')
    args = parser.parse_args()
    # if no tex file is given, look for tex files in the current directory
    tex_files = args.tex.replace(' ', '').split(',') if args.tex else \
        [os.path.join(local_dir, f) for f in get_tex_file(local_dir)]
    bib_keys = []
    bib_name = None
    # TODO: cannot handle multiple bib_name values; this normally does not
    # happen, since only main.tex contains that command
    for f in tex_files:
        key, temp_name = get_bibinfo(f)  # get the bib keys and the bib file
        bib_keys.extend(key)
        if temp_name:
            bib_name = temp_name
            bib_dir = os.path.split(f)  # split the tex file into directory and file name
    tex_dir = bib_dir if args.tex else local_dir
    bib_name = os.path.join(tex_dir, bib_name)  # place the bib file next to the tex file
    # use the command-line argument if given, otherwise the name specified in
    # the tex file, in the same directory
    output_bib = args.output if args.output else bib_name
    # read the data from the Zotero API
    try:
        r = requests.get(ZOTERO_API)
    except requests.exceptions.ConnectionError:
        print('Zotero is not running; failed to fetch the database')
        sys.exit(1)
    if r.status_code == 200:
        print('Successfully read data from Zotero')
    else:
        raise Exception('Failed to read data from Zotero, status code: {}'.format(r.status_code))
        sys.exit(1)
    r.encoding = 'utf-8'
    bib_str = modify_bibs(r.text)
    # with open('./bib_str.txt', 'w', encoding='utf8') as out_bib:
    #     out_bib.write(bib_str)
    # build the BibTexParser
    bibParser = BibTexParser(common_strings=False)
    bibParser.ignore_nonstandard_types = True
    bibParser.homogenise_fields = True
    bibdata = bp.loads(bib_str, bibParser)
    # for i in range(100, 120):
    #     print(bibdata.entries[i])
    #     print(type(bibdata.entries[i]), '\n')
    # format the bib database
    # this is inefficient; better to look the bib ids up in the big database
    # directly, append if present, otherwise report an error
    bibdata_out = bp.bibdatabase.BibDatabase()
    for d in bibdata.entries:
        if d['ID'] in bib_keys:
            bibdata_out.entries.append(d)
            entity_check = check_entity(d)
            entity_check_consequence = '----> title: ' + re.sub(r'[{}]', '', d['title']) + \
                ' missing fields: ' + str(entity_check) if entity_check else ''
            print('Successfully imported ----> ' + d['ID'], entity_check_consequence)
            bib_keys.remove(d['ID'])
    # TODO
    # check whether the keys that failed to import are in other referenced bib files
    bibkey_not_found = '\n'.join(bib_keys)
    print('The following failed to import ({} in total):\n'.format(len(bib_keys)), bibkey_not_found)
    print('------------end---------------')
    # print(bibdata_out)
    with open(output_bib, 'w', encoding='utf8') as bib_write:
        bp.dump(bibdata_out, bib_write)
def test_single_string_parse(self): bibtex_str = '@string{name1 = "value1"}\n\n' bib_database = bibtexparser.loads(bibtex_str) expected = {'name1': 'value1'} self.assertEqual(bib_database.strings, expected)
def __init__(self, j): """ Constructor. @param [in] j JSON representation of the case study. """ ## The user community name. self.__full_user_community_name = j['full_name'] ## The label. self.__label = j['label'] ## The sector. self.__sector = j['sector'] ## The raw overview text. self.__overview_raw = j['overview'] ## The raw "The problem" text. self.__the_problem_raw = j['the_problem'] ## The raw "The solution" text. self.__the_solution_raw = j['the_solution'] ## The "What they said" raw quote text. self.__what_they_said = None # ## Who said it? self.__who_said_it = None # if 'what_they_said' in j.keys(): self.__what_they_said = j['what_they_said']['quote'] self.__who_said_it = j['what_they_said']['contact'] ## A dictionary of the supporting sites. self.__sites = {} # for site in j['supporting_sites']: self.__sites[site.keys()[0]] = site.values()[0] ## A dictionary of services. self.__services = {} # for service in j['services']: self.__services[service.keys()[0]] = service.values()[0] ## A dictionary of Virtual Organisations (VOs). self.__vos = {} # if 'vos' in j.keys(): for vo in j['vos']: my_vo = VirtualOrganisation(vo) self.__vos[my_vo.get_name()] = my_vo ## The acknowledgements raw text. self.__acknowledgements = None # if 'acknowledgements' in j.keys(): self.__acknowledgements = j['acknowledgements'] ## Dictionary of the hyperlinks. self.__links = {} # for link in j['links']: self.__links[link.keys()[0]] = link.values()[0] ## A dictionary of figures. self.__figures = {} # for fig in j['figures']: self.__figures[fig['label']] = Figure(fig) # Get the BibTeX items from the BibTeX file. with open("common/bib/GridPP.bib", 'r') as bibtex_file: bibtex_str = bibtex_file.read() ## The BibTeX database. bib_database = bibtexparser.loads(bibtex_str) lg.info(" *") lg.info(" * Number of entries in the BibTeX file: %d" % (len(bib_database.entries))) lg.info(" *") ## A dictionary of the papers. papers = {} # Get the papers (and check whether the PDF is there). for entry in bib_database.entries: if entry['ENTRYTYPE'] == 'article': paper = Paper(entry) papers[paper.get_id()] = paper ## A dictionary of publications used in the case study. self.__papers = {} # if 'references' in j.keys(): for p in j['references']: # Get the paper. citecode = p.keys()[0] if citecode in papers.keys(): self.__papers[citecode] = papers[citecode]
def prase_wid(): a = bibtexparser.loads(str(textedit.toPlainText())) if len(a.entries) == 0: QtWidgets.QMessageBox.critical(self, "Wrong Prasing", "Wrong Bibtex reference.") return False d = a.entries[0] if not 'file' in d: QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "Bibtex do not contain the file.") return False if not ('year' in d or 'date' in d): QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "Bibtex do not contain the year or the date.") return False if not 'author' in d: QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "Bibtex do not contain the author.") return False if not 'title' in d: QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "Bibtex do not contain the title.") return False if 'year' in d: y = d['year'] else: tmp = re.findall('[0-9][0-9][0-9][0-9]', d['date']) if len(tmp) == 0: QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "Bibtex do not understand the year format of " "the date") else: y = tmp[0] authors = d["author"].split(" and ") authors = [a.split(',') for a in authors] authors = [[aa.strip() for aa in a] for a in authors] short = authors[0][0] + y authors_str = [] for a in authors: new_a = a[0] if len(a) >= 2: for aa in a[1:]: new_a += ' ' + ' '.join([ firstname[0].upper() + '.' for firstname in aa.split(" ") ]) authors_str.append(new_a) if len(authors_str) >= 5: authors_str = authors_str[:4] + ['...'] + [authors_str[-1]] authors_str = ", ".join(authors_str) print("authors_str", authors_str) title = d['title'] title = title.replace('{', '') title = title.replace('}', '') files = d['file'].split(';') motif = ':application/pdf' for i, f in enumerate(files): if f.endswith(motif): files[i] = f[:-len(motif)] files = filter(lambda f: f.endswith('.pdf') or f.endswith('.PDF'), files) files = list(files) if len(files) == 0: QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "No pdf file detected in bibtex file.") return False elif len(files) == 1: file = files[0] if onWindows: file = file.replace(r'C\:', 'C:') # on windows file = file.replace(r'\\', '\\') # on windows else: item = QtWidgets.QInputDialog.getItem(textedit, 'Choose pdf file', 'File: ', files, 0, False) if not item[1]: return False file = str(item[0]) m = file.find(':') if m < -1: QtWidgets.QMessageBox.critical( self, "Wrong Prasing", "No pdf file detected in bibtex file.") return False file = file[m + 1:] notes = "" if 'annote' in d: notes = d['annote'] wid.close() self.short_edit.setText(short) self.authors_edit.setText(authors_str) self.title_edit.setText(title) self.file_edit.setText(file) self.notes_edit.setText(notes)
def fill(self): """Populate the Publication with information from its profile""" if self.source == 'citations': url = _CITATIONPUB.format(self.id_citations) soup = self.nav._get_soup(url) self.bib['title'] = soup.find('div', id='gsc_vcd_title').text if soup.find('a', class_='gsc_vcd_title_link'): self.bib['url'] = soup.find( 'a', class_='gsc_vcd_title_link')['href'] for item in soup.find_all('div', class_='gs_scl'): key = item.find(class_='gsc_vcd_field').text.strip().lower() val = item.find(class_='gsc_vcd_value') if key == 'authors': self.bib['author'] = ' and '.join( [i.strip() for i in val.text.split(',')]) elif key == 'conference': self.bib['conference'] = val.text elif key == 'journal': self.bib['journal'] = val.text elif key == 'book': self.bib['book'] = val.text elif key == 'volume': self.bib['volume'] = val.text elif key == 'issue': self.bib['number'] = val.text elif key == 'pages': self.bib['pages'] = val.text elif key == 'publisher': self.bib['publisher'] = val.text elif key == 'Publication date': patterns = ['YYYY/M', 'YYYY/MM/DD', 'YYYY', 'YYYY/M/DD', 'YYYY/M/D', 'YYYY/MM/D'] self.bib['year'] = arrow.get(val.text, patterns).year elif key == 'description': if val.text[0:8].lower() == 'abstract': val = val.text[9:].strip() abstract = val.find(class_='gsh_csp') if abstract is None: abstract = val.find(class_='gsh_small') self.bib['abstract'] = abstract.text ## elif key == 'total citations': ## self.bib['cites'] = re.findall( ## _SCHOLARPUBRE, val.a['href'])[0] elif key == 'total citations': m=re.search('by (.*?)<',str(val)) self.bib['cites'] = m.group(1) # number of citation per year years = [int(y.text) for y in soup.find_all(class_='gsc_vcd_g_t')] cites = [int(c.text) for c in soup.find_all(class_='gsc_vcd_g_al')] self.cites_per_year = dict(zip(years, cites)) if soup.find('div', class_='gsc_vcd_title_ggi'): self.bib['eprint'] = soup.find( 'div', class_='gsc_vcd_title_ggi').a['href'] self._filled = True elif self.source == 'scholar': bibtex = self.nav._get_page(self.url_scholarbib) self.bib.update(bibtexparser.loads(bibtex).entries[0]) self._filled = True return self
def bib_pars(item1): bib_database = bibtexparser.loads(item1) return (bib_database.entries[0])
import pyperclip # dependency, need to pip install # import getPDF_url as gpdf import codecs # get citation from clipboard # we assume it is in valid bibtex # we assume has title, authors, year, and publication; lazy for now, should add edge cases later r = Tk() r.withdraw() clip_text = r.clipboard_get() # parse the bibtex # need to define a parser with custom settings bc zotero has nonstandard bibtex items like "jan" for month # per https://github.com/sciunto-org/python-bibtexparser/issues/192 parser = BibTexParser(common_strings=True) bib = bibtexparser.loads(clip_text, parser) entries = bib.entries # print(entry) print(f"Processing {len(entries)} entries") for entry in entries: # parse title print(entry) title = entry['title'].replace("{", "").replace("}", "").replace("\n", " ") # build author string authors = [] for author in entry['author'].split(" and "): author = author.strip().replace("\n", " ").split(",") authors.append("[[%s %s]]" %(author[-1].strip(), author[0].strip()))
def save_titles(bibtex_file, username, password): # read bibtex file with open(bibtex_file) as f: bibtex_str = f.read() bib_database = bibtexparser.loads(bibtex_str) entries = bib_database.entries # connect to Arxiv Sanity driver = webdriver.PhantomJS() driver.get('http://www.arxiv-sanity.com') # login username_elem = driver.find_element_by_name("username") password_elem = driver.find_element_by_name("password") username_elem.send_keys(username) password_elem.send_keys(password) driver.find_element_by_css_selector(".btn-fancy").click() # search for the title of each BibTeX entry for e, entry in enumerate(entries): time.sleep(5) title = entry['title'] print('-' * 100) print('%.0f%% | BibTeX title: %s' % (100. * (e + 1) / len(entries), title)) qfield = driver.find_element_by_id('qfield') qfield.clear() qfield.send_keys(title) qfield.send_keys(Keys.ENTER) papers = driver.find_elements_by_class_name('apaper') imgs = driver.find_elements_by_class_name('save-icon') assert len(imgs) == len(papers) if len(imgs) == 0: print('No search results') continue site_titles = [] for paper in papers: site_title = paper.find_element_by_class_name( 'paperdesc').find_element_by_tag_name('a').get_attribute( 'text') site_titles.append(site_title) distances = [ editdistance.eval(title, site_title) for site_title in site_titles ] if min(distances) > 10: print('No match found within threshold, closest was: %s' % site_titles[i]) continue i = distances.index(min(distances)) img = imgs[i] src = img.get_attribute('src') if src.endswith('saved.png'): print('Paper already saved') continue img.click() print('Saved paper with title: %s' % site_titles[i])
import os
import bibtexparser
import django
from datetime import datetime

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "document_management.settings")
django.setup()

from document.models import Document, Author, Field, Publisher, Project

with open('/home/each/Documents/document_management/static/test/ml.bib', 'r') as bibfile:
    bibstring = bibfile.read()
bd = bibtexparser.loads(bibstring)
docs = bd.entries

pro = Project(name="deep learning study",
              description="this project is used for deep learning study")
pro.save()

for doc in docs:
    if "title" in doc:
        title = doc['title'].replace("{", "").replace("}", "")
        try:
            document = Document.objects.get(title=title)
            print("success")
        except Exception:
            print("except")
            # exit()
def dashboard_publications(request): all_journal = JournalImage.objects.all() print(all_journal) all_publications = Publication.objects.all() context = {'all_journal': all_journal, 'all_publications': all_publications} if request.method == 'POST': if 'journal' in request.POST: submitted_form = AddEditJournalForm(request.POST, request.FILES) if submitted_form.is_valid(): submitted_form.save() return redirect(reverse('dashboard_publications')) else: messages.error(request, submitted_form.errors) context['journal_form'] = submitted_form return render(request, 'website/dashboard_publications.html', context) if 'manual' in request.POST: submitted_form = AddEditPublicationForm(request.POST, request.FILES) if submitted_form.is_valid(): submitted_form.save() return redirect(reverse('dashboard_publications')) else: messages.error(request, submitted_form.errors) context['form'] = submitted_form return render(request, 'website/dashboard_publications.html', context) elif 'bibtex' in request.POST: bibtex_entered = request.POST.get('bibtex') try: bib_parsed = bibtexparser.loads(bibtex_entered) bib_info = bib_parsed.entries[0] if 'title' in bib_info: title = bib_info['title'] else: title = None if 'author' in bib_info: authors = bib_info['author'] elif 'authors' in bib_info: authors = bib_info['aithors'] else: authors = None if 'url' in bib_info: url = bib_info['url'] elif 'link' in bib_info: url = bib_info['link'] elif 'doi' in bib_info: url = "http://dx.doi.org/" + bib_info['doi'] else: url = None if title and authors and url: publication_obj = Publication(title=title, author=authors, url=url) if 'ENTRYTYPE' in bib_info: publication_obj.entry_type = bib_info['ENTRYTYPE'] if 'doi' in bib_info: publication_obj.doi = bib_info['doi'] if 'journal' in bib_info: publication_obj.published_in = bib_info['journal'] if 'booktitle' in bib_info: publication_obj.published_in = bib_info['booktitle'] if 'publisher' in bib_info: publication_obj.publisher = bib_info['publisher'] if 'year' in bib_info: publication_obj.year_of_publication = bib_info['year'] if 'month' in bib_info: publication_obj.month_of_publication = bib_info['month'] publication_obj.bibtex = bibtex_entered publication_obj.save() return redirect(reverse('dashboard_publications')) else: return render(request, 'website/dashboard_publications.html', context) except Exception as e: messages.error(request, str(e)) return render(request, 'website/dashboard_publications.html', context) else: raise Http404("Not a valid method for adding publications.") journal_form = AddEditJournalForm() form = AddEditPublicationForm() context['form'] = form context['journal_form'] = journal_form return render(request, 'website/dashboard_publications.html', context)
def test_multiple_preamble_parse(self): bibtex_str = '@preamble{" a "}\n\n@preamble{"b"}\n\n' bib_database = bibtexparser.loads(bibtex_str) expected = [' a ', 'b'] self.assertEqual(bib_database.preambles, expected)
def test_single_preamble_parse_count(self): bibtex_str = '@preamble{" a "}\n\n' bib_database = bibtexparser.loads(bibtex_str) self.assertEqual(len(bib_database.preambles), 1)
for it in (db.entries): title = it['title'] id = it['ID'] title = title.strip('{}') succ = False while not succ: try: res = gscholar.query(title) time.sleep(10) succ = True except Exception as e: print(e) # sleep(10) break if not succ: break it_gs = bibtexparser.loads(res[0]) it_gs = it_gs.entries[0] # from IPython import embed; embed() it_gs['ID'] = id ress.append(it_gs) print(it_gs) # break from bibtexparser.bwriter import BibTexWriter from bibtexparser.bibdatabase import BibDatabase db = BibDatabase() db.entries = ress writer = BibTexWriter()
def _parse_bibtex(bib): return bibtexparser.loads(bib).entries
def bibstr2dict(bibstr): bibdict = bibtexparser.loads(bibstr) return bibdict.entries
def readDBs(self, Directory): with open(Directory + os.sep + self.mPapersDBFileNames) as bibtex_file: bibtex_str = bibtex_file.read() self.mPapersDB = bibtexparser.loads(bibtex_str)
# -*- coding: utf-8 -*- # Copyright (c) Ezcad Development Team. All Rights Reserved. import matplotlib.pyplot as plt import bibtexparser fn = "mcmechan.bib" with open(fn) as bibtex_file: bibtex_str = bibtex_file.read() bdb = bibtexparser.loads(bibtex_str) year_pubs = {} for entry in bdb.entries: year = entry['year'] if year not in year_pubs: year_pubs[year] = 1 else: year_pubs[year] += 1 fig, ax = plt.subplots(1, 1) fig.set_size_inches(10, 6) ax.grid(zorder=0) ax.bar(year_pubs.keys(), year_pubs.values(), color='g', zorder=3) ax.set_ylabel('Number of Papers') plt.xticks(rotation='vertical') # plt.show() fn = "year_pubs.png" plt.savefig(fn) plt.close()
import bibtexparser # Import PURE fname = 'all.bib' parser = bibtexparser.bparser.BibTexParser(common_strings=True) with open(fname, encoding="utf8") as bibtex_file: bibtex_str = bibtex_file.read() bib_pure = bibtexparser.loads(bibtex_str, parser=parser) print(fname + ' contains ', len(bib_pure.entries), ' entries') # Import WEBSITE fname = 'website_export.bib' parser2 = bibtexparser.bparser.BibTexParser(common_strings=True) with open(fname, encoding="utf8") as bibtex_file2: bibtex_str2 = bibtex_file2.read() bib_website = bibtexparser.loads(bibtex_str2, parser=parser2) print(fname + ' contains ', len(bib_website.entries), ' entries') # Results mavlab_missing = bibtexparser.bibdatabase.BibDatabase() mavlab_merged = bibtexparser.bibdatabase.BibDatabase() verbose = False def cleanup_title(txt): txt = txt.replace('{', '').replace('}', '').replace('¿',
bib = re.sub("\\\\enquote{(.+?)}", r"»\1«", bib)  # replace LaTeX quotation
bib = re.sub("\\\\emph{(.+?)}", r"\1", bib)  # remove LaTeX emphasis markup
bib = re.sub("„", "»", bib)  # replace opening quotation marks
bib = re.sub("“", "«", bib)  # replace closing quotation marks
bib = re.sub("--", "–", bib)  # replace double hyphens with an en dash
bib = re.sub("~", " ", bib)  # replace non-breaking spaces
bib = re.sub("\\\\&", "&", bib)  # replace escaped ampersands
bib = re.sub("%.+?@", "@", bib, flags=re.MULTILINE | re.DOTALL)  # remove %-comment blocks from dial

# parse the BibTeX and export it to JS:
import bibtexparser
from bibtexparser.bparser import BibTexParser

parser = BibTexParser()
parser.ignore_nonstandard_types = False
bibdb = bibtexparser.loads(bib, parser)

with open("dial.js", "w") as fh:
    fh.write('var rows = [\n')
    for entry in bibdb.entries:
        fh.write('  {\n')
        # if keywords are present, this is a first publication:
        if 'keywords' in entry:
            entry['Erstpublikation'] = 'ja'
        else:
            entry['Erstpublikation'] = 'nein'
        # record the genre
        if 'keywords' not in entry:
            # get the genre from the work element with the same wikidata ID:
            for entry2 in bibdb.entries:
                if 'keywords' in entry2 and entry2['wikidata'] == entry[
with open(args.infile) as input_file: bib_database = bibtexparser.load(input_file) keys_done = [] dblp_entries = [] non_dblp_entries = [] num_skipped = 0 for entry in bib_database.get_entry_list(): id = entry['ID'] if not (id in keys_done): if id[0:4] == "DBLP": print("downloading " + id) bib_str = download_dblp(id[5:]) temp_db = bibtexparser.loads(bib_str) dblp_entries.append(temp_db.entries[0]) else: non_dblp_entries.append(entry) keys_done.append(id) else: num_skipped += 1 print(id + " skipped") print("#DBLP entries = " + str(len(dblp_entries))) print("#non DBLP entries = " + str(len(non_dblp_entries))) print("#entries skipped = " + str(num_skipped)) print("writing new bib...", end="")
def tex_to_word(tex_fn, repo_dir, bib_fn=None): r"""Convert a LaTeX formatted file to docx format Parses ``tex_fn`` and converts text and some markup tags and environments into Word constructs. Creates a file with the same basename as ``tex_fn`` but with ``.docx`` extension, e.g. ``main.tex -> main.docx``. If ``bib_fn`` is not provided, all ``\cite`` tags are replaced by parentheses, leaving keys as is. If ``bib_fn`` is provided, all ``\cite`` tags are replaced by <Author> <Year> formatted references, and if a ``\bibliography`` tag is present, a Reference section is formatted at the end of the document. :param tex_fn: path to LaTeX formatted file :param bib_fn: optional path to BibTeX formatted file containing citation information :return: nothing """ print( '\n-------------------------------------------------------------------' ) print(tex_fn) with open(tex_fn) as f: tex = f.read() parsed = lexer.input(tex) bibdb = None if bib_fn: with open(bib_fn) as f: bibtex_str = f.read() if len(bibtex_str) > 0: bibdb = bibtexparser.loads(bibtex_str) bibdb = {_['ID']: _ for _ in bibdb.entries} def is_heading(args): return 'section' in args def get_heading_level(args): return args.count('sub') + 1 heading_level = 1 in_doc = False # in_section is a stack of nested \begin{XXX} commands # useful for nested elements like itemize, enumerate, etc in_section = [] prev_token = None doc = docx.Document() text_started = False refs = set() words = [] while True: tok = lexer.token() if not tok: break # handle commands, which control the structure of the document # and special elements like tables and figures (not yet implemented) if tok.type == 'COMMAND': if tok.command == 'title': doc.add_heading(tok.args, 0) elif tok.command == 'begin': # don't insert anything until we have seen a begin{document} if tok.args == 'document': in_doc = True # other \begin's to be supported: # table # tabular # figure in_section.append(tok.args) elif tok.command == 'end': in_section.pop() elif tok.command == 'item': style = None level = len(in_section) if level == 1: print( 'saw \\item outside of command, I dont know what to ' 'do so I will very cowardly ignore token:', tok.command, tok.opts, tok.args, tok.post_opts, tok.rest) elif in_section[-1] == 'itemize': style = 'List Bullet' elif in_section[-1] == 'enumerate': style = 'List Number' else: print( 'saw \\item inside a command I dont recognize, ' 'I dont know what to do so I will very cowardly ' 'ignore token:', tok.command, tok.opts, tok.args, tok.post_opts, tok.rest) if style is not None: if level > 2: style += ' {}'.format(level - 1) doc.add_paragraph(tok.rest.strip(), style=style) # \section, \subsection, \subsubsection, etc elif is_heading(tok.command): if words: add_paragraph(doc, words) words = [] heading_level = get_heading_level(tok.command) doc.add_heading(tok.args, heading_level) # insert citation text elif tok.command == 'cite': ref_strs = tok.args if bibdb: refids = tok.args.split(',') refids = [_ for _ in refids if _] refs.update(set(refids)) ref_strs = [] for refid in refids: entry = bibdb[refid] author = entry.get('author', entry.get('title', '')).split(',')[0] year = entry.get('year', '') ref_strs.append(' '.join([author, year])) ref_strs = ','.join(ref_strs) citation = Text(text=''.join(['(', ref_strs, ')']), type='cite', style=None, props=None) words.append(citation) elif tok.command == 'clearpage': doc.add_paragraph().add_run().add_break(WD_BREAK.PAGE) elif tok.command == 'includegraphics': pic_path = os.path.join(repo_dir, tok.args) img = Image.open(pic_path) 
# calculate the image width in inches assuming 72 dpi # maximum 6 inches dpi = 72 img_width = min(img.size[0] / 72, 6) doc.add_picture(pic_path, width=Inches(img_width)) elif tok.command == 'newline': words.append(Word(text='\n')) else: print('unrecognized command:', tok.command, tok.opts, tok.args, tok.post_opts) if tok.type == 'EQUATION': print('found an equation, dont know how to do those yet', tok.value) # regular text word if tok.type == 'WORD': # replace escaped chars with literal chars tok.value = tok.value.replace(r'\%', '%') tok.value = tok.value.replace(r'\$', '$') text_started = True text = Word(text=tok.value) words.append(text) if tok.type == 'NEWLINE': # if we hit two newlines in a row, create a new paragraph if prev_token and prev_token.type == 'NEWLINE' and text_started: add_paragraph(doc, words) words = [] if tok.type == 'MANUALNEWLINE': words.append(Word(text='\n')) if tok.type == 'TEXTFMT': if tok.command == 'textbf': bold = Text(text=tok.args, type='textbf', style=None, props={'bold': True}) words.append(bold) if tok.command == 'textit': italic = Text(text=tok.args, type='textit', style=None, props={'italic': True}) words.append(italic) prev_token = tok # do refs if there are refs if refs: doc.add_heading('References', heading_level) refs = sorted(list(refs)) for i, refid in enumerate(refs): ref = bibdb[refid] author = '' if 'author' in ref: author = ref['author'].split(' and ') author = author[0] + u' et al. ' title = (tou(ref.get('title', '')).replace('{', '').replace( '}', '').replace('\n', ' ')) ref_words = [ Word(text='{}. '.format(i + 1)), Word(text=author), Word(text=title + u'. ') ] def fmt(key, pref='', suff=''): if key in ref: return Word(tou(pref + ref[key] + suff)) ref_words.extend([ fmt('journal', suff=u'. '), fmt('booktitle', suff=u'. '), fmt('volume', suff=u', '), fmt('pages', suff=u' '), fmt('year', pref=u'(', suff=u')'), fmt('howpublished', pref=u'(', suff=u')'), fmt('note'), Word(text=u'.') ]) ref_words = [_ for _ in ref_words if _] add_paragraph(doc, ref_words) """ [{'journal': 'Nice Journal', 'comments': 'A comment', 'pages': '12--23', 'month': 'jan', 'abstract': 'This is an abstract. This line should be long enough to test\nmultilines...', 'title': 'An amazing title', 'year': '2013', 'volume': '12', 'ID': 'Cesar2013', 'author': 'Jean César', 'keyword': 'keyword1, keyword2', 'ENTRYTYPE': 'article'}] """ # write out the doc basedir = os.path.dirname(tex_fn) basename, ext = os.path.splitext(os.path.basename(tex_fn)) doc_fn = os.path.join(basedir, '{}.docx'.format(basename)) doc.save(doc_fn)
#!/usr/bin/env python import sys import bibtexparser from collections import Counter import matplotlib.pyplot as plt filename = sys.argv[1] text = open(filename).read() bib = bibtexparser.loads(text).entries years = [int(e['year']) for e in bib] counts = Counter(years) keys = list(counts.keys()) for i in range(min(keys), max(keys) + 1): counts[i] += 0 print('{},{}'.format(i, counts[i])) plt.hist(years, bins=1 + max(keys) - min(keys)) plt.title("Publications by Year") plt.savefig('years.png')
title = ' '.join(new_words) w1 = work.works(query_title=title, #query_author=inauthor, select=['title','DOI'], cursor="*", limit=1000, sort='relevance', order = "desc" ) w2 = [] for z in w1: w2 = w2 + [item for item in z['message']['items']] return w2, new_words # Retrieving the list of BibTex-formatted publications bibtext = pd.read_csv('bibtext.csv', encoding='ISO-8859-1') for k,row in bibtext.iterrows(): bib = bibtexparser.loads(row[0], parser) outdf = pd.DataFrame() # Searching the DOIs of each publication in the input bibtext file and returning # them as a CSV file for bibitem in bib.entries: intitle = bibitem['title'] inauthor = bibitem['author'].split()[0] inauthor = inauthor.replace(',','') w, words = doisearcher(intitle, inauthor) for item in w: titleword = item['title'][0].split() titleword = [x.lower() for x in titleword] if all(x.lower() in titleword for x in words): newrow = pd.Series([bibitem['ID'], intitle,item['title'][0],item['DOI']], index=['ID','orig_title','doi_title','doi'])