def _check_paper(paper):
    """Check that the paper holds every attribute required by its entry type.

    The requirements are read from the "entrytype_attributes" configuration
    entry, indexed by ``paper["entrytype"]``. A tuple requirement lists
    alternatives, of which at least one must be present in the paper.

    Every unmet requirement is reported. If any is unmet, the paper is
    flagged with ``paper["invalid"] = True`` and, unless "autofix" is
    configured, the user is asked to press return.
    """

    # TODO: Do this during parsing, so we can give error messages with line number and text

    print("Found paper: %s" % pstr(paper))

    entry_type = paper["entrytype"]
    paper_key = paper["key"]
    requirements = config.get("entrytype_attributes")[entry_type]

    problems_found = False
    for requirement in requirements:
        if isinstance(requirement, tuple):
            # A tuple is satisfied as soon as one of its alternatives is used
            if any(name in paper for name in requirement):
                continue
            print(' Missing required attribute(s) "%s" for paper "%s"' % ('"/"'.join(requirement), paper_key))
        elif requirement in paper:
            continue
        else:
            print(' Missing required attribute "%s" for paper "%s"' % (requirement, paper_key))
        problems_found = True

    if not problems_found:
        return

    paper["invalid"] = True
    print(
        " Skipping paper. Correct the above error(s) and import the paper again."
    )
    if not config.get("autofix"):
        input(" Press return to continue.")
def print_summary(papers, num_found=0, num_missing=0):
    """Print a summarized overview of the given papers.

    When either ``num_found`` or ``num_missing`` is non-zero, the database
    size and the PDF statistics are printed first. After that, one line per
    configured category gives the number of papers in that category,
    followed by a "Total" line. Counts are aligned on the longest category
    heading.
    """
    for line in ("", "Summary of papers", "-----------------", ""):
        print(line)

    if num_found != 0 or num_missing != 0:
        print("Database has %d paper(s)." % len(papers))
        print("PDF files found for %d paper(s), %d missing." % (num_found, num_missing))
        print("")

    headings = config.get("category_headings")
    categories = config.get("categories")

    # Width of the longest heading, used to line up the counts
    widest = max(len(headings[category]) for category in categories)

    for category in categories:
        count = sum(1 for paper in papers if paper["category"] == category)
        label = headings[category]
        padding = " " * (widest - len(label))
        print("%s: %s%d" % (label, padding, count))

    total_padding = " " * (widest - len("Total"))
    print("%s: %s%d" % ("Total", total_padding, len(papers)))
def _validate_paper_categories(paper):
    """Validate that the paper holds all attributes required by its category.

    The required attributes are read from the "category_attributes"
    configuration entry, indexed by ``paper["category"]``. A tuple entry
    lists alternatives, of which at least one must be present.

    At the first missing attribute the paper is flagged with
    ``paper["invalid"] = True`` and the check stops. The missing attribute
    is then reported and, unless "autofix" is configured, the user is asked
    to press return.

    Raises:
        RuntimeError: if the paper has no "category" attribute.
    """

    # Check that category is specified
    if "category" not in paper:
        raise RuntimeError("Unable to validate paper, unknown category.")

    category = paper["category"]
    category_attributes = config.get("category_attributes")

    # Stays None when nothing is missing, so that the report below cannot
    # reference an unbound name when the paper is already flagged invalid
    # for some other reason.
    missing = None
    for attribute in category_attributes[category]:
        if isinstance(attribute, tuple):
            present = any(a in paper for a in attribute)
        else:
            present = attribute in paper
        if not present:
            paper["invalid"] = True
            missing = str(attribute)
            break

    if missing is not None and not is_valid(paper):
        print(' Skipping paper (missing attribute "%s")' % missing)
        if not config.get("autofix"):
            input(" Press return to continue.")
def html_format_articles(paper):
    """Return the HTML string for a journal article.

    The entry consists, in order, of the title, the authors, the journal,
    the volume and pages (each only when present), the year, and links for
    "doi" and "arxiv" (each only when present), combined with
    ``_html_join``.
    """
    parts = [
        _html_format_title(paper),
        _html_get_authors_string(paper["author"]),
        '<span class="%s_item_journal">%s</span>' % (config.get("html_class_prefix"), _format_venue(paper["journal"], paper["journal"], paper)),
    ]

    # Optional bibliographic details
    if "volume" in paper:
        parts.append('<span class="%s_item_volum">vol. %s</span>' % (config.get("html_class_prefix"), paper["volume"]))
    if "pages" in paper:
        parts.append('<span class="%s_item_pages">pp. %s</span>' % (config.get("html_class_prefix"), _html_format_pages(paper["pages"])))

    parts.append('<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"]))

    # Optional external links; the link text is a fixed label, not the identifier
    if "doi" in paper:
        parts.append('[<a href="http://dx.doi.org/%s">DOI</a>]' % paper["doi"])
    if "arxiv" in paper:
        parts.append('[<a href="http://arxiv.org/abs/%s">arXiv</a>]' % paper["arxiv"])

    return _html_join(parts)
def _validate_paper_title(paper):
    """Fix the capitalization of the paper title in place.

    The title is processed twice: first split on spaces, then on hyphens.
    In each pass every non-empty piece is replaced by

    * its lower-case form, if that form is in the "lowercase" configuration;
    * the value stored for its lower-case form in the "uppercase"
      configuration, if there is one;
    * otherwise the piece with its first character upper-cased.

    Empty pieces are dropped, so repeated separators collapse into one. The
    result is written back to ``paper["title"]``.
    """
    title = paper["title"]

    for separator in (" ", "-"):
        pieces = title.split(separator)
        lowercase = config.get("lowercase")
        uppercase = config.get("uppercase")

        fixed = []
        for piece in pieces:
            if not piece:
                continue
            folded = piece.lower()
            if folded in lowercase:
                fixed.append(folded)
            elif folded in uppercase:
                fixed.append(uppercase[folded])
            else:
                fixed.append(piece[0].upper() + piece[1:])

        title = separator.join(fixed)

    paper["title"] = title
def _validate_paper_typos(paper):
    """Check all string values of the paper for known typos and fix them.

    The "typos" configuration entry maps "common" (applied to every
    attribute) and, optionally, individual attribute names to tables of
    ``typo -> replacement``. Attribute-specific entries override the common
    ones.

    For every attribute value (or every element of a tuple value):

    * a typo with a replacement is substituted and the change is reported;
    * a typo whose replacement is ``None`` cannot be fixed: the paper is
      flagged with ``paper["invalid"] = True`` and the function returns
      immediately.

    Values that are not strings (for example the boolean "invalid" flag set
    by the validators) are left untouched, because a substring test on them
    would raise ``TypeError``.
    """
    typos = config.get("typos")

    # Only existing keys are reassigned inside this loop, so the mapping does
    # not change size while it is being iterated; the one place a key may be
    # added (the "invalid" flag) returns straight away.
    for attribute in paper:

        # Typos to check: the common ones, overridden by attribute-specific ones
        attribute_typos = typos["common"].copy()
        if attribute in typos:
            attribute_typos.update(typos[attribute])

        # Handle single values and tuples uniformly
        original = paper[attribute]
        is_tuple = isinstance(original, tuple)
        candidates = original if is_tuple else (original, )

        new_values = []
        for value in candidates:

            # Nothing to check in a non-string value
            if not isinstance(value, str):
                new_values.append(value)
                continue

            for typo, replacement in attribute_typos.items():
                if typo not in value:
                    continue

                print(" Incorrectly formatted %s string: %s" % (attribute, str(value)))

                if replacement is None:
                    # Found no replacement, skip paper
                    paper["invalid"] = True
                    if config.get("autofix"):
                        print(" Skipping paper")
                    else:
                        input(
                            " Skipping paper, press return to continue.")
                    return

                # Found replacement
                value = value.replace(typo, replacement)
                print(' Replacing typo "%s" with "%s".' % (typo, replacement))
                if not config.get("autofix"):
                    # NOTE(review): this only prints the prompt and does not
                    # wait for input; kept as in the original - confirm intent.
                    print(" Press return to continue.")

            new_values.append(value)

        # Assign corrected value, keeping the original container shape
        paper[attribute] = tuple(new_values) if is_tuple else new_values[0]
def html_format_proceedings(paper):
    """Return the HTML string for a proceedings entry.

    The entry consists, in order, of the title, the authors, the book title
    (prefixed with "in") and the year, combined with ``_html_join``.
    """
    title = _html_format_title(paper)
    authors = _html_get_authors_string(paper["author"])
    booktitle = 'in <span class="%s_item_booktitle">%s</span>' % (config.get("html_class_prefix"), paper["booktitle"])
    year = '<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"])
    return _html_join([title, authors, booktitle, year])
def html_format_edited(paper):
    """Return the HTML string for an edited book.

    The entry consists, in order, of the title, the authors, the publisher
    and the year, combined with ``_html_join``.
    """
    title = _html_format_title(paper)
    authors = _html_get_authors_string(paper["author"])
    publisher = '<span class="%s_item_publisher">%s</span>' % (config.get("html_class_prefix"), paper["publisher"])
    year = '<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"])
    return _html_join([title, authors, publisher, year])
def html_format_publicoutreach(paper):
    """Return the HTML string for a public outreach entry.

    The entry consists, in order, of the title, the authors, the meeting
    and the year, combined with ``_html_join``.
    """
    title = _html_format_title(paper)
    authors = _html_get_authors_string(paper["author"])
    meeting = '<span class="%s_item_meeting">%s</span>' % (config.get("html_class_prefix"), paper["meeting"])
    year = '<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"])
    return _html_join([title, authors, meeting, year])
def html_format_theses(paper):
    """Return the HTML string for a thesis.

    The entry consists, in order, of the title, the authors, the thesis
    type string (looked up in the "thesistype_strings" configuration by
    ``paper["thesistype"]``), the school and the year, combined with
    ``_html_join``.
    """
    title = _html_format_title(paper)
    authors = _html_get_authors_string(paper["author"])
    thesis_type = config.get("thesistype_strings")[paper["thesistype"]]
    school = paper["school"]
    year = '<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"])
    return _html_join([title, authors, thesis_type, school, year])
def _add_author(author_name):
    """Add an author name to the set of allowed author names.

    The name is added to the "allowed_author_names" configuration entry and
    its stripped form is appended as a new line to the file named by the
    "authornames_filename" configuration entry.

    Raises:
        RuntimeError: if the file cannot be opened or written.
    """
    allowed_author_names = config.get("allowed_author_names")
    allowed_author_names.add(author_name)

    # Append to file; the context manager closes the handle even if the
    # write fails
    filename = config.get("authornames_filename")
    try:
        with open(filename, "a") as file:
            file.write(author_name.strip() + "\n")
    except Exception as error:
        # "except Exception" rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not turned into a RuntimeError
        raise RuntimeError('Unable to author to file: "%s".' % filename) from error
def html_format_chapters(paper):
    """Return the HTML string for a book chapter.

    The entry consists, in order, of the title, the authors, the book title
    (prefixed with "in"), the editors (only when present), the publisher,
    the chapter and pages (each only when present) and the year, combined
    with ``_html_join``.
    """
    parts = [
        _html_format_title(paper),
        _html_get_authors_string(paper["author"]),
        # NOTE(review): the book title is emitted with the "publisher" class;
        # kept as is since stylesheets may depend on it.
        'in <span class="%s_item_publisher">%s</span>' % (config.get("html_class_prefix"), paper["booktitle"]),
    ]

    if 'editor' in paper:
        parts.append(_html_format_editors(paper["editor"]))

    parts.append('<span class="%s_item_publisher">%s</span>' % (config.get("html_class_prefix"), paper["publisher"]))

    if "chapter" in paper:
        parts.append("chapter %s" % paper["chapter"])
    if "pages" in paper:
        parts.append("pp. %s" % _html_format_pages(paper["pages"]))

    parts.append('<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"]))

    return _html_join(parts)
def _validate_paper_pages(paper):
    """Validate and normalize the page range of the paper, if it has one.

    A page string is invalid when "require_page_range" is configured and
    the string contains no "-", or when one side of the range is empty.
    Invalid papers are flagged with ``paper["invalid"] = True``.

    A valid range is rewritten as ``first + page_separator + last`` (both
    sides stripped). If that differs from the stored string, the user is
    asked whether to accept the correction; declining flags the paper as
    invalid.
    """
    # Only check if we have pages
    if "pages" not in paper:
        return

    pages = paper["pages"]
    has_dash = "-" in pages
    suggestion = pages

    # A range may be mandatory
    invalid = bool(config.get("require_page_range")) and not has_dash

    if has_dash:
        # Prefer the double dash as separator when it occurs
        dash = "--" if "--" in pages else "-"
        first, last = pages.split(dash)[:2]
        if not first or not last:
            invalid = True

        # Reformat string
        suggestion = first.strip() + config.get(
            "page_separator") + last.strip()

    # Check for invalid page string
    if invalid:
        paper["invalid"] = True
        print(" Incorrectly formatted page string: " + pages)
        if config.get("autofix"):
            print(" Skipping paper.")
        else:
            input(" Skipping paper, press return to continue.")
        return

    # Nothing more to do unless the string was changed
    if suggestion == pages:
        return

    print(" Incorrectly formatted page string: " + pages)
    print(" Suggested correction: " + suggestion)
    if ask_user_yesno(
            " Would you like to accept the suggested correction:"):
        print(" Correcting page string.")
        paper["pages"] = suggestion
        return

    paper["invalid"] = True
    if config.get("autofix"):
        print(" Skipping paper.")
    else:
        input(" Skipping paper, press return to continue.")
def _add_venue(venue_type, venue_name):
    """Add a venue to the known venues.

    The name is appended to the configuration entry named
    ``venue_type + "s"`` and a line ``"<venue_type>: <venue_name>"`` is
    appended to the file named by the "local_venues_filename" configuration
    entry.

    Raises:
        RuntimeError: if the file cannot be opened or written.
    """

    # Append to list of known venues (remember at run-time)
    known_venues = config.get(venue_type + "s")
    known_venues.append(venue_name)

    # Append to file; the context manager closes the handle even if the
    # write fails
    filename = config.get("local_venues_filename")
    try:
        with open(filename, "a") as file:
            file.write("%s: %s\n" % (venue_type, venue_name))
    except Exception as error:
        # "except Exception" rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not turned into a RuntimeError
        raise RuntimeError('Unable to add local venue to file "%s".' % filename) from error
def _rst_mark_author(author, text):
    """Return the text wrapped in underscores if the author is marked.

    An author is marked when a "mark_author" configuration entry exists and
    contains the stripped author name; otherwise the text is returned
    unchanged.
    """
    if not config.has_key("mark_author"):
        return text
    if author.strip() not in config.get("mark_author"):
        return text
    return "_%s_" % text
def _xml_mark_author(author, text):
    """Return the text wrapped in an ``<author>`` element.

    The ``marked`` attribute is "True" when a "mark_author" configuration
    entry exists and contains the stripped author name, and "False"
    otherwise.
    """
    is_marked = config.has_key("mark_author") and author.strip() in config.get(
        "mark_author")
    if not is_marked:
        return '<author marked="False">%s</author>' % text
    return '<author marked="True">%s</author>' % text
def save_invalid_papers(papers):
    """Save the invalid papers among the given papers to a time-stamped file.

    Papers for which ``is_valid`` is false are written in the format
    produced by ``pub.write`` to a file named
    ``<invalid_filename_prefix>-<YYYYmmdd-HH:MM:SS>.pub``. Nothing is
    written when there are no invalid papers.

    Raises:
        RuntimeError: if the file cannot be opened or written.
    """

    # Extract invalid papers
    invalid_papers = [paper for paper in papers if not is_valid(paper)]

    # Don't save if there are no invalid papers
    if not invalid_papers:
        return

    # Generate filename. The minutes directive is "%M"; "%m" is the month
    # and previously produced e.g. "12:06:30" for 12:45:30 in June.
    date = time.strftime("%Y%m%d-%H:%M:%S")
    invalid_filename = config.get(
        "invalid_filename_prefix") + "-" + date + ".pub"

    # Write to file; the context manager closes the handle even if the
    # write fails
    text = pub.write(invalid_papers)
    print('Saving invalid papers to "%s".' % invalid_filename)
    try:
        with open(invalid_filename, "w") as file:
            file.write(text)
    except Exception as error:
        raise RuntimeError('Unable to save invalid papers to file "%s"' %
                           invalid_filename) from error
def save_database(merged_papers):
    """Save the papers to the database file, making a backup copy if needed.

    The text produced by ``pub.write`` is written to the file named by the
    "database_filename" configuration entry. If that file already exists
    and its size differs from the length of the new text, it is first
    copied to ``<database_filename>.bak``.

    The file is written with the default text encoding. If the text cannot
    be encoded that way, the file is rewritten as UTF-8.

    Raises:
        RuntimeError: if the backup copy cannot be created, or if the UTF-8
            fallback write fails.
    """

    database_filename = config.get("database_filename")

    # Generate text to be written to file
    text = pub.write(merged_papers)
    print("")

    # Make backup copy if needed (file size of generated file is different from the current)
    # TODO: Register if changes has been made and write backup file based on that
    # (instead of just comparing file sizes)
    if os.path.isfile(database_filename) and len(text) != os.path.getsize(database_filename):
        backup_filename = database_filename + ".bak"
        print('Saving backup copy of database to file "%s"' % backup_filename)
        try:
            shutil.copyfile(database_filename, backup_filename)
        except Exception as error:
            raise RuntimeError("Unable to create backup copy of database") from error

    # Write the database
    print('Saving database to file "%s"' % database_filename)
    file = open(database_filename, "w")
    try:
        file.write(text)
    except UnicodeEncodeError:
        # A text-mode file does not accept bytes, so encoding the text by
        # hand cannot work here. Reopen the file as UTF-8 and write the
        # text again instead.
        file.close()
        try:
            file = open(database_filename, "w", encoding="utf-8")
            file.write(text)
        except Exception as e:
            raise RuntimeError('Unable to save database to file "%s"\n%s' %
                               (database_filename, str(e))) from e
    finally:
        # Closing an already closed file is harmless
        file.close()
def write(papers):
    """Format the given list of papers in the BibTeX format.

    Each paper becomes one ``@<entrytype>{<key>, ...}`` entry. The entry
    type is looked up in the "category2entrytype" configuration by
    ``paper["category"]``; the citation key is ``paper["key"]`` or, if
    absent, ``"paper<index>"``. Entries are separated by a blank line.

    The attributes are those yielded by
    ``ordered_attributes(paper, _ignores)``, except "entrytype" and "key".
    "author" and "editor" are joined with " and ". "sortkey" is written
    under the BibTeX field name "key".

    Returns:
        The BibTeX text as a single string.
    """
    entries = []

    for i, paper in enumerate(papers):
        entry_type = config.get("category2entrytype")[paper["category"]]
        key = paper["key"] if "key" in paper else "paper%d" % i

        lines = ["@%s{%s,\n" % (entry_type, key)]
        for attribute in ordered_attributes(paper, _ignores):
            if attribute in ("entrytype", "key"):
                continue

            # Read the value under the paper's own attribute name ...
            if attribute in ("author", "editor"):
                value = " and ".join(paper[attribute])
            else:
                value = str(paper[attribute])

            # ... and only then rename: sortkey becomes key in Bibtex.
            # Renaming before the lookup would emit the citation key instead
            # of the sort key (or fail when the paper has no "key").
            field = "key" if attribute == "sortkey" else attribute

            lines.append(" %s = {%s},\n" % (field, value))
        lines.append("}\n")

        entries.append("".join(lines))

    # Joining puts a blank line between entries; comparing each paper with the
    # last one would drop the separator after a paper equal to the last.
    return "\n".join(entries)
def validate_file(filename=None):
    """Validate the papers in a file and save the result to the database.

    With no filename and no existing database file there is nothing to do.
    Otherwise the papers are read, validated, given keys, checked for PDF
    files and duplicates, summarized, and finally saved: the processed
    papers to the database and the invalid ones to a separate file.
    """
    # Use default database if file is not specified
    database_exists = os.path.isfile(config.get("database_filename"))
    if filename is None and not database_exists:
        print("No file specified and no database found, nothing to do.")
        return

    # Open and read database
    papers = read_database(filename)

    # Validate papers
    database_papers, invalid_papers = validate_papers(papers)

    # Generate keys
    # NOTE(review): this replaces the validated list with the result for all
    # papers; kept as in the original - confirm that this is intended.
    database_papers = generate_keys(papers)

    # Check for PDF files
    num_found, num_missing = check_pdf_files(papers)

    # Checking for duplicates
    database_papers = process_duplicates(database_papers)

    # Print summary
    print_summary(papers, num_found, num_missing)

    # Save papers to database, and the invalid ones separately
    save_database(database_papers)
    save_invalid_papers(invalid_papers)
def ask_user_alternatives(question, alternatives):
    """Ask the user to choose one of the alternatives.

    The question and the numbered alternatives are printed. When "autofix"
    is configured, the first alternative is chosen without asking.
    Otherwise the user is prompted until a valid number is entered; an
    empty answer selects the first alternative.

    Returns:
        The zero-based index of the chosen alternative.
    """
    count = len(alternatives)

    while True:
        print(question)
        for number, alternative in enumerate(alternatives, 1):
            print(" [%d] %s" % (number, alternative))

        # E.g. "1, 2 or 3"
        leading = ", ".join(str(number) for number in range(1, count))
        numbers = leading + " or " + str(count)

        if config.get("autofix"):
            print(" Autofix enabled, choosing default (1).")
            return 0

        answer = input("Please enter %s (or press return to choose [1]): " %
                       numbers)

        if answer.strip() == "":
            choice = 1
        else:
            try:
                choice = int(answer)
            except ValueError:
                # Not a number: use a value that can never be a valid choice
                choice = -1

        if 1 <= choice <= count:
            return choice - 1

        print("Illegal option.")
def _validate_paper_venue(paper):
    """Validate that the venue (journal, conference etc) of the paper is known.

    The venue attribute to check is looked up in the "category_venues"
    configuration by ``paper["category"]``; ``None`` means there is nothing
    to check. The known venues are read from the configuration entry named
    ``venue_type + "s"``.

    For an unknown venue, ``_suggest_venue`` is asked for a replacement.
    Without a suggestion the user may add the venue, otherwise the paper is
    flagged invalid. With a suggestion the user chooses between replacing
    the venue, adding the venue, and skipping the paper.

    Raises:
        RuntimeError: if ``ask_user_alternatives`` returns an unexpected
            option.
    """

    # Get venue type ("journal", "booktitle", etc)
    category = paper["category"]
    venue_type = config.get("category_venues")[category]

    # Skip if venue is None (nothing to check)
    if venue_type is None:
        return

    # Nothing to do for a known venue
    known_venues = config.get(venue_type + "s")
    venue_name = paper[venue_type]
    if venue_name in known_venues:
        return

    print("")
    print(' Unknown %s: "%s"' % (venue_type, venue_name))

    suggested_venue = _suggest_venue(venue_name, known_venues)

    # No suggestion: add the venue or give up
    if suggested_venue is None:
        wants_to_add = ask_user_yesno(
            ' Would you like to add %s "%s"?' % (venue_type, venue_name),
            "no")
        if wants_to_add:
            _add_venue(venue_type, venue_name)
        else:
            print(" Skipping paper.")
            paper["invalid"] = True
        return

    # Suggestion available: let the user decide
    print(' Suggested %s: "%s"' % (venue_type, suggested_venue))
    options = ("Replace %s." % venue_type, "Add %s." % venue_type,
               "Skip paper.")
    chosen = ask_user_alternatives(
        " Unknown %s, what should I do?" % venue_type, options)
    print("")

    if chosen == 0:
        paper[venue_type] = suggested_venue
    elif chosen == 1:
        _add_venue(venue_type, venue_name)
    elif chosen == 2:
        print(" Skipping paper (unable to guess the right %s)" % venue_type)
        input(" Press return to continue.")
        paper["invalid"] = True
    else:
        raise RuntimeError("Unknown option.")
def _html_mark_author(author, text):
    """Return the text wrapped in ``<strong>`` if the author is marked.

    An author is marked when the stripped author name is contained in the
    "mark_author" configuration entry; otherwise the text is returned
    unchanged.
    """
    is_marked = author.strip() in config.get("mark_author")
    if not is_marked:
        return text
    return "<strong>%s</strong>" % text
def _latex_mark_author(author, text):
    """Return the text wrapped in ``\\textbf{}`` if the author is marked.

    An author is marked when the stripped author name is contained in the
    "mark_author" configuration entry; otherwise the text is returned
    unchanged.
    """
    is_marked = author.strip() in config.get("mark_author")
    if not is_marked:
        return text
    return "\\textbf{%s}" % text
def rst_format_theses(paper):
    """Return the reSt string for a thesis.

    The entry consists, in order, of the authors, the title, the thesis
    type string (looked up in the "thesistype_strings" configuration by
    ``paper["thesistype"]``), the school and the year, combined with
    ``_rst_join``.
    """
    authors = _rst_get_authors_string(paper)
    title = _rst_format_title(paper)
    thesis_type = config.get("thesistype_strings")[paper["thesistype"]]
    return _rst_join([authors, title, thesis_type, paper["school"], paper["year"]])
def html_format_reports(paper):
    """Return the HTML string for a report.

    The entry consists, in order, of the title, the authors, the
    institution and the year, combined with ``_html_join``.
    """
    title = _html_format_title(paper)
    authors = _html_get_authors_string(paper["author"])
    institution = paper["institution"]
    year = '<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"])
    return _html_join([title, authors, institution, year])
def write(papers):
    """Format the given list of papers in the pub format.

    Papers are grouped by category. The categories are the configured
    "categories" when "use_standard_categories" is set, and otherwise the
    sorted categories that occur in the papers. Each non-empty category
    starts with a ``"* <category>"`` line. Each paper contributes a
    ``"** <title>"`` line (with "missing" as the title when it has none)
    followed by the output of ``write_paper``.

    Returns:
        The pub text as a single string.
    """
    if config.get("use_standard_categories"):
        categories = config.get("categories")
    else:
        categories = sorted({paper["category"] for paper in papers})

    chunks = []
    for category in categories:

        # Extract papers in category, skipping empty categories
        members = [paper for paper in papers if paper["category"] == category]
        if not members:
            continue

        # Write category
        chunks.append("* %s\n" % category)

        for paper in members:

            # Write title
            title = paper["title"] if "title" in paper else "missing"
            try:
                heading = "** %s\n" % title
            except UnicodeDecodeError:
                heading = "** %s\n" % title.decode('utf-8')
            chunks.append(heading)

            # Write attributes
            chunks.append(write_paper(paper, ["category", "title"] + _ignores))

    return "".join(chunks)
def export_file(filename, filters=None):
    """Export the papers in the database to a file in the desired format.

    The database is read, filtered with ``filter_papers`` and written with
    the writer selected by the filename suffix ("bib"/"bibtex", "pub",
    "tex", "pdf", "html", "rst" or "graphml"). A summary is printed
    afterwards.

    The file is written with the default text encoding. If the text cannot
    be encoded that way, the file is rewritten as UTF-8.

    Args:
        filename: Name of the file to export to. It must differ from the
            default database.
        filters: Filters passed on to ``filter_papers``; defaults to an
            empty list.

    Raises:
        RuntimeError: if ``filename`` is the default database, the suffix
            is not a known format, or the file cannot be written.
    """

    # A fresh list per call instead of a shared mutable default argument
    if filters is None:
        filters = []

    # Make sure we don't overwrite the database
    database_filename = config.get("database_filename")
    if filename == database_filename:
        raise RuntimeError('Papers cannot be exported to the default database ("%s").' % database_filename)

    # Read database
    database_papers = read_database(database_filename)

    # Why should the database be validated on export?
    #(valid_papers, invalid_papers) = validate_papers(database_papers)

    # Filter papers
    filtered_papers = filter_papers(database_papers, filters)

    # Get the filename suffix
    suffix = filename.split(".")[-1]

    # Choose format based on suffix. Only the selected writer is looked up.
    if suffix in ("bib", "bibtex"):
        write = bibtex.write
    elif suffix == "pub":
        write = pub.write
    elif suffix == "tex":
        write = latex.write
    elif suffix == "pdf":
        write = pdf.write
    elif suffix == "html":
        write = html.write
    elif suffix == "rst":
        write = rst.write
    elif suffix == "graphml":
        write = graphml.write
    else:
        raise RuntimeError("Unknown file format.")

    # Generate the text and write it to file
    text = write(filtered_papers)
    file = open(filename, "w")
    try:
        file.write(text)
    except UnicodeEncodeError:
        # A text-mode file does not accept bytes, so encoding the text by
        # hand cannot work here. Reopen the file as UTF-8 and write the
        # text again instead.
        file.close()
        try:
            file = open(filename, "w", encoding="utf-8")
            file.write(text)
        except Exception as e:
            raise RuntimeError('Unable to write file "%s" (exception %s: %s)' % (filename, type(e), e)) from e
    except Exception as e:
        raise RuntimeError('Unable to write file "%s" (exception %s: %s)' % (filename, type(e), e)) from e
    finally:
        # Closing an already closed file is harmless
        file.close()

    # Print summary
    print_summary(filtered_papers)
    print("")
    print("Exported %d paper(s) to %s." % (len(filtered_papers), filename))
def html_format_misc(paper):
    """Return the HTML string for a miscellaneous entry.

    The entry starts with the title and the authors. The attributes
    "howpublished", "booktitle", "meeting", "thesistype", "school",
    "chapter", "volume", "pages" and "year" follow in that order, each only
    when present, and everything is combined with ``_html_join``.

    When "howpublished" contains "http://" and the title holds no link
    yet, the title is turned into a link instead of "howpublished" being
    listed. The link is taken to be "http://" plus everything after its
    last occurrence.
    """
    title = _html_format_title(paper)
    authors = _html_get_authors_string(paper["author"])

    details = []

    if "howpublished" in paper:
        howpublished = paper["howpublished"]
        if "http://" in howpublished and "<a href" not in title:
            link = ("http://" + howpublished.split("http://")[-1]).strip()
            title = '<a href="%s">%s</a>' % (link, title)
        else:
            details.append(howpublished)

    if "booktitle" in paper:
        details.append("in <i>%s</i>" % paper["booktitle"])
    if "meeting" in paper:
        details.append(paper["meeting"])
    if "thesistype" in paper:
        details.append(config.get("thesistype_strings")[paper["thesistype"]])
    if "school" in paper:
        details.append(paper["school"])
    if "chapter" in paper:
        details.append("chapter %s" % paper["chapter"])
    if "volume" in paper:
        details.append("vol. %s" % paper["volume"])
    if "pages" in paper:
        details.append("pp. %s" % _html_format_pages(paper["pages"]))
    if "year" in paper:
        details.append('<span class="%s_item_year">%s</span>' % (config.get("html_class_prefix"), paper["year"]))

    return _html_join([title, authors] + details)
def _html_get_authors_string(authors):
    """Convert the author tuple to an HTML author string.

    Each author is shortened with ``short_author``, stripped and passed
    through ``_html_mark_author``. A single name is used as is. Several
    names are joined with commas, with " and " before the last one; a final
    "others" is rendered as " et al." instead. The result is wrapped in a
    ``<span>`` with the "<prefix>_item_authors" class.
    """
    names = [_html_mark_author(author, short_author(author).strip()) for author in authors]

    if len(names) == 1:
        joined = names[0]
    elif names[-1] == "others":
        joined = ", ".join(names[:-1]) + " et al."
    else:
        joined = ", ".join(names[:-1]) + " and " + names[-1]

    return '<span class="%s_item_authors">%s</span>' % (config.get("html_class_prefix"), joined)
def _xml_mark_author(author, text):
    """Return the text wrapped in an ``<author>`` element.

    The ``marked`` attribute is "True" when a "mark_author" configuration
    entry exists and contains the stripped author name, and "False"
    otherwise.
    """
    if not config.has_key("mark_author"):
        return '<author marked="False">%s</author>' % text
    if author.strip() not in config.get("mark_author"):
        return '<author marked="False">%s</author>' % text
    return '<author marked="True">%s</author>' % text
def _rst_mark_author(author, text):
    """Return the text wrapped in underscores if the author is marked.

    An author is marked when a "mark_author" configuration entry exists and
    contains the stripped author name; otherwise the text is returned
    unchanged.
    """
    is_marked = config.has_key("mark_author") and author.strip() in config.get("mark_author")
    return "_%s_" % text if is_marked else text