# GenericFuzzySimilarityChecker.__init__ (fragment; the rest of the class
# follows further below).
def __init__(self):
    self._cfg = Config()
    self._name = type(self).NAME
    self._cls = type(self)
    if self._name not in GenericFuzzySimilarityChecker.SEEN_NAMES:
        GenericFuzzySimilarityChecker.SEEN_NAMES[self._name] = set()
class InitialDottedChecker(object):
    NAME = 'author_initial_dotted'

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        authors = await self.check_one("authors", "Author", entry)
        editors = await self.check_one("editors", "Editor", entry)
        return authors + editors

    async def check_one(self, field, name, entry):
        should_dot = self._cfg.get('author_initial_want_dotted', entry, True)
        problems = []
        for author in getattr(entry, field):
            (first, last) = author
            words = first.split(" ") + last.split(" ")
            for word in words:
                if len(word) == 0:
                    continue
                if not any(c.islower() for c in word):
                    if should_dot and word[-1] != '.':
                        problems.append(
                            (type(self).NAME,
                             "{} {} {} seems to have an undotted initial."
                             .format(name, first, last),
                             ""))
                    if not should_dot and word.find('.') != -1:
                        problems.append(
                            (type(self).NAME,
                             "{} {} {} seems to have a dotted initial."
                             .format(name, first, last),
                             ""))
        return problems
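# A minimal sketch of what check() reports, assuming the Config singleton is
# already initialized and using a hypothetical stub in place of bibchex's
# Entry class (the real Entry also carries BibTeX data and options):
#
#     class StubEntry:
#         authors = [("J", "Doe")]   # undotted initial
#         editors = []
#
#     problems = asyncio.get_event_loop().run_until_complete(
#         InitialDottedChecker().check(StubEntry()))
#     # With 'author_initial_want_dotted' at its default (True), this yields:
#     # [('author_initial_dotted',
#     #   'Author J Doe seems to have an undotted initial.', '')]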
# GenericAbbrevChecker.__init__ (fragment).
def __init__(self):
    self._cfg = Config()
    self._name = type(self).NAME
    self._cls = type(self)
    if self._name not in GenericAbbrevChecker.SEEN_NAMES:
        GenericAbbrevChecker.SEEN_NAMES[self._name] = set()
class RequiredFieldsChecker(object):
    NAME = "required_fields"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        problems = []
        required_fields = self._cfg.get('required', entry)
        for field_raw in required_fields:
            field = field_raw.lower()
            if field == 'author':
                # Special handling
                if len(entry.authors) == 0:
                    problems.append((type(self).NAME,
                                     "Required field 'author' missing", ""))
            elif field == 'editor':
                # Special handling
                if len(entry.editors) == 0:
                    problems.append((type(self).NAME,
                                     "Required field 'editor' missing", ""))
            else:
                if field not in entry.data:
                    problems.append(
                        (type(self).NAME,
                         "Required field '{}' missing".format(field), ""))
        return problems
class ForbiddenFieldsChecker(object):
    NAME = "forbidden_fields"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        problems = []
        forbidden_fields = self._cfg.get('forbidden', entry, [])
        for field_raw in forbidden_fields:
            field = field_raw.lower()
            if field == 'author':
                # Special handling
                if len(entry.authors) > 0:
                    problems.append((type(self).NAME,
                                     "Forbidden field 'author' present", ""))
            elif field == 'editor':
                # Special handling
                if len(entry.editors) > 0:
                    problems.append((type(self).NAME,
                                     "Forbidden field 'editor' present", ""))
            else:
                if field in entry.data:
                    problems.append(
                        (type(self).NAME,
                         "Forbidden field '{}' present".format(field), ""))
        return problems
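# Both checkers read plain lists from the configuration, under the keys
# 'required' and 'forbidden' queried above. A hypothetical bibchex.json
# fragment (the surrounding file structure is an assumption for
# illustration):
#
#     {
#         "required": ["author", "title", "year"],
#         "forbidden": ["note", "annote"]
#     }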
def __init__(self, filename, out_filename): self._fname = filename self._out_filename = out_filename self._bibtex_data = None self._entries = {} self._suggestions = {} self._retrieval_errors = [] self._diffs = [] self._problems = [] self._global_problems = [] self._unifier = Unifier() self._ui = UI() self._cfg = Config()
def __init__(self, ui):
    self._ui = ui
    self._cfg = Config()
    # dx.doi.org (sometimes) has very harsh rate limits. This seems to be
    # some Cloudflare magic.
    self._ratelimit = AsyncRateLimiter(50, 10)
    self._doi_ratelimit = AsyncRateLimiter(20, 10)
    self._max_retries = 5
    self._retry_pause = 10  # Wait an additional 10 seconds before a retry
# GenericFuzzySimilarityChecker.complete (fragment); called on the class, so
# it is a classmethod.
@classmethod
async def complete(cls, ui):
    cfg = Config()

    def compute(seen_names, chunk_count, chunk_number):
        problems = []
        # nn1/nn2 are the normalized forms of the names
        for ((n1, nn1), (n2, nn2)) in chunked_pairs(
                list(seen_names), chunk_count, chunk_number):
            if nn1 == nn2:
                continue
            if fuzz.partial_ratio(nn1, nn2) > 90:  # TODO make configurable
                problems.append(
                    (name,
                     "{} names '{}' and '{}' seem very similar."
                     .format(cls.MSG_NAME, n1, n2),
                     ""))
        return problems

    name = cls.NAME
    item_count = len(GenericFuzzySimilarityChecker.SEEN_NAMES[name])
    LOGGER.info((f"Fuzzy-checking pairwise similarity "
                 f"of {cls.MSG_NAME}s. Testing "
                 f"{item_count * (item_count - 1) // 2} pairs. "
                 "This might take a while."))

    chunk_count = min(len(os.sched_getaffinity(0)) * 10,
                      len(GenericFuzzySimilarityChecker.SEEN_NAMES[name]))
    tasks = []
    for i in range(0, chunk_count):
        tasks.append(
            asyncio.get_event_loop().run_in_executor(
                cfg.get_executor(), compute,
                GenericFuzzySimilarityChecker.SEEN_NAMES[name],
                chunk_count, i))

    collected_results = await asyncio.gather(*tasks)
    # Flatten lists
    return [item for sublist in collected_results for item in sublist]
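# chunked_pairs is imported from bibchex's utilities and is not shown here.
# Judging from its use above, it must partition the unordered pairs of its
# input among chunk_count workers. A minimal sketch of a compatible
# implementation (an assumption, not the actual helper):


def chunked_pairs_sketch(items, chunk_count, chunk_number):
    """Yield every unordered pair (items[i], items[j]) with i < j whose
    running pair index falls into the given chunk (round-robin)."""
    index = 0
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            if index % chunk_count == chunk_number:
                yield (items[i], items[j])
            index += 1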
def main(passed_args=None):
    if passed_args is None:
        passed_args = sys.argv[1:]
    args = parser.parse_args(passed_args)

    if args.ui_gui:
        UI.select_gui()
    elif args.ui_cli:
        UI.select_cli()
    elif args.ui_silent:
        UI.select_silent()

    if args.config:
        Config(args.config)
    else:
        home = os.path.expanduser("~")
        user_cfg = os.path.join(home, '.config', 'bibchex.json')
        if os.path.isfile(user_cfg):
            Config(user_cfg)
        else:
            Config()

    ui = UI()

    loop = asyncio.get_event_loop()
    loop.set_debug(True)
    loop.set_default_executor(concurrent.futures.ThreadPoolExecutor(20))
    try:
        c = Checker(args.input_file[0], args.output_file[0])
        loop.run_until_complete(c.run())
    except Exception as e:
        exc_str = traceback.format_exc()
        ui.error("Exception", str(e))
        ui.error("Traceback", exc_str)

    ui.wait()
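# The argparse parser referenced above is defined elsewhere in the module and
# is not shown. Assuming it declares positional input_file / output_file
# arguments plus the flags used above, and that the installed console script
# is named bibchex, a typical invocation would look like:
#
#     bibchex --config my-bibchex.json references.bib report.html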
class PreferDateChecker(object):
    NAME = 'prefer_date'

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        # Complain if 'date' is unset but 'month' or 'day' is set, or if any
        # of 'year'/'month'/'day' is set while 'prefer_date_or_year' is
        # disabled.
        if ((entry.data.get('date') is None) and
            ((any(entry.data.get(key) for key in ('year', 'month', 'day'))
              and not self._cfg.get('prefer_date_or_year', entry, True))
             or any(entry.data.get(key) for key in ('month', 'day')))):
            return [(type(self).NAME,
                     ("The 'date' field is preferred over "
                      "the 'day/month/year' fields."),
                     "")]
        return []
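# Illustrative cases for the condition above (assuming the default
# configuration, i.e. 'prefer_date_or_year' is True):
#
#     {'date': '2020-01'}             -> no problem ('date' is set)
#     {'year': '2020'}                -> no problem (bare years are allowed)
#     {'year': '2020', 'month': '1'}  -> problem ('month' implies 'date')
#
# With 'prefer_date_or_year' set to False, even a bare 'year' is reported.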
class JournalAbbrevChecker(object):
    NAME = 'journal_abbrev'
    FIELDS = ['booktitle', 'journal']

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        problems = []
        acceptable = set(self._cfg.get("acceptable_abbreviations",
                                       entry, []))
        for field in JournalAbbrevChecker.FIELDS:
            val = entry.data.get(field, '')
            if contains_abbreviation(val, acceptable=acceptable):
                problems.append(
                    (type(self).NAME,
                     "Publication title '{}' seems to contain "
                     "an abbreviation".format(val),
                     ""))
        return problems
class ISBNFormatChecker(object):
    NAME = "isbn_format"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        fmt = self._cfg.get('isbn_format', entry)
        if not fmt:
            return []
        isbn = entry.data.get('isbn')
        if not isbn:
            return []
        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if fmt not in ('canonical', 'masked'):
            raise ConfigurationError(
                "The option 'isbn_format' must be either "
                "'canonical' or 'masked'.")

        if fmt == 'canonical':
            cisbn = canonical(clean_isbn)
            if cisbn != isbn:
                return [(type(self).NAME,
                         "ISBN '{}' is not in canonical format."
                         .format(isbn),
                         "Canonical format would be '{}'".format(cisbn))]
        elif fmt == 'masked':
            misbn = mask(clean_isbn)
            if misbn != isbn:
                return [(type(self).NAME,
                         "ISBN '{}' is not in masked format.".format(isbn),
                         "Masked format would be '{}'".format(misbn))]
        return []
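# For reference, the two accepted formats of the same ISBN (the helper names
# used above match isbnlib's API, which these imports presumably come from):
#
#     canonical: 9783161484100
#     masked:    978-3-16-148410-0
#
# canonical() strips all separators; mask() re-inserts the hyphens at the
# correct group boundaries.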
class ISBNLengthChecker(object):
    NAME = "isbn_length"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        length = self._cfg.get('isbn_length', entry, 13)
        if not length:
            return []
        isbn = entry.data.get('isbn')
        if not isbn:
            return []
        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if length not in (10, 13):
            raise ConfigurationError(
                "The option 'isbn_length' must be either 10 or 13.")

        if length == 10:
            if not is_isbn10(clean_isbn):
                return [(type(self).NAME,
                         "ISBN '{}' is not of length 10.".format(isbn),
                         "ISBN-10 would be '{}'"
                         .format(to_isbn10(clean_isbn)))]
        elif length == 13:
            if not is_isbn13(clean_isbn):
                return [(type(self).NAME,
                         "ISBN '{}' is not of length 13.".format(isbn),
                         "ISBN-13 would be '{}'"
                         .format(to_isbn13(clean_isbn)))]
        return []
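# Example of the conversion hint in the messages above: for the ISBN-13
# 9783161484100, to_isbn10() yields 316148410X (note the 'X' check digit),
# and to_isbn13('316148410X') maps back to 9783161484100.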
class GenericStringFormatChecker(object):
    PROVIDE_FIELDS = ['year', 'date']

    def __init__(self):
        self._cfg = Config()
        self._name = type(self).NAME
        self._field = type(self).FIELD
        self._cls = type(self)

    async def check(self, entry):
        formats = self._cfg.get(self._cls.FORMAT_FIELD, entry)
        field_data = entry.data.get(self._field)
        if formats is None or field_data is None:
            return []

        entry_data = {field: str(entry.data.get(field, ''))
                      for field in GenericStringFormatChecker.PROVIDE_FIELDS}
        # Special 'short year', i.e., the last two digits of the year
        entry_data['short_year'] = entry_data['year'][-2:]

        if not isinstance(formats, list):
            formats = [formats]

        for form in formats:
            re_str = form.format(**entry_data)
            m = re.match(re_str, field_data)
            if m:
                # Everything A-Okay
                return []

        return [(self._name,
                 f"Format for field {self._field} incorrect.", "")]
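# A hypothetical concrete subclass, to show how the pieces fit together (the
# NAME / FIELD / FORMAT_FIELD values and the config key are assumptions for
# illustration, not part of bibchex):


class PagesFormatChecker(GenericStringFormatChecker):
    NAME = 'pages_format'
    FIELD = 'pages'
    FORMAT_FIELD = 'pages_formats'


# With a config entry such as
#     "pages_formats": ["^\\d+--\\d+$"]
# this would flag page ranges like '100-110' (single hyphen). The {year},
# {date} and {short_year} placeholders are substituted into the format
# strings before they are used as regular expressions.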
class Differ(object):
    FIELD_PROPERTIES = {
        'doi': {'case': False},
        'title': {'first_letter_case': False},
        'isbn': {'diff_func': isbn_differ},
        'issn': {'list': True, 'ignore_order': True}
    }

    def __init__(self, entry):
        self._entry = entry
        self._cfg = Config()

    def diff(self, suggestion):
        """Compute and return a list of differences between the entry of
        this Differ and the suggestion passed."""
        return self._diff_general(suggestion) + \
            self._diff_people('authors', suggestion) + \
            self._diff_people('editors', suggestion)

    def _diff_people(self, field, suggestion):
        diffs = []
        assert field in ('authors', 'editors')
        if field == 'authors':
            singular = 'Author'
        else:
            singular = 'Editor'

        sugg_field = getattr(suggestion, field)
        entry_field = getattr(self._entry, field)

        if len(sugg_field) == 0:
            return []

        for i in range(0, max(len(sugg_field), len(entry_field))):
            if i >= len(sugg_field):
                diffs.append(
                    Difference(
                        self._entry.get_id(), suggestion.source,
                        '{} {}'.format(singular, i + 1),
                        'Person not present in retrieved {}: {} {}'
                        .format(field, *entry_field[i])))
                continue
            if i >= len(entry_field):
                diffs.append(
                    Difference(
                        self._entry.get_id(), suggestion.source,
                        '{} {}'.format(singular, i + 1),
                        'Additional person in retrieved {}: {} {}'
                        .format(field, *sugg_field[i])))
                continue

            entry_first, entry_last = entry_field[i]
            sugg_first, sugg_last = sugg_field[i]
            raw_sugg_first = sugg_first
            raw_sugg_last = sugg_last

            if self._cfg.get('authors_ignore_allcaps', self._entry, True):
                if is_allcaps("{} {}".format(sugg_first, sugg_last)):
                    sugg_first = sugg_first.lower()
                    sugg_last = sugg_last.lower()
                    entry_first = entry_first.lower()
                    entry_last = entry_last.lower()

            difference = False
            if crush_spaces(sugg_last) != crush_spaces(entry_last):
                difference = True

            # Check first names individually
            entry_first_words = entry_first.split(" ")
            sugg_first_words = sugg_first.split(" ")
            if len(entry_first_words) != len(sugg_first_words):
                difference = True
            else:
                for (word_e, word_s) in zip(entry_first_words,
                                            sugg_first_words):
                    if not is_initial(word_e) and not is_initial(word_s):
                        difference |= (word_e != word_s)
                    else:
                        difference |= (word_s[0] != word_e[0])

            if difference:
                diffs.append(
                    Difference(
                        self._entry.get_id(), suggestion.source,
                        '{} {}'.format(singular, i + 1),
                        'Suggested {} name: {} {}'
                        .format(singular.lower(), raw_sugg_first,
                                raw_sugg_last)))
        return diffs

    def _diff_general(self, suggestion):
        diffs = []
        # Find fields where we have data in the entry which is different
        # from the data in the suggestion.
        for field in self._entry.data.keys():
            if field in suggestion.data:
                suggestion_data = suggestion.data[field]
                entry_data = unlatexify(self._entry.data[field])

                # Cast everything to string
                suggestion_data = [(str(d), kind)
                                   for (d, kind) in suggestion_data]

                # Unify hyphens
                entry_data = unify_hyphens(entry_data)
                suggestion_data = [(unify_hyphens(d), kind)
                                   for (d, kind) in suggestion_data]

                # Crush spaces
                entry_data = crush_spaces(entry_data)
                suggestion_data = [(crush_spaces(d), kind)
                                   for (d, kind) in suggestion_data]

                field_props = Differ.FIELD_PROPERTIES.get(field, {})

                if not field_props.get('case', True):
                    entry_data = entry_data.lower()
                    suggestion_data = [(d.lower(), kind)
                                       for (d, kind) in suggestion_data]

                if not field_props.get('first_letter_case', True):
                    entry_data = lower_case_first_letters(entry_data)
                    suggestion_data = [(lower_case_first_letters(d), kind)
                                       for (d, kind) in suggestion_data]

                if 'diff_func' in field_props:
                    # The diff_func must handle plain / re suggestions by
                    # itself!
                    if field_props['diff_func'](entry_data,
                                                suggestion_data):
                        diffs.append(
                            Difference(
                                self._entry.get_id(), suggestion.source,
                                field,
                                [d for (d, kind)
                                 in suggestion.data[field]]))
                elif not field_props.get('list', False):
                    hit = False
                    for (d, kind) in suggestion_data:
                        if kind == Suggestion.KIND_RE:
                            if re.match(d, entry_data):
                                hit = True
                        else:
                            # Plain comparison
                            hit |= (entry_data == d)
                    if not hit:
                        diffs.append(
                            Difference(
                                self._entry.get_id(), suggestion.source,
                                field,
                                [d for (d, kind)
                                 in suggestion.data[field]]))
                else:
                    # List comparison
                    entry_list = [e.strip() for e in entry_data.split(',')]
                    hit = False
                    for (d, kind) in suggestion_data:
                        if kind == Suggestion.KIND_RE:
                            LOGGER.error(
                                "List-of-regex is not supported as "
                                "suggestion.")
                            continue
                        sugg_list = [e.strip() for e in d.split(',')]
                        if field_props.get('ignore_order', False):
                            hit |= (set(entry_list) == set(sugg_list))
                        else:
                            hit |= (entry_list == sugg_list)
                    if not hit:
                        diffs.append(
                            Difference(
                                self._entry.get_id(), suggestion.source,
                                field,
                                [d for (d, kind)
                                 in suggestion.data[field]]))

        # Find fields in the 'wanted' option for which the suggestion has
        # data, but the entry has not.
        wanted = set(self._cfg.get('wanted', self._entry, []))
        forbidden = set(self._cfg.get('forbidden', self._entry, []))
        wanted = wanted - forbidden

        for field in wanted:
            if field not in self._entry.data and field in suggestion.data:
                diffs.append(
                    Difference(self._entry.get_id(), suggestion.source,
                               field,
                               [d for (d, kind)
                                in suggestion.data[field]]))
        return diffs
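# The contract of a 'diff_func' (such as isbn_differ above): it receives the
# normalized entry value plus the list of (value, kind) suggestion tuples and
# returns True iff they differ. A minimal sketch of a custom one
# (hypothetical, for illustration only):


def caseless_differ(entry_data, suggestion_data):
    """Report a difference unless some plain (non-regex) suggestion matches
    the entry value case-insensitively."""
    return not any(d.lower() == entry_data.lower()
                   for (d, kind) in suggestion_data
                   if kind != Suggestion.KIND_RE)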
class CrossrefSource(object):
    QUERY_FIELDS = ['doi']
    DOI_URL_RE = re.compile(r'https?://(dx\.)?doi\.org/.*')

    def __init__(self, ui):
        self._ui = ui
        self._cfg = Config()

        # Check if we have Crossref credentials and set them via environment
        # variables. The environment variables are read by crossref_commons.
        if self._cfg.get('crossref_plus'):
            LOGGER.info("Setting Crossref Plus token")
            os.environ['CR_API_PLUS'] = self._cfg.get('crossref_plus')

        if (self._cfg.get('crossref_mailto') and
                len(self._cfg.get('crossref_mailto')) > 0):
            # TODO make version dynamic
            os.environ['CR_API_AGENT'] = \
                ('BibChex/0.1 '
                 '(https://github.com/tinloaf/bibchex; mailto:{})').format(
                     self._cfg.get('crossref_mailto'))
            os.environ['CR_API_MAILTO'] = self._cfg.get('crossref_mailto')
        else:
            LOGGER.warning(
                ("\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 " Please set crossref_mailto in your config! \n"
                 " Not setting crossref_mailto may cause all your CrossRef"
                 " requests to fail."
                 "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"))
            os.environ['CR_API_AGENT'] = \
                'BibChex/0.1 (https://github.com/tinloaf/bibchex)'

    def _get_doi_blocking(self, entry, step):
        """
        Steps:
          1: query by title + authors (first and last names)
          2: query by title + authors (only last names)
          3: query by title
        """
        title = entry.data.get('title')
        if title is None:
            # Without a title, we're chanceless.
            return None

        q = [('bibliographic', title)]
        if step in (1, 2):
            for (first, last) in entry.authors:
                if step == 2:
                    q.append(('author', "{}".format(last)))
                else:
                    q.append(('author', "{} {}".format(first, last)))

        try:
            (count, results) = crossref_commons.search.search_publication(
                q, sort="relevance", order="desc")
        except Exception as e:
            LOGGER.error((f"Error reverse-searching for {entry.get_id()}: "
                          f"{e}"))
            return None

        if count > 0 and results:
            for i in range(0, min(10, count)):
                if 'title' not in results[i] or 'DOI' not in results[i]:
                    # Bogus data
                    continue
                suggested_title = results[i]['title']
                doi = results[i]['DOI']
                if not isinstance(suggested_title, list):
                    suggested_title = [suggested_title]
                for possibility in suggested_title:
                    fuzz_score = fuzz.partial_ratio(title.lower(),
                                                    possibility.lower())
                    if fuzz_score >= self._cfg.get('doi_fuzzy_threshold',
                                                   entry, 90):
                        return doi
        return None

    async def get_doi(self, entry):
        loop = asyncio.get_event_loop()
        done = False
        retries = 20
        backoff = 1
        problem = None
        result = None
        step = 1
        self._ui.increase_subtask('CrossrefDOI')
        try:
            while not done:
                try:
                    result = await loop.run_in_executor(
                        None, partial(self._get_doi_blocking, entry, step))
                    if result:
                        done = True
                    else:
                        # Too specific a search? Loosen the search terms.
                        if step < 3:
                            step += 1
                        else:
                            done = True
                except RateLimitException:
                    if retries == 0:
                        return (None, RetrievalProblem("Too many retries"))
                    await asyncio.sleep(backoff)
                    backoff = backoff * 2
                    retries -= 1
                    done = False
        except RetrievalProblem as e:
            problem = e
        self._ui.finish_subtask('CrossrefDOI')
        return (result, problem)

    async def query(self, entry):
        loop = asyncio.get_event_loop()
        done = False
        retries = 20
        backoff = 1
        problem = None
        result = None
        self._ui.increase_subtask('CrossrefQuery')
        try:
            while not done:
                try:
                    result = await loop.run_in_executor(
                        None, partial(self._query_blocking, entry))
                    done = True
                except RateLimitException:
                    if retries == 0:
                        return (None, RetrievalProblem("Too many retries"))
                    await asyncio.sleep(backoff)
                    backoff = backoff * 2
                    retries -= 1
                    done = False
        except RetrievalProblem as e:
            problem = e
        return (result, problem)

    def _query_blocking(self, entry):
        doi = entry.get_probable_doi()
        if not doi:
            self._ui.finish_subtask('CrossrefQuery')
            return None
        try:
            data = crossref_commons.retrieval.get_publication_as_json(doi)
        except ValueError as e:
            self._ui.finish_subtask('CrossrefQuery')
            if str(e) == f"DOI {doi} does not exist":
                # This isn't really an error, CrossRef just does not know
                # about this entry.
                pass
            else:
                LOGGER.error((f"Error retrieving data for {entry.get_id()}. "
                              f"{e}"))
            return None
        except ConnectionError as e:
            # TODO retry?
            self._ui.finish_subtask('CrossrefQuery')
            LOGGER.error(
                (f"Connection error retrieving data for {entry.get_id()}. "
                 f"{e}"))
            return None

        s = Suggestion("crossref", entry)

        # Special handling for the entry type
        btype = TYPE_MAPPING.get(data['type'])
        if not btype:
            LOGGER.warning(
                "Type {} not found in crossref source. (Entry {})".format(
                    data['type'], entry.get_id()))
        else:
            s.add_field('entrytype', btype)

        # Special handling for authors
        for author_data in data.get('author', []):
            s.add_author(author_data.get('given', "").strip(),
                         author_data.get('family', "").strip())

        # Special handling for editors
        for editor_data in data.get('editor', []):
            s.add_editor(editor_data.get('given', "").strip(),
                         editor_data.get('family', "").strip())

        # Special handling for journal / book title
        if btype in ['journal-article', 'book-chapter']:
            journal = flexistrip(data.get('container-title'))
            if journal:
                s.add_field('journal', journal)

        # Special handling for the URL. Only take it if it's not a DOI URL.
        url = flexistrip(data.get('URL'))
        if url and (CrossrefSource.DOI_URL_RE.match(url) is None):
            s.add_field('url', url)

        # All other fields
        for field_from, field_to in FIELD_MAPPING.items():
            if isinstance(field_to, dict):
                if entry.data['entrytype'] in field_to:
                    field_to = field_to[entry.data['entrytype']]
                else:
                    field_to = field_to.get('default')
                if not field_to:
                    continue
            if field_from in data:
                s.add_field(field_to, flexistrip(data[field_from]))

        self._ui.finish_subtask('CrossrefQuery')
        return s
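# The Crossref "polite pool" identification set up in __init__ above comes
# straight from the configuration. A hypothetical bibchex.json fragment
# (the top-level layout is an assumption for illustration):
#
#     {
#         "crossref_mailto": "you@example.com",
#         "crossref_plus": "<your Crossref Plus token, optional>"
#     }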
class Checker(object):
    def __init__(self, filename, out_filename):
        self._fname = filename
        self._out_filename = out_filename
        self._bibtex_data = None
        self._entries = {}
        self._suggestions = {}
        self._retrieval_errors = []
        self._diffs = []
        self._problems = []
        self._global_problems = []
        self._unifier = Unifier()
        self._ui = UI()
        self._cfg = Config()

    async def run(self):
        LOGGER.info("Parsing BibTeX")
        self._parse()
        LOGGER.info("Applying unification rules")
        self._unify()
        LOGGER.info("Retrieving missing DOIs")
        await self._find_dois()
        LOGGER.info("Retrieving metadata")
        await self._retrieve()
        LOGGER.info("Calculating differences")
        self._diff()
        LOGGER.info("Running consistency checks")
        await self._check_consistency()
        # TODO retrieval errors should be part of the HTML output
        self._filter_diffs()
        self._filter_problems()
        LOGGER.info("Writing output")
        self._output()
        LOGGER.info("Done.")

    def _filter_diffs(self):
        filtered_diffs = [diff for diff in self._diffs
                          if not self._entries[diff.entry_id]
                          .should_ignore_diff(diff.source, diff.field)]
        self._diffs = filtered_diffs

    def _filter_problems(self):
        filtered_probs = [prob for prob in self._problems
                          if not self._entries[prob.entry_id]
                          .should_ignore_problem(prob.problem_type)]
        self._problems = filtered_probs

    def _output(self):
        html_out = HTMLOutput(list(self._entries.values()), self._diffs,
                              self._problems, self._global_problems,
                              self._fname)
        html_out.write(self._out_filename)

    def _print_retrieval_errors(self):
        LOGGER.warning("############################################")
        LOGGER.warning("## Errors occurred during retrieval      ##")
        LOGGER.warning("############################################")
        for p in self._retrieval_errors:
            LOGGER.warning(" - {}".format(p))

    def _diff(self):
        for (_, entry) in self._entries.items():
            d = Differ(entry)
            for s in self._suggestions.get(entry.get_id(), []):
                self._diffs.extend(d.diff(s))

    async def _check_consistency(self):
        tasks = []
        task_info = []
        for CChecker in CCHECKERS:
            if hasattr(CChecker, 'reset'):
                await CChecker.reset()

        for CChecker in CCHECKERS:
            for entry in self._entries.values():
                ccheck = CChecker()
                if self._cfg.get("check_{}".format(CChecker.NAME),
                                 entry, True):
                    task = ccheck.check(entry)
                    task_info.append((CChecker, entry))
                    tasks.append(task)

        results = await asyncio.gather(*tasks)
        for ((CChecker, entry), problems) in zip(task_info, results):
            for (problem_type, message, details) in problems:
                self._problems.append(
                    Problem(entry.get_id(), CChecker.NAME, problem_type,
                            message, details))

        for CChecker in CCHECKERS:
            if hasattr(CChecker, 'complete'):
                global_results = await CChecker.complete(self._ui)
                for (problem_type, message, details) in global_results:
                    self._global_problems.append(
                        Problem(None, CChecker.NAME, problem_type,
                                message, details))

    async def _find_dois(self):
        cs = CrossrefSource(self._ui)
        entry_order = (entry for entry in self._entries.values()
                       if entry.get_doi() is None)
        # Filter out entries for which bibchex-nodoi is set.
        entry_order = list(
            filter(lambda e: not e.options.get('nodoi', False),
                   entry_order))

        tasks = []
        for entry in entry_order:
            task = cs.get_doi(entry)
            tasks.append(task)

        results = await asyncio.gather(*tasks)
        for (entry, (result, retrieval_error)) in zip(entry_order, results):
            if result:
                entry.add_suggested_doi(result)
            if retrieval_error:
                self._retrieval_errors.append(retrieval_error)

    async def _retrieve(self):
        entry_order = []
        tasks = []
        indices = []
        for SourceClass in SOURCES:
            source = SourceClass(self._ui)
            i = 0
            for entry in self._entries.values():
                task = source.query(entry)
                entry_order.append(entry)
                tasks.append(task)
                indices.append(i)
                i += 1

        results = await asyncio.gather(*tasks)
        for (entry_index, raw_result) in zip(indices, results):
            entry = entry_order[entry_index]
            if not isinstance(raw_result, list):
                raw_result = [raw_result]
            for (result, retrieval_error) in raw_result:
                if result:
                    # Unify all suggested data
                    self._unifier.unify_suggestion(result)
                    self._suggestions[entry.get_id()].append(result)
                if retrieval_error:
                    if isinstance(retrieval_error, list):
                        self._retrieval_errors.extend(retrieval_error)
                    else:
                        self._retrieval_errors.append(retrieval_error)

    def _unify(self):
        for entry in self._entries.values():
            assert entry.get_id() not in self._suggestions
            self._suggestions[entry.get_id()] = [
                self._unifier.unify_entry(entry)
            ]

    def _parse(self):
        with open(self._fname) as bibtex_file:
            parser = bibtexparser.bparser.BibTexParser(common_strings=True)
            # TODO how much of my own magic is still necessary here?
            # parser.customization = bibtexparser.customization.\
            #     homogenize_latex_encoding
            self._bibtex_data = parser.parse_file(bibtex_file)

        entry_list = [Entry(bentry, self._ui)
                      for bentry in self._bibtex_data.entries]
        entry_keys = set((entry.get_id() for entry in entry_list))
        if len(entry_keys) != len(entry_list):
            LOGGER.error("ERROR! Duplicate keys detected!")
            sys.exit(-1)
        self._entries = {entry.get_id(): entry for entry in entry_list}