Example #1
    def __init__(self, ui):
        self._ui = ui
        self._cfg = Config()

        # Check if we have crossref credentials and set them via environment
        # variable. The environment variables are read by crossref_commons
        if self._cfg.get('crossref_plus'):
            LOGGER.info("Setting Crossref Plus token")
            os.environ['CR_API_PLUS'] = self._cfg.get('crossref_plus')
        if (self._cfg.get('crossref_mailto')
                and len(self._cfg.get('crossref_mailto')) > 0):
            # TODO make version dynamic
            os.environ['CR_API_AGENT'] = \
                ('BibChex/0.1 '
                 '(https://github.com/tinloaf/bibchex; mailto:{})').format(
                     self._cfg.get('crossref_mailto'))
            os.environ['CR_API_MAILTO'] = self._cfg.get('crossref_mailto')
        else:
            LOGGER.warning(
                ("\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 " Please set crossref_mailto in your config! \n"
                 " Not setting crossref_mailto may cause all your CrossRef"
                 " requests to fail."
                 "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"))
            os.environ['CR_API_AGENT'] = \
                'BibChex/0.1 (https://github.com/tinloaf/bibchex)'
Example #2
    def __init__(self):
        self._cfg = Config()
        self._name = type(self).NAME
        self._cls = type(self)

        if self._name not in GenericFuzzySimilarityChecker.SEEN_NAMES:
            GenericFuzzySimilarityChecker.SEEN_NAMES[self._name] = set()
Example #3
class InitialDottedChecker(object):
    NAME = 'author_initial_dotted'

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        authors = await self.check_one("authors", "Author", entry)
        editors = await self.check_one("editors", "Editor", entry)
        return authors + editors

    async def check_one(self, field, name, entry):
        should_dot = self._cfg.get('author_initial_want_dotted', entry, True)
        problems = []
        for author in getattr(entry, field):
            (first, last) = author

            words = first.split(" ") + last.split(" ")
            for word in words:
                if len(word) == 0:
                    continue
                if not any(c.islower() for c in word):
                    if should_dot and word[-1] != '.':
                        problems.append(
                            (type(self).NAME,
                             "{} {} {} seems to have an undotted initial."
                             .format(name, first, last), ""))

                    if not should_dot and word.find('.') != -1:
                        problems.append(
                            (type(self).NAME,
                             "{} {} {} seems to have a dotted initial."
                             .format(name, first, last), ""))

        return problems
Example #4
    def __init__(self):
        self._cfg = Config()
        self._name = type(self).NAME
        self._cls = type(self)

        if self._name not in GenericAbbrevChecker.SEEN_NAMES:
            GenericAbbrevChecker.SEEN_NAMES[self._name] = set()
Example #5
class RequiredFieldsChecker(object):
    NAME = "required_fields"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        problems = []

        required_fields = self._cfg.get('required', entry)
        for field_raw in required_fields:
            field = field_raw.lower()

            if field == 'author':
                # Special handling
                if len(entry.authors) == 0:
                    problems.append((type(self).NAME,
                                     "Required field 'author' missing", ""))
            elif field == 'editor':
                # Special handling
                if len(entry.editors) == 0:
                    problems.append((type(self).NAME,
                                     "Required field 'editor' missing", ""))
            else:
                if field not in entry.data:
                    problems.append(
                        (type(self).NAME,
                         "Required field '{}' missing".format(field), ""))

        return problems
Example #6
class ForbiddenFieldsChecker(object):
    NAME = "forbidden_fields"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        problems = []

        forbidden_fields = self._cfg.get('forbidden', entry, [])
        for field_raw in forbidden_fields:
            field = field_raw.lower()

            if field == 'author':
                # Special handling
                if len(entry.authors) > 0:
                    problems.append((type(self).NAME,
                                     "Forbidden field 'author' present", ""))
            elif field == 'editor':
                # Special handling
                if len(entry.editors) > 0:
                    problems.append((type(self).NAME,
                                     "Forbidden field 'editor' present", ""))
            else:
                if field in entry.data:
                    problems.append(
                        (type(self).NAME,
                         "Forbidden field '{}' present".format(field), ""))

        return problems
Example #7
    def __init__(self, filename, out_filename):
        self._fname = filename
        self._out_filename = out_filename

        self._bibtex_data = None
        self._entries = {}
        self._suggestions = {}

        self._retrieval_errors = []
        self._diffs = []
        self._problems = []
        self._global_problems = []

        self._unifier = Unifier()

        self._ui = UI()
        self._cfg = Config()
Example #8
 def __init__(self, ui):
     self._ui = ui
     self._cfg = Config()
     # dx.doi.org (sometimes) has very harsh rate limits. This seems to be
     # some cloudflare magic
     self._ratelimit = AsyncRateLimiter(50, 10)
     self._doi_ratelimit = AsyncRateLimiter(20, 10)
     self._max_retries = 5
     self._retry_pause = 10  # Wait an additional 10 seconds before a retry
Example #9
    async def complete(cls, ui):
        cfg = Config()

        def compute(seen_names, chunk_count, chunk_number):
            problems = []
            # nn1/nn2 are the normalized forms of the names
            for ((n1, nn1), (n2, nn2)) in chunked_pairs(
                    list(seen_names), chunk_count, chunk_number):
                if (nn1 == nn2):
                    continue

                if fuzz.partial_ratio(nn1, nn2) > 90:  # TODO make configurable
                    problems.append((name,
                                     "{} names '{}' and '{}' seem very similar."
                                     .format(cls.MSG_NAME, n1, n2),
                                     ""))
            return problems

        name = cls.NAME
        item_count = len(GenericFuzzySimilarityChecker.SEEN_NAMES[name])
        LOGGER.info((f"Fuzzy-checking pairwise similarity "
                     f"of {cls.MSG_NAME}s. Testing "
                     f"{item_count*(item_count - 1) / 2 - item_count} pairs. "
                     "This might take a while."))

        collected_problems = []
        chunk_count = min(len(os.sched_getaffinity(0)) * 10,
                          len(GenericFuzzySimilarityChecker.SEEN_NAMES[name]))
        tasks = []
        for i in range(0, chunk_count):
            tasks.append(
                asyncio.get_event_loop().run_in_executor(
                    cfg.get_executor(),
                    compute, GenericFuzzySimilarityChecker.SEEN_NAMES[name],
                    chunk_count, i))

        collected_results = await asyncio.gather(*tasks)

        # Flatten lists
        return [item for sublist in collected_results for item in sublist]
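Note: the compute() closure above relies on a chunked_pairs helper that is not shown in this excerpt. A minimal sketch of what such a helper might look like, purely as an assumption inferred from the call site: distribute all unordered pairs round-robin over chunk_count workers, so each executor task scans a disjoint share of the comparisons.

from itertools import combinations

def chunked_pairs(items, chunk_count, chunk_number):
    # Hypothetical sketch: yield every pair whose index falls into this chunk.
    # Round-robin assignment covers each unordered pair exactly once across
    # all chunk_number values in range(chunk_count).
    for index, pair in enumerate(combinations(items, 2)):
        if index % chunk_count == chunk_number:
            yield pair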
Example #10
def main(passed_args=None):
    if passed_args is None:
        passed_args = sys.argv[1:]

    args = parser.parse_args(passed_args)

    if args.ui_gui:
        UI.select_gui()
    elif args.ui_cli:
        UI.select_cli()
    elif args.ui_silent:
        UI.select_silent()

    if args.config:
        Config(args.config)
    else:
        home = os.path.expanduser("~")
        user_cfg = os.path.join(home, '.config', 'bibchex.json')
        if os.path.isfile(user_cfg):
            Config(user_cfg)
        else:
            Config()

    ui = UI()

    loop = asyncio.get_event_loop()
    loop.set_debug(True)
    loop.set_default_executor(concurrent.futures.ThreadPoolExecutor(20))

    try:
        c = Checker(args.input_file[0], args.output_file[0])
        loop.run_until_complete(c.run())
    except Exception as e:
        exc_str = traceback.format_exc()
        ui.error("Exception", str(e))
        ui.error("Traceback", exc_str)

    ui.wait()
Example #11
class PreferDateChecker(object):
    NAME = 'prefer_date'

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        # Complain when 'date' is missing but 'month'/'day' are set, or when
        # 'year'/'month'/'day' are set and 'prefer_date_or_year' is disabled.
        if ((entry.data.get('date') is None) and
            ((any((entry.data.get(key) for key in ('year', 'month', 'day')))
              and not self._cfg.get('prefer_date_or_year', entry, True)) or
             (any((entry.data.get(key) for key in ('month', 'day')))))):
            return [(type(self).NAME, ("The 'date' field is preferred over "
                                       "the 'day/month/year' fields."), "")]

        return []
Example #12
class JournalAbbrevChecker(object):
    NAME = 'journal_abbrev'
    FIELDS = ['booktitle', 'journal']

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        problems = []
        acceptable = set(self._cfg.get("acceptable_abbreviations", entry, []))
        for field in JournalAbbrevChecker.FIELDS:
            val = entry.data.get(field, '')
            if contains_abbreviation(val, acceptable=acceptable):
                problems.append(
                    (type(self).NAME,
                     "Publication title '{}' seems to contain an abbreviation".
                     format(val), ""))

        return problems
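Note: contains_abbreviation is imported from elsewhere in bibchex and is not shown here. A rough sketch of the idea, as an assumption rather than the project's actual heuristic: flag any token that ends in a period and is not whitelisted.

def contains_abbreviation(value, acceptable=frozenset()):
    # Hypothetical sketch: 'Proc.' or 'Trans.' count as abbreviations unless
    # they appear in the 'acceptable' set.
    return any(token.endswith('.') and token not in acceptable
               for token in value.split())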
Example #13
class ISBNFormatChecker(object):
    NAME = "isbn_format"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        fmt = self._cfg.get('isbn_format', entry)
        if not fmt:
            return []

        isbn = entry.data.get('isbn')
        if not isbn:
            return []

        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if fmt not in ('canonical', 'masked'):
            raise ConfigurationError(
                "The option 'isbn_format' must be either "
                "'canonical' or 'masked'.")

        if fmt == 'canonical':
            cisbn = canonical(clean_isbn)
            if cisbn != isbn:
                return [(type(self).NAME,
                         "ISBN '{}' is not in canonical format.".format(isbn),
                         "Canonical format would be '{}'".format(cisbn))]
        elif fmt == 'masked':
            misbn = mask(clean_isbn)
            if misbn != isbn:
                return [(type(self).NAME,
                         "ISBN '{}' is not in masked format.".format(isbn),
                         "Masked format would be '{}'".format(misbn))]

        return []
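Note: clean, canonical, mask and notisbn appear to be the functions of the same name from the isbnlib package (the imports are not shown in this excerpt, so treat that as an assumption). A quick sketch of the two formats the checker distinguishes:

from isbnlib import canonical, clean, mask, notisbn

raw = "978-3-16-148410-0"     # well-known example ISBN-13
cleaned = clean(raw)          # '9783161484100'
assert not notisbn(cleaned)   # it is a valid ISBN
print(canonical(cleaned))     # canonical format: '9783161484100'
print(mask(cleaned))          # masked format:    '978-3-16-148410-0'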
Example #14
class ISBNLengthChecker(object):
    NAME = "isbn_length"

    def __init__(self):
        self._cfg = Config()

    async def check(self, entry):
        length = self._cfg.get('isbn_length', entry, 13)
        if not length:
            return []

        isbn = entry.data.get('isbn')
        if not isbn:
            return []

        clean_isbn = clean(isbn)
        if not clean_isbn or notisbn(clean_isbn):
            return []

        if length not in (10, 13):
            raise ConfigurationError(
                "The option 'isbn_length' must be either of 10 or 13.")

        if length == 10:
            if not is_isbn10(clean_isbn):
                return [(type(self).NAME,
                         "ISBN '{}' is not of length 10.".format(isbn),
                         "ISBN-10 would be '{}'".format(to_isbn10(clean_isbn)))
                        ]
        elif length == 13:
            if not is_isbn13(clean_isbn):
                return [(type(self).NAME,
                         "ISBN '{}' is not of length 13.".format(isbn),
                         "ISBN-13 would be '{}'".format(to_isbn13(clean_isbn)))
                        ]

        return []
Example #15
class GenericStringFormatChecker(object):
    PROVIDE_FIELDS = ['year', 'date']

    def __init__(self):
        self._cfg = Config()
        self._name = type(self).NAME
        self._field = type(self).FIELD
        self._cls = type(self)

    async def check(self, entry):
        problems = []
        formats = self._cfg.get(self._cls.FORMAT_FIELD, entry)
        field_data = entry.data.get(self._field)

        if formats is None or field_data is None:
            return []

        entry_data = {field: str(entry.data.get(field, ''))
                      for field in
                      GenericStringFormatChecker.PROVIDE_FIELDS}

        # special 'short year'
        entry_data['short_year'] = entry_data['year'][-2:]

        if not isinstance(formats, list):
            formats = [formats]

        for form in formats:
            re_str = form.format(**entry_data)
            m = re.match(re_str, field_data)
            if m:
                # Everything A-Okay
                return []

        return [(self._name,
                 f"Format for field {self._field} incorrect.", "")]
Example #16
File: differ.py Project: tinloaf/bibchex
 def __init__(self, entry):
     self._entry = entry
     self._cfg = Config()
Example #17
File: differ.py Project: tinloaf/bibchex
class Differ(object):
    FIELD_PROPERTIES = {
        'doi': {
            'case': False
        },
        'title': {
            'first_letter_case': False
        },
        'isbn': {
            'diff_func': isbn_differ
        },
        'issn': {
            'list': True,
            'ignore_order': True
        }
    }

    def __init__(self, entry):
        self._entry = entry
        self._cfg = Config()

    def diff(self, suggestion):
        """Compute and return a list of differences between the
        entity of this Differ and the list of suggestions passed."""
        return self._diff_general(suggestion) + \
            self._diff_people('authors', suggestion) + \
            self._diff_people('editors', suggestion)

    def _diff_people(self, field, suggestion):
        diffs = []

        assert field in ('authors', 'editors')
        if field == 'authors':
            singular = 'Author'
        else:
            singular = 'Editor'

        sugg_field = getattr(suggestion, field)
        entry_field = getattr(self._entry, field)

        if len(sugg_field) == 0:
            return []

        for i in range(0, max(len(sugg_field), len(entry_field))):
            if i >= len(sugg_field):
                diffs.append(
                    Difference(
                        self._entry.get_id(), suggestion.source,
                        '{} {}'.format(singular, i + 1),
                        'Person not present in retrieved '
                        '{}: {} {}'.format(field, *entry_field[i])))
                continue

            if i >= len(entry_field):
                diffs.append(
                    Difference(
                        self._entry.get_id(), suggestion.source,
                        '{} {}'.format(singular, i + 1),
                        'Additional person in retrieved '
                        '{}: {} {}'.format(field, *sugg_field[i])))
                continue

            entry_first, entry_last = entry_field[i]
            sugg_first, sugg_last = sugg_field[i]

            raw_sugg_first = sugg_first
            raw_sugg_last = sugg_last

            if self._cfg.get('authors_ignore_allcaps', self._entry, True):
                if is_allcaps("{} {}".format(sugg_first, sugg_last)):
                    sugg_first = sugg_first.lower()
                    sugg_last = sugg_last.lower()

                    entry_first = entry_first.lower()
                    entry_last = entry_last.lower()

            difference = False
            if crush_spaces(sugg_last) != crush_spaces(entry_last):
                difference = True

            # Check first names individually
            entry_first_words = entry_first.split(" ")
            sugg_first_words = sugg_first.split(" ")
            if len(entry_first_words) != len(sugg_first_words):
                difference = True
            else:
                for (word_e, word_s) in zip(entry_first_words,
                                            sugg_first_words):
                    if not is_initial(word_e) and not is_initial(word_s):
                        difference |= (word_e != word_s)
                    else:
                        difference |= (word_s[0] != word_e[0])

            if difference:
                diffs.append(
                    Difference(
                        self._entry.get_id(), suggestion.source,
                        '{} {}'.format(singular, i + 1),
                        'Suggested {} name: {} {}'.format(
                            singular.lower(), raw_sugg_first, raw_sugg_last)))
        return diffs

    def _diff_general(self, suggestion):
        diffs = []

        # Find fields where we have data in the entry, which is different from
        # the data in the suggestion
        for field in self._entry.data.keys():
            if field in suggestion.data:
                suggestion_data = suggestion.data[field]

                entry_data = unlatexify(self._entry.data[field])

                # Cast everything to string
                suggestion_data = [(str(d), kind)
                                   for (d, kind) in suggestion_data]

                # Unify hyphens
                entry_data = unify_hyphens(entry_data)
                suggestion_data = [(unify_hyphens(d), kind)
                                   for (d, kind) in suggestion_data]

                # Crush spaces
                entry_data = crush_spaces(entry_data)
                suggestion_data = [(crush_spaces(d), kind)
                                   for (d, kind) in suggestion_data]

                field_props = Differ.FIELD_PROPERTIES.get(field, {})

                if not field_props.get('case', True):
                    entry_data = entry_data.lower()
                    suggestion_data = [(d.lower(), kind)
                                       for (d, kind) in suggestion_data]

                if not field_props.get('first_letter_case', True):
                    entry_data = lower_case_first_letters(entry_data)
                    suggestion_data = [(lower_case_first_letters(d), kind)
                                       for (d, kind) in suggestion_data]

                if 'diff_func' in field_props:
                    # Diff func must handle plain / re on itself!
                    if field_props['diff_func'](entry_data, suggestion_data):
                        diffs.append(
                            Difference(
                                self._entry.get_id(), suggestion.source, field,
                                [d for (d, kind) in suggestion.data[field]]))
                elif not field_props.get('list', False):
                    hit = False
                    for (d, kind) in suggestion_data:
                        if kind == Suggestion.KIND_RE:
                            if re.match(d, entry_data):
                                hit = True
                        else:
                            # Plain
                            hit |= (entry_data == d)

                    if not hit:
                        diffs.append(
                            Difference(
                                self._entry.get_id(), suggestion.source, field,
                                [d for (d, kind) in suggestion.data[field]]))
                else:
                    # List comparison
                    entry_list = [e.strip() for e in entry_data.split(',')]
                    hit = False

                    for (d, kind) in suggestion_data:
                        if kind == Suggestion.KIND_RE:
                            LOGGER.error(
                                "List-of-regex is not supported as suggestion."
                            )
                            continue

                        sugg_list = [e.strip() for e in d.split(',')]

                        if field_props.get('ignore_order', False):
                            hit |= (set(entry_list) == set(sugg_list))
                        else:
                            hit |= entry_list == sugg_list

                    if not hit:
                        diffs.append(
                            Difference(
                                self._entry.get_id(), suggestion.source, field,
                                [d for (d, kind) in suggestion.data[field]]))

        # Find fields in the 'wanted' option for which the suggestion has data,
        # but the entry has not.
        wanted = set(self._cfg.get('wanted', self._entry, []))
        forbidden = set(self._cfg.get('forbidden', self._entry, []))
        wanted = wanted - forbidden
        for field in wanted:
            if field not in self._entry.data and field in suggestion.data:
                diffs.append(
                    Difference(self._entry.get_id(), suggestion.source, field,
                               [d for (d, kind) in suggestion.data[field]]))

        return diffs
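Note: the first-name comparison in _diff_people leans on an is_initial helper that is not part of this excerpt. A plausible sketch, stated as an assumption inferred from its usage: a word counts as an initial if it is a single letter, optionally followed by a period.

def is_initial(word):
    # Hypothetical sketch: 'J' and 'J.' are initials, 'John' is not.
    word = word.strip()
    return (len(word) in (1, 2) and word[0].isalpha()
            and (len(word) == 1 or word[1] == '.'))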
Example #18
class CrossrefSource(object):
    QUERY_FIELDS = ['doi']
    DOI_URL_RE = re.compile(r'https?://(dx\.)?doi\.org/.*')

    def __init__(self, ui):
        self._ui = ui
        self._cfg = Config()

        # Check if we have crossref credentials and set them via environment
        # variable. The environment variables are read by crossref_commons
        if self._cfg.get('crossref_plus'):
            LOGGER.info("Setting Crossref Plus token")
            os.environ['CR_API_PLUS'] = self._cfg.get('crossref_plus')
        if (self._cfg.get('crossref_mailto')
                and len(self._cfg.get('crossref_mailto')) > 0):
            # TODO make version dynamic
            os.environ['CR_API_AGENT'] = \
                ('BibChex/0.1 '
                 '(https://github.com/tinloaf/bibchex; mailto:{})').format(
                     self._cfg.get('crossref_mailto'))
            os.environ['CR_API_MAILTO'] = self._cfg.get('crossref_mailto')
        else:
            LOGGER.warning(
                ("\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 " Please set crossref_mailto in your config! \n"
                 " Not setting crossref_mailto may cause all your CrossRef"
                 " requests to fail."
                 "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                 "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"))
            os.environ['CR_API_AGENT'] = \
                'BibChex/0.1 (https://github.com/tinloaf/bibchex)'

    def _get_doi_blocking(self, entry, step):
        """
        Steps:
           1: query by title + authors (first and last names)
           2: query by title + authors (only last names)
           3: query by title
        """
        title = entry.data.get('title')
        if title is None:
            # Without a title, we're chanceless.
            return None

        q = [('bibliographic', title)]

        if step in (1, 2):
            for (first, last) in entry.authors:
                if step == 2:
                    q.append(('author', "{}".format(last)))
                else:
                    q.append(('author', "{} {}".format(first, last)))

        try:
            (count, results) = crossref_commons.search.search_publication(
                q, sort="relevance", order="desc")
        except Exception as e:
            LOGGER.error((f"Error reverse-searching for {entry.get_id()}: "
                          f"{e}"))
            return None

        if count > 0 and results:
            for i in range(0, min(10, count)):
                if 'title' not in results[i] or 'DOI' not in results[i]:
                    # Bogus data
                    continue
                suggested_title = results[i]['title']
                doi = results[i]['DOI']

                if not isinstance(suggested_title, list):
                    suggested_title = [suggested_title]
                for possibility in suggested_title:
                    fuzz_score = fuzz.partial_ratio(title.lower(),
                                                    possibility.lower())
                    if fuzz_score >= self._cfg.get('doi_fuzzy_threshold',
                                                   entry, 90):
                        return doi

        return None

    async def get_doi(self, entry):
        loop = asyncio.get_event_loop()
        done = False
        retries = 20
        backoff = 1
        problem = None
        result = None
        step = 1
        self._ui.increase_subtask('CrossrefDOI')
        try:
            while not done:
                try:
                    result = await loop.run_in_executor(
                        None, partial(self._get_doi_blocking, entry, step))
                    if result:
                        done = True
                    else:
                        # Too specific search? Loosen search terms
                        if step < 3:
                            step += 1
                        else:
                            done = True
                except RateLimitException:
                    if retries == 0:
                        return (None, RetrievalProblem("Too many retries"))
                    await asyncio.sleep(backoff)
                    backoff = backoff * 2
                    retries -= 1
                    done = False

        except RetrievalProblem as e:
            problem = e

        self._ui.finish_subtask('CrossrefDOI')

        return (result, problem)

    async def query(self, entry):
        loop = asyncio.get_event_loop()
        done = False
        retries = 20
        backoff = 1
        problem = None
        result = None
        self._ui.increase_subtask('CrossrefQuery')
        try:
            while not done:
                try:
                    result = await loop.run_in_executor(
                        None, partial(self._query_blocking, entry))
                    done = True
                except RateLimitException:
                    if retries == 0:
                        return (None, RetrievalProblem("Too many retries"))
                    await asyncio.sleep(backoff)
                    backoff = backoff * 2
                    retries -= 1
                    done = False

        except RetrievalProblem as e:
            problem = e

        return (result, problem)

    def _query_blocking(self, entry):
        doi = entry.get_probable_doi()
        if not doi:
            self._ui.finish_subtask('CrossrefQuery')
            return None

        try:
            data = crossref_commons.retrieval.get_publication_as_json(doi)
        except ValueError as e:
            self._ui.finish_subtask('CrossrefQuery')
            if str(e) == f"DOI {doi} does not exist":
                # This isn't really an error, CrossRef just does not know
                # about them
                pass
            else:
                LOGGER.error((f"Error retrieving data for {entry.get_id()}. "
                              f"{e}"))
            return None
        except ConnectionError as e:
            # TODO retry?
            self._ui.finish_subtask('CrossrefQuery')
            LOGGER.error(
                (f"Connection error retrieving data for {entry.get_id()}. "
                 f"{e}"))
            return None

        s = Suggestion("crossref", entry)

        # Special handling for type
        btype = TYPE_MAPPING.get(data['type'])
        if not btype:
            LOGGER.warn(
                "Type {} not found in crossref source. (Entry {})".format(
                    data['type'], entry.get_id()))
        else:
            s.add_field('entrytype', btype)

        # Special handling for authors
        for author_data in data.get('author', []):
            s.add_author(
                author_data.get('given', "").strip(),
                author_data.get('family', "").strip())

        # Special handling for editors
        for editor_data in data.get('editor', []):
            s.add_editor(
                editor_data.get('given', "").strip(),
                editor_data.get('family', "").strip())

        # Special handling for journal / book title
        if btype in ['journal-article', 'book-chapter']:
            journal = flexistrip(data.get('container-title'))
            if journal:
                s.add_field('journal', journal)

        # Special handling for URL. Only take it if it's not a DOI-Url
        url = flexistrip(data.get('URL'))
        if url and (CrossrefSource.DOI_URL_RE.match(url) is None):
            s.add_field('url', url)

        # All other fields
        for field_from, field_to in FIELD_MAPPING.items():
            if isinstance(field_to, dict):
                if entry.data['entrytype'] in field_to:
                    field_to = field_to[entry.data['entrytype']]
                else:
                    field_to = field_to.get('default')

            if not field_to:
                continue

            if field_from in data:
                s.add_field(field_to, flexistrip(data[field_from]))

        self._ui.finish_subtask('CrossrefQuery')
        return s
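Note: flexistrip is another helper not shown here. CrossRef returns some fields (for example 'container-title') either as a plain string or as a list of strings, so a plausible sketch of the idea, stated as an assumption, is:

def flexistrip(value):
    # Hypothetical sketch: strip() a string, or the first element of a list;
    # returns None for None or an empty list, matching the 'if url' /
    # 'if journal' guards above.
    if isinstance(value, list):
        value = value[0] if value else None
    if isinstance(value, str):
        return value.strip()
    return value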
Example #19
class Checker(object):
    def __init__(self, filename, out_filename):
        self._fname = filename
        self._out_filename = out_filename

        self._bibtex_data = None
        self._entries = {}
        self._suggestions = {}

        self._retrieval_errors = []
        self._diffs = []
        self._problems = []
        self._global_problems = []

        self._unifier = Unifier()

        self._ui = UI()
        self._cfg = Config()

    async def run(self):
        LOGGER.info("Parsing BibTeX")
        self._parse()
        LOGGER.info("Applying unification rules")
        self._unify()
        LOGGER.info("Retrieving missing DOIs")
        await self._find_dois()
        LOGGER.info("Retrieving metadata")
        await self._retrieve()
        LOGGER.info("Calculating differences")
        self._diff()
        LOGGER.info("Running consistency checks")
        await self._check_consistency()
        # TODO Retrieval Errors should be part of the HTML output

        self._filter_diffs()
        self._filter_problems()

        LOGGER.info("Writing output")
        self._output()
        LOGGER.info("Done.")

    def _filter_diffs(self):
        filtered_diffs = [diff for diff in self._diffs
                          if not self._entries[diff.entry_id]
                          .should_ignore_diff(diff.source, diff.field)]

        self._diffs = filtered_diffs

    def _filter_problems(self):
        filtered_probs = [prob for prob in self._problems
                          if not self._entries[prob.entry_id]
                          .should_ignore_problem(prob.problem_type)]

        self._problems = filtered_probs

    def _output(self):
        html_out = HTMLOutput(list(self._entries.values()), self._diffs,
                              self._problems, self._global_problems,
                              self._fname)
        html_out.write(self._out_filename)

    def _print_retrieval_errors(self):
        LOGGER.warn("############################################")
        LOGGER.warn("##    Errors occurred during retrieval    ##")
        LOGGER.warn("############################################")

        for p in self._retrieval_errors:
            LOGGER.warn("main", " - {}".format(p))

    def _diff(self):
        for (_, entry) in self._entries.items():
            d = Differ(entry)
            for s in self._suggestions.get(entry.get_id(), []):
                self._diffs.extend(d.diff(s))

    async def _check_consistency(self):
        tasks = []
        task_info = []
        for CChecker in CCHECKERS:
            if hasattr(CChecker, 'reset'):
                await CChecker.reset()

        for CChecker in CCHECKERS:
            for entry in self._entries.values():
                ccheck = CChecker()
                if self._cfg.get("check_{}".format(CChecker.NAME), entry, True):
                    task = ccheck.check(entry)
                    task_info.append((CChecker, entry))
                    tasks.append(task)

        results = await asyncio.gather(*tasks)
        for ((CChecker, entry), problems) in zip(task_info, results):
            for (problem_type, message, details) in problems:
                self._problems.append(
                    Problem(entry.get_id(), CChecker.NAME, problem_type,
                            message, details))

        for CChecker in CCHECKERS:
            if hasattr(CChecker, 'complete'):
                global_results = await CChecker.complete(self._ui)
                for (problem_type, message, details) in global_results:
                    self._global_problems.append(
                        Problem(None, CChecker.NAME, problem_type,
                                message, details))

    async def _find_dois(self):
        cs = CrossrefSource(self._ui)

        entry_order = (entry for entry in self._entries.values()
                       if entry.get_doi() is None)

        # Filter out entries for which bibchex-nodoi is set.
        entry_order = list(
            filter(lambda e: not e.options.get('nodoi', False), entry_order))

        tasks = []
        for entry in entry_order:
            task = cs.get_doi(entry)
            tasks.append(task)

        results = await asyncio.gather(*tasks)
        for (entry, (result, retrieval_error)) in zip(entry_order, results):
            if result:
                entry.add_suggested_doi(result)
            if retrieval_error:
                self._retrieval_errors.append(retrieval_error)

    async def _retrieve(self):
        entry_order = []
        tasks = []
        indices = []

        for SourceClass in SOURCES:
            #        for SourceClass in [ DataCiteSource ]:
            source = SourceClass(self._ui)

            i = 0
            for entry in self._entries.values():
                task = source.query(entry)
                entry_order.append(entry)
                tasks.append(task)
                indices.append(i)
                i += 1

        results = await asyncio.gather(*tasks)
        for (entry_index, raw_result) in zip(indices, results):
            entry = entry_order[entry_index]

            if not isinstance(raw_result, list):
                raw_result = [raw_result]

            for (result, retrieval_error) in raw_result:
                if result:
                    # Unify all suggested data
                    self._unifier.unify_suggestion(result)
                    self._suggestions[entry.get_id()].append(result)
                if retrieval_error:
                    if isinstance(retrieval_error, list):
                        self._retrieval_errors.extend(retrieval_error)
                    else:
                        self._retrieval_errors.append(retrieval_error)

    def _unify(self):
        for entry in self._entries.values():
            assert entry.get_id() not in self._suggestions
            self._suggestions[entry.get_id()] = [
                self._unifier.unify_entry(entry)
            ]
                        
    def _parse(self):
        with open(self._fname) as bibtex_file:
            parser = bibtexparser.bparser.BibTexParser(
                common_strings=True)
            # TODO how much of my own magic is still necessary here?
#            parser.customization = bibtexparser.customization.\
#                homogenize_latex_encoding
            self._bibtex_data = parser.parse_file(bibtex_file)
        entry_list = [Entry(bentry, self._ui)
                      for bentry in self._bibtex_data.entries]
        entry_keys = set((entry.get_id() for entry in entry_list))
        if len(entry_keys) != len(entry_list):
            LOGGER.error("ERROR! Duplicate keys detected!")
            sys.exit(-1)

        self._entries = {entry.get_id(): entry for entry in entry_list}
Example #20
File: title.py Project: tinloaf/bibchex
 def __init__(self):
     self._cfg = Config()
Example #21
 def __init__(self):
     self._cfg = Config()
     self._name = type(self).NAME
     self._field = type(self).FIELD
     self._cls = type(self)