Пример #1
0
def query_crossref_folder(folder, use_backup):
    """
    Query metadata information for unmatched pdf files in the given folder.
    This function only queries Crossref.

    A pdf is skipped when `nomenclature.parse_filename` cannot parse its name
    or when its name is in the collection returned by
    `utils.guess_manual_files`. A Crossref result is kept only when its score
    reaches `config.crossref_accept_threshold`; otherwise the file name is
    recorded as rejected.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes `.queried.bib` and `.queried.json` in the given
        folder, and `.rejected.txt` when at least one query was rejected.
    """

    # Load the existing queried database and the files that must be skipped
    bib_path = os.path.join(folder, '.queried.bib')
    db = utils.read_bib_file(bib_path)
    files = utils.guess_manual_files(folder, db, update_queried_db=False)
    utils.add_skip_files(folder, files)
    json_entries = []
    rejected = []

    # For each pdf in the folder
    for path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(path)
        parsed = nomenclature.parse_filename(pdf_name)
        if parsed is None or pdf_name in files:
            continue
        print('Q: ' + pdf_name)
        authors, title = parsed

        # Crossref
        rbib, rjson, score = providers.crossref_query(authors, title)
        if score >= config.crossref_accept_threshold:
            # Attach the pdf name to the entry and store both representations
            rbib['file'] = utils.encode_filename_field(pdf_name)
            json_entries.append(rjson)
            db.entries.append(rbib)
        else:
            rejected.append(pdf_name)

    # Store results
    utils.write_with_backup(bib_path, utils.write_bib(db, order=False),
                            use_backup)
    json_path = os.path.join(folder, '.queried.json')
    json_str = json.dumps(json_entries,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
    utils.write_with_backup(json_path, json_str, use_backup)
    # The rejected list is only written when there is something to report
    if rejected:
        rejected_path = os.path.join(folder, '.rejected.txt')
        utils.write_with_backup(rejected_path, '\n'.join(rejected), use_backup)
Пример #2
0
def query_crossref_folder(folder, use_backup):
    """
    Look up Crossref metadata for the pdf files of a folder that still need it.

    Only Crossref is queried. Results whose score reaches
    `config.crossref_accept_threshold` are appended to the queried database;
    the names of the other files are collected as rejected.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes `.queried.bib` and `.queried.json` in the given
        folder, plus `.rejected.txt` when some queries were rejected.
    """

    # Existing database and the set of files to leave alone
    queried_bib = os.path.join(folder, '.queried.bib')
    database = utils.read_bib_file(queried_bib)
    skipped = utils.guess_manual_files(folder, database,
                                       update_queried_db=False)
    utils.add_skip_files(folder, skipped)

    accepted_json, rejected_names = [], []

    # Walk over every pdf found in the folder
    for pdf_path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(pdf_path)
        name_parts = nomenclature.parse_filename(pdf_name)
        if name_parts is None or pdf_name in skipped:
            continue
        print('Q: ' + pdf_name)
        authors, title = name_parts

        # Ask Crossref and keep the answer only if it scores high enough
        bib_entry, json_entry, score = providers.crossref_query(authors, title)
        if score >= config.crossref_accept_threshold:
            bib_entry['file'] = utils.encode_filename_field(pdf_name)
            accepted_json.append(json_entry)
            database.entries.append(bib_entry)
        else:
            rejected_names.append(pdf_name)

    # Persist the bibtex database, then the raw json answers
    bib_text = utils.write_bib(database, order=False)
    utils.write_with_backup(queried_bib, bib_text, use_backup)

    queried_json = os.path.join(folder, '.queried.json')
    json_text = json.dumps(accepted_json, sort_keys=True, indent=4,
                           separators=(',', ': '))
    utils.write_with_backup(queried_json, json_text, use_backup)

    # The rejected list is written only when it is not empty
    rejected_txt = os.path.join(folder, '.rejected.txt')
    if rejected_names:
        utils.write_with_backup(rejected_txt, '\n'.join(rejected_names),
                                use_backup)
Пример #3
0
def sync_folder(folder, use_backup):
    """
    Refresh the file field of the bibtex entries stored in the given folder.

    Each entry is matched against the pdf names of the folder that have not
    been claimed yet. Entries whose best match scores below 0.90 are listed
    and, after confirmation on the `(Y/n)` prompt, removed from the bibtex.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_name in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_name)
        database = utils.read_bib_file(bib_path)
        # Names of the pdf files no entry has claimed so far
        available = {os.path.basename(p) for p in utils.get_pdf_list(folder)}
        stale_indices = []

        for index, entry in enumerate(database.entries):
            # Prefer the recorded file name over the generated one
            guess = nomenclature.gen_filename(entry)
            if 'file' in entry:
                guess = utils.decode_filename_field(entry['file'])

            match, score = utils.most_similar_filename(guess, available)
            if score >= 0.90:
                available.remove(match)
                entry['file'] = utils.encode_filename_field(match)
                continue

            # No good match: announce the removal and remember the index
            source_label = termcolor.colored(bib_name, "magenta")
            score_label = termcolor.colored(score, "yellow")
            print(source_label +
                  ": ({1}) will remove '{0}'".format(guess, score_label))
            stale_indices.append(index)

        # Delete unmatched entries, unless the user declines
        if stale_indices:
            answer = input('(Y/n) ')
            if answer in ('', 'y', 'Y'):
                # Indices were collected in ascending order; delete from the
                # end so earlier indices stay valid
                for index in reversed(stale_indices):
                    del database.entries[index]

        # Write synced database
        synced = utils.write_bib(database, order=False)
        utils.write_with_backup(bib_path, synced, use_backup)
Пример #4
0
def sync_folder(folder, use_backup):
    """
    Synchronise the file field of bibtex entries with the pdfs of a folder.

    Both `.queried.bib` and `biblio.bib` are processed. An entry keeps (or
    gains) a file field when a pdf name matches it with a score of at least
    0.90; the remaining entries are reported and deleted if the user accepts
    the `(Y/n)` prompt.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        path_to_bib = os.path.join(folder, bib_file)
        bib_db = utils.read_bib_file(path_to_bib)

        # Pool of pdf names still waiting for an entry
        pdf_pool = set()
        for pdf_path in utils.get_pdf_list(folder):
            pdf_pool.add(os.path.basename(pdf_path))

        doomed = []
        for position, record in enumerate(bib_db.entries):
            candidate = nomenclature.gen_filename(record)
            if 'file' in record:
                # A recorded file name takes precedence over the generated one
                candidate = utils.decode_filename_field(record['file'])
            best, score = utils.most_similar_filename(candidate, pdf_pool)
            if score >= 0.90:
                pdf_pool.remove(best)
                record['file'] = utils.encode_filename_field(best)
            else:
                prefix = termcolor.colored(bib_file, "magenta")
                detail = ": ({1}) will remove '{0}'".format(
                    candidate, termcolor.colored(score, "yellow"))
                print(prefix + detail)
                doomed.append(position)

        # Ask before dropping the entries without a match
        if doomed:
            reply = input('(Y/n) ')
            confirmed = reply == '' or reply in ('y', 'Y')
            if confirmed:
                # Highest index first so pending indices are not shifted
                for position in sorted(doomed, reverse=True):
                    del bib_db.entries[position]

        # Save the synchronised database
        utils.write_with_backup(
            path_to_bib, utils.write_bib(bib_db, order=False), use_backup)
Пример #5
0
def query_google_folder(folder, use_backup):
    """
    Look up Google Scholar metadata for the unmatched pdf files of a folder.

    Only Google Scholar is queried. Every pdf for which
    `providers.scholarly_query` returns an entry is appended to the queried
    database with its file field set.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried database in bibtex format to
        `.queried.bib` in the given folder.
    """

    # Existing database and the files that must not be queried
    queried_bib = os.path.join(folder, '.queried.bib')
    database = utils.read_bib_file(queried_bib)
    skipped = utils.guess_manual_files(folder, database,
                                       update_queried_db=False)
    utils.add_skip_files(folder, skipped)

    for pdf_path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(pdf_path)
        name_parts = nomenclature.parse_filename(pdf_name)
        if name_parts is None or pdf_name in skipped:
            continue
        print('Q: ' + pdf_name)
        authors, title = name_parts

        # Google Scholar: store the entry only when the query found one
        entry = providers.scholarly_query(authors, title)
        if entry is not None:
            entry['file'] = utils.encode_filename_field(pdf_name)
            database.entries.append(entry)

    # Store results
    bib_text = utils.write_bib(database, order=False)
    utils.write_with_backup(queried_bib, bib_text, use_backup)
Пример #6
0
def query_google_folder(folder, use_backup):
    """
    Fetch metadata from Google Scholar for pdf files lacking a bibtex entry.

    This function only queries Google Scholar. Files whose name cannot be
    parsed, or that belong to the skip collection, are ignored.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but rewrites `.queried.bib` in the given folder with the
        entries found.
    """

    bib_path = os.path.join(folder, '.queried.bib')

    # Load what was queried before and work out which files to skip
    bib_db = utils.read_bib_file(bib_path)
    ignored = utils.guess_manual_files(folder, bib_db, update_queried_db=False)
    utils.add_skip_files(folder, ignored)

    for full_path in utils.get_pdf_list(folder):
        base_name = os.path.basename(full_path)
        parsed = nomenclature.parse_filename(base_name)
        if parsed is None or base_name in ignored:
            continue
        print('Q: ' + base_name)

        # Google Scholar
        authors, title = parsed
        found = providers.scholarly_query(authors, title)
        if found is None:
            continue

        # Remember which pdf the entry belongs to, then keep it
        found['file'] = utils.encode_filename_field(base_name)
        bib_db.entries.append(found)

    # Write the updated database back
    utils.write_with_backup(
        bib_path, utils.write_bib(bib_db, order=False), use_backup)