Example #1
def bibclassify_exhaustive_call(text_files,
                                taxonomy,
                                rebuild_cache=False,
                                no_cache=False,
                                output_mode='text',
                                output_limit=20,
                                spires=False,
                                match_mode='full',
                                with_author_keywords=False,
                                extract_acronyms=False,
                                only_core_tags=False):
    """Call to bibclassify on a file."""
    output_mode = output_mode.split(",")

    return engine.get_keywords_from_local_file(
        local_file=text_files,
        taxonomy_name=taxonomy,
        rebuild_cache=rebuild_cache,
        no_cache=no_cache,
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        with_author_keywords=with_author_keywords,
        extract_acronyms=extract_acronyms,
        only_core_tags=only_core_tags,
        api=True)
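A minimal usage sketch for the wrapper above; the file path, the 'HEP' taxonomy name, and the combined output modes are illustrative assumptions, not taken from the original project.

# Hypothetical call; the path and taxonomy name are illustrative only.
keywords = bibclassify_exhaustive_call(
    '/tmp/paper.pdf',            # fulltext file to classify
    'HEP',                       # taxonomy to match keywords against
    output_mode='text,marcxml',  # the wrapper splits this on ',' into a list
    output_limit=10,
    with_author_keywords=True)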
Example #2
File: api.py  Project: mhellmic/b2share
def bibclassify_exhaustive_call(text_files, taxonomy, rebuild_cache=False, no_cache=False, output_mode='text',
                                output_limit=20, spires=False, match_mode='full', with_author_keywords=False,
                                extract_acronyms=False, only_core_tags=False):
    """Call to bibclassify on a file."""
    output_mode = output_mode.split(",")
    return engine.get_keywords_from_local_file(local_file=text_files,
                                               taxonomy_name=taxonomy,
                                               rebuild_cache=rebuild_cache,
                                               no_cache=no_cache,
                                               output_mode=output_mode,
                                               output_limit=output_limit,
                                               spires=spires,
                                               match_mode=match_mode,
                                               with_author_keywords=with_author_keywords,
                                               extract_acronyms=extract_acronyms,
                                               only_core_tags=only_core_tags,
                                               api=True)
Example #3
File: daemon.py  Project: dset0x/invenio
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." %
            collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        # TODO: why doesn't this call list_all_files()?
        bibdocfiles = BibRecDocs(record).list_latest_files()
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' %
                    record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    engine.get_keywords_from_local_file(fulltext,
                                                        taxonomy_name,
                                                        with_author_keywords=True,
                                                        output_mode="raw",
                                                        output_limit=output_limit,
                                                        match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype),
                    stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = engine.clean_before_output(single_keywords)
                cleaned_composite = engine.clean_before_output(
                    composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # These stay None for non-PDF documents; guard the merge so
            # dict.update() is never called with None.
            acro.update(acronyms or {})
            akws.update(author_keywords or {})

        if keywords:
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(engine._output_marc(keywords.items(), (), akws, acro,
                                              spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress(
            'Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
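A minimal driver sketch for _analyze_documents above; the record ids, collection name, and output path are illustrative assumptions, and the module-level _INDEX and _RECIDS_NUMBER counters are assumed to be initialized elsewhere in daemon.py.

# Hypothetical driver; record ids, collection name, and output path are
# illustrative only.
marcxml = _analyze_documents([101, 102], 'HEP', 'Articles', output_limit=5)
if marcxml:
    with open('/tmp/bibclassify_output.xml', 'w') as out:
        out.write(marcxml)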
Example #4
def _analyze_documents(
        records,
        taxonomy_name,
        collection,
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." % collection,
            stream=sys.stderr,
            verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        # TODO: why doesn't this call list_all_files()?
        bibdocfiles = BibRecDocs(record).list_latest_files()
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' % record,
                    stream=sys.stderr,
                    verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    engine.get_keywords_from_local_file(fulltext,
                                                        taxonomy_name,
                                                        with_author_keywords=True,
                                                        output_mode="raw",
                                                        output_limit=output_limit,
                                                        match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype),
                    stream=sys.stderr,
                    verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = engine.clean_before_output(single_keywords)
                cleaned_composite = engine.clean_before_output(
                    composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # These stay None for non-PDF documents; guard the merge so
            # dict.update() is never called with None.
            acro.update(acronyms or {})
            akws.update(author_keywords or {})

        if keywords:
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(
                engine._output_marc(keywords.items(), (),
                                    akws,
                                    acro,
                                    spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record,
                                  stream=sys.stderr,
                                  verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' %
                                     (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)