def bibclassify_exhaustive_call(text_files, taxonomy, rebuild_cache=False,
                                no_cache=False, output_mode='text',
                                output_limit=20, spires=False,
                                match_mode='full', with_author_keywords=False,
                                extract_acronyms=False, only_core_tags=False):
    """Call bibclassify on a local file and return the extracted keywords."""
    output_mode = output_mode.split(",")
    return engine.get_keywords_from_local_file(
        local_file=text_files,
        taxonomy_name=taxonomy,
        rebuild_cache=rebuild_cache,
        no_cache=no_cache,
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        with_author_keywords=with_author_keywords,
        extract_acronyms=extract_acronyms,
        only_core_tags=only_core_tags,
        api=True)
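
# Illustrative usage sketch (not part of the original module): how a caller
# might invoke bibclassify_exhaustive_call() on a local fulltext file.  The
# file path, taxonomy name and option values below are hypothetical examples.
#
#     keywords = bibclassify_exhaustive_call('/tmp/fulltext.pdf', 'HEP',
#                                            output_mode='text,marcxml',
#                                            output_limit=10,
#                                            with_author_keywords=True)
#
# Note that output_mode is given as a comma-separated string; it is split
# into a list before being forwarded to engine.get_keywords_from_local_file().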
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in the collection with the corresponding taxonomy_name.

    @param records: list of recids to process
    @param taxonomy_name: str, name of the taxonomy, e.g. HEP
    @param collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." % collection,
            stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        # TODO: why doesn't this call list_all_files()?
        bibdocfiles = BibRecDocs(record).list_latest_files()
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' % record,
                    stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()
                single_keywords, composite_keywords, author_keywords, acronyms = \
                    engine.get_keywords_from_local_file(
                        fulltext, taxonomy_name,
                        with_author_keywords=True,
                        output_mode="raw",
                        output_limit=output_limit,
                        match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.'
                    % (doc.fullpath, doc.doctype),
                    stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = engine.clean_before_output(single_keywords)
                cleaned_composite = engine.clean_before_output(composite_keywords)
                # Merge the groups into one.
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
                acro.update(acronyms)
                akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(engine._output_marc(keywords.items(), (), akws, acro,
                                              spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.'
                                  % record, stream=sys.stderr, verbose=0)

        _INDEX += 1
        bibtask.task_update_progress('Done %d out of %d.'
                                     % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
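
# Illustrative usage sketch (not part of the original module): a task-level
# caller could feed _analyze_documents() the recids of a collection and store
# the returned MARCXML.  The recid list, collection name and destination file
# below are hypothetical; the function also expects the module-level _INDEX
# and _RECIDS_NUMBER counters to have been initialised by the calling task.
#
#     recids = [1, 2, 3]
#     marcxml = _analyze_documents(recids, 'HEP', 'Theses', output_limit=5)
#     if marcxml:
#         open('/tmp/bibclassify_output.xml', 'w').write(marcxml)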