def generate_keywords(req, recid, store_keywords=True):
    req.write("Please be patient while the keywords classification is running...")
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    keywords = []
    for doc in bibdocfiles:
        # Get the keywords for each PDF document contained in the record.
        if is_pdf(doc.get_full_path()):
            fulltext = doc.get_full_path()
            from invenio.bibclassify_engine import get_keywords_from_local_file
            single_keywords, composite_keywords = get_keywords_from_local_file(
                fulltext, taxonomy='HEP', with_author_keywords=True)
            for keyword, spans in single_keywords.items():
                keywords.append([keyword.concept, len(spans)])
            for keyword, num, components in composite_keywords:
                keywords.append([keyword.concept, num])
    if keywords and store_keywords:
        output = ['<collection><record>\n'
                  '<controlfield tag="001">%s</controlfield>' % recid]
        output.append(output_marc(single_keywords, composite_keywords,
                                  spires=False, taxonomy='HEP'))
        output.append('</record></collection>')
        tmp_directory = "%s/bibclassify" % CFG_TMPDIR
        filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S",
                                                         time.localtime())
        abs_path = os.path.join(tmp_directory, filename)
        if not os.path.isdir(tmp_directory):
            os.mkdir(tmp_directory)
        file_desc = open(abs_path, "w")
        file_desc.write('\n'.join(output))
        file_desc.close()
        #cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        #os.system(cmd)
    return keywords
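# --- Usage sketch (not part of the original module) -------------------------
# A minimal illustration of what generate_keywords() returns, assuming a record
# that has at least one attached PDF.  FakeRequest is a hypothetical stand-in
# for the request object: it only provides the write() method that
# generate_keywords() calls; the recid used below is an assumed example value.
class FakeRequest(object):
    def write(self, msg):
        print(msg)

def example_generate_keywords(recid=42):
    # store_keywords=False skips the MARCXML temp-file step and just
    # returns the extracted keywords.
    keywords = generate_keywords(FakeRequest(), recid, store_keywords=False)
    for concept, count in keywords:
        # Each entry pairs a taxonomy concept with its occurrence count.
        print("%3d  %s" % (count, concept))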
def _analyze_documents(records, ontology, collection):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding ontology."""
    global _INDEX

    if not records:
        # No records could be found.
        write_message("WARNING: No records were found in collection %s." %
                      collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()
        output.append('<record>')
        output.append('<controlfield tag="001">%s</controlfield>' % record)
        for doc in bibdocfiles:
            # Get the keywords for each PDF document contained in the record.
            if is_pdf(doc.get_full_path()):
                write_message('INFO: Generating keywords for record %d.' %
                              record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_full_path()
                output.append(output_keywords_for_local_file(
                    fulltext, taxonomy=ontology, output_mode="marcxml",
                    output_limit=3, match_mode="partial",
                    with_author_keywords=True,
                    verbose=task_get_option('verbose')))
        _INDEX += 1
        output.append('</record>')
        task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
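# --- Usage sketch (not part of the original module) -------------------------
# _analyze_documents() returns bare <record> fragments, so a caller still has
# to wrap them in a <collection> element before they form valid MARCXML -- the
# same wrapping that generate_keywords() performs above.  The record ids,
# taxonomy and collection name below are assumed example values, and the call
# is normally made from within a running bibtask.
def example_analyze_collection():
    recids = [12, 13, 14]                      # assumed example record ids
    body = _analyze_documents(recids, 'HEP', 'Articles')
    if body:
        return '<collection>\n%s\n</collection>' % body
    return None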
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message("WARNING: No records were found in collection %s." %
                              collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()  # TODO: why this doesn't call list_all_files() ?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message('INFO: Generating keywords for record %d.' %
                                      record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()
                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(
                        fulltext, taxonomy_name, with_author_keywords=True,
                        output_mode="raw", output_limit=output_limit,
                        match_mode='partial')
            else:
                bibtask.write_message('WARNING: BibClassify does not know how to '
                                      'process doc: %s (type: %s) -- ignoring it.' %
                                      (doc.fullpath, doc.doctype),
                                      stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
                acro.update(acronyms)
                akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(bibclassify_engine._output_marc(
                keywords.items(), (), akws, acro,
                spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record, stream=sys.stderr, verbose=0)

        _INDEX += 1
        bibtask.task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
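# --- Usage sketch (not part of the original module) -------------------------
# The newer _analyze_documents() also returns a stream of <record> fragments;
# a caller would typically wrap them in a <collection> element and persist the
# result to a temporary MARCXML file for a later bibupload run, along the lines
# of the commented-out upload step in generate_keywords() above.  CFG_TMPDIR is
# assumed to be importable from invenio.config, and the record ids, taxonomy
# and collection name are assumed example values.
def example_store_keywords_for_upload():
    import os
    import time
    from invenio.config import CFG_TMPDIR
    body = _analyze_documents([12, 13, 14], 'HEP', 'Articles', output_limit=3)
    if not body:
        return None
    tmp_directory = os.path.join(CFG_TMPDIR, "bibclassify")
    if not os.path.isdir(tmp_directory):
        os.mkdir(tmp_directory)
    filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S",
                                                     time.localtime())
    abs_path = os.path.join(tmp_directory, filename)
    file_desc = open(abs_path, "w")
    file_desc.write('<collection>\n%s\n</collection>' % body)
    file_desc.close()
    return abs_path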