def output_keywords_for_sources(input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, **kwargs): """Outputs the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print "Input file: %s" % source output = get_keywords_from_text(text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms ) print output # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url(entry, user_agent=bconfig.CFG_BIBCLASSIFY_USER_AGENT) if text_lines: source = entry.split("/")[-1] process_lines()
def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
        match_mode="full", no_cache=False, with_author_keywords=False):
    """Extract the keywords of a local fulltext file.

    @param local_file: path of the file to analyze
    @param taxonomy: ontology used to build the keyword regular expressions
    @param rebuild_cache: force a rebuild of the regex cache
    @param match_mode: "full" or "partial" text matching
    @param no_cache: bypass the regex cache entirely
    @param with_author_keywords: also compute author keywords
    @return: tuple (single_keywords, composite_keywords)
    """
    text_lines = text_lines_from_local_file(local_file)
    global _SKWS
    global _CKWS
    # The single/composite keyword regexes are built once per process
    # and reused by later calls.
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)
            # FIX: previously execution fell through with uninitialized
            # caches and crashed further down; bail out with empty
            # results instead.  (Empty dicts assumed to match the shape
            # of get_single_keywords/get_composite_keywords results —
            # TODO confirm.)
            return ({}, {})
    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))
    author_keywords = None
    if with_author_keywords:
        # NOTE(review): author_keywords is computed but never returned
        # by this function — preserved as-is; confirm against callers.
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)
    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)
    single_keywords = get_single_keywords(_SKWS, fulltext)
    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)
    return (single_keywords, composite_keywords)
def get_keywords_from_local_file(local_file, taxonomy_name, output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False, only_core_tags=False,
        extract_acronyms=False, **kwargs):
    """Outputs keywords reading a local file.

    Thin wrapper: reads the text lines of the file and forwards every
    option unchanged.  Arguments and output are the same as for
    @see: get_keywords_from_text()
    """
    log.info("Analyzing keywords for local file %s." % local_file)
    lines = extractor.text_lines_from_local_file(local_file)
    # Collect the pass-through options once, then delegate.
    options = dict(
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms,
    )
    return get_keywords_from_text(lines, taxonomy_name, **options)
def output_keywords_for_local_file(local_file, taxonomy, rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None):
    """Outputs the keywords for a local file."""
    if verbose is not None:
        set_verbose_level(verbose)
    write_message("INFO: Analyzing keywords for local file %s." % local_file,
        stream=sys.stderr, verbose=3)
    # Read the fulltext, then hand everything over to the text-based
    # extraction routine.
    return get_keywords_from_text(
        text_lines_from_local_file(local_file),
        output_mode=output_mode,
        output_limit=output_limit,
        taxonomy=taxonomy,
        spires=spires,
        match_mode=match_mode,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        no_cache=no_cache)
def output_keywords_for_local_file(local_file, taxonomy, rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None):
    """Outputs the keywords for a local file.

    NOTE(review): this definition duplicates another identical
    `output_keywords_for_local_file` in this file; at import time the
    later definition wins.  Consider removing one of them.
    """
    if verbose is not None:
        set_verbose_level(verbose)
    write_message("INFO: Analyzing keywords for local file %s." % local_file,
        stream=sys.stderr, verbose=3)
    text_lines = text_lines_from_local_file(local_file)
    # Forwarded options, gathered in one place for readability.
    opts = {
        "output_mode": output_mode,
        "output_limit": output_limit,
        "taxonomy": taxonomy,
        "spires": spires,
        "match_mode": match_mode,
        "with_author_keywords": with_author_keywords,
        "rebuild_cache": rebuild_cache,
        "no_cache": no_cache,
    }
    return get_keywords_from_text(text_lines, **opts)
def get_keywords_from_local_file(local_file, taxonomy_name, output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False, only_core_tags=False,
        extract_acronyms=False, **kwargs):
    """Outputs keywords reading a local file.

    Arguments and output are the same as for
    @see: get_keywords_from_text()
    """
    log.info("Analyzing keywords for local file %s." % local_file)
    # Single-expression delegation: load the file's lines and forward
    # every option unchanged.
    return get_keywords_from_text(
        extractor.text_lines_from_local_file(local_file),
        taxonomy_name,
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms)
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False, output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, match_mode="full", no_cache=False, with_author_keywords=False, spires=False, verbose=None, only_core_tags=False, extract_acronyms=False): """Outputs the keywords for each source in sources.""" if verbose is not None: set_verbose_level(verbose) # Initialize cache global _SKWS global _CKWS _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache, no_cache=no_cache) # Get the fulltext for each source. for entry in input_sources: write_message("INFO: Trying input file %s." % entry, stream=sys.stderr, verbose=3) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): if os.path.isfile(entry + filename): text_lines = text_lines_from_local_file(entry + filename) if text_lines: source = filename elif os.path.isfile(entry): text_lines = text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) else: # Treat as a URL. text_lines = text_lines_from_url(entry, user_agent=CFG_BIBCLASSIFY_USER_AGENT) if text_lines: source = entry.split("/")[-1] if source: if output_mode == "text": print "Input file: %s" % source keywords = get_keywords_from_text(text_lines, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, with_author_keywords=with_author_keywords, only_core_tags=only_core_tags) if extract_acronyms: acronyms = get_acronyms("\n".join(text_lines)) if acronyms: acronyms_str = ["\nAcronyms:"] for acronym, expansions in acronyms.iteritems(): expansions_str = ", ".join(["%s (%d)" % expansion for expansion in expansions]) acronyms_str.append("%s %s" % (acronym, expansions_str)) acronyms_str = "\n".join(acronyms_str) else: acronyms_str = "\nNo acronyms." print keywords + acronyms_str + "\n" else: print keywords
def output_keywords_for_sources( input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, **kwargs): """Outputs the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print "Input file: %s" % source output = get_keywords_from_text( text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms) print output # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url( entry, user_agent=make_user_agent_string("BibClassify")) if text_lines: source = entry.split("/")[-1] process_lines()