def is_pdf(document):
    """Checks if a document is a PDF file. Returns True if it is."""
    if not executable_exists("file"):
        write_message("WARNING: GNU file was not found on the system. "
            "Switching to a weak file extension test.", stream=sys.stderr,
            verbose=2)
        if document.lower().endswith(".pdf"):
            return True
        return False
    # Tested with file version >= 4.10. First test is secure and works
    # with file version 4.25. Second condition is tested for file
    # version 4.10.
    file_output = os.popen('file ' + re.escape(document)).read()
    try:
        filetype = file_output.split(":")[1]
    except IndexError:
        write_message("WARNING: Your version of the 'file' utility seems to "
            "be unsupported. Please report this to [email protected].",
            stream=sys.stderr, verbose=2)
        sys.exit(1)

    pdf = filetype.find("PDF") > -1
    # This is how it should be done however this is incompatible with
    # file version 4.10.
    #os.popen('file -bi ' + document).read().find("application/pdf")
    return pdf
def output_keywords_for_local_file(local_file, taxonomy, rebuild_cache=False,
        output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None):
    """Outputs the keywords for a local file."""
    if verbose is not None:
        set_verbose_level(verbose)

    write_message("INFO: Analyzing keywords for local file %s." % local_file,
        stream=sys.stderr, verbose=3)
    text_lines = text_lines_from_local_file(local_file)

    return get_keywords_from_text(text_lines,
        output_mode=output_mode,
        output_limit=output_limit,
        taxonomy=taxonomy,
        spires=spires,
        match_mode=match_mode,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        no_cache=no_cache)
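# Editor's sketch (not part of the original API surface): how this entry
# point might be driven from another script. The file name and ontology name
# below are hypothetical placeholders.
def _example_output_keywords_for_local_file():
    """Illustrative only; never called by the module itself."""
    keywords = output_keywords_for_local_file("article.pdf", "HEP",
        output_limit=20, with_author_keywords=True, verbose=3)
    print keywords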
def text_lines_from_url(url, user_agent=""):
    """Returns the fulltext of the file found at the URL."""
    request = urllib2.Request(url)
    if user_agent:
        request.add_header("User-Agent", user_agent)
    try:
        distant_stream = urllib2.urlopen(request)
        # Write the URL content to a temporary file.
        tmpfd, local_file = tempfile.mkstemp(prefix="bibclassify.")
        os.close(tmpfd)
        local_stream = open(local_file, "w")
        local_stream.write(distant_stream.read())
        local_stream.close()
    except:
        write_message("ERROR: Unable to read from URL %s." % url,
            stream=sys.stderr, verbose=1)
        return None
    else:
        # Read lines from the temporary file.
        lines = text_lines_from_local_file(local_file, remote=True)
        os.remove(local_file)

        line_nb = len(lines)
        word_nb = 0
        for line in lines:
            word_nb += len(re.findall("\S+", line))

        write_message("INFO: Remote file has %d lines and %d words." %
            (line_nb, word_nb), stream=sys.stderr, verbose=3)
        return lines
def get_keywords_from_text(text_lines, taxonomy=None, output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
        match_mode="full", no_cache=False, with_author_keywords=False,
        rebuild_cache=False, only_core_tags=False):
    """Returns a formatted string containing the keywords for a single
    document."""
    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return _get_keywords_output(single_keywords, composite_keywords, taxonomy,
        author_keywords, output_mode, output_limit, spires, only_core_tags)
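# Editor's sketch of the programmatic path: feeding already-extracted text
# lines straight to get_keywords_from_text(). The sample sentences and the
# "HEP" ontology name are hypothetical placeholders.
def _example_get_keywords_from_text():
    """Illustrative only; never called by the module itself."""
    lines = ["We study top quark pair production in proton-proton",
             "collisions and measure the cross section."]
    return get_keywords_from_text(lines, taxonomy="HEP", output_mode="text",
        output_limit=10)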
def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
        match_mode="full", no_cache=False, with_author_keywords=False):
    """Returns the (single keywords, composite keywords) found in a local
    file."""
    text_lines = text_lines_from_local_file(local_file)

    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return (single_keywords, composite_keywords)
def _download_remote_ontology(onto_url, time_difference=None):
    """Checks if the online ontology is more recent than the local ontology.
    If yes, try to download and store it in Invenio's cache directory. Return
    a boolean describing the success of the operation."""
    if onto_url is None:
        return False

    dl_dir = ((CFG_CACHEDIR or tempfile.gettempdir()) + os.sep +
        "bibclassify" + os.sep)
    if not os.path.exists(dl_dir):
        os.mkdir(dl_dir)

    local_file = dl_dir + os.path.basename(onto_url)
    remote_modif_time = _get_last_modification_date(onto_url)
    try:
        local_modif_seconds = os.path.getmtime(local_file)
    except OSError:
        # The local file does not exist. Download the ontology.
        download = True
        write_message("INFO: The local ontology could not be found.",
            stream=sys.stderr, verbose=3)
    else:
        local_modif_time = datetime(*time.gmtime(local_modif_seconds)[0:6])
        # Let's set a time delta of 1 hour and 10 minutes.
        time_difference = time_difference or timedelta(hours=1, minutes=10)
        download = remote_modif_time > local_modif_time + time_difference
        if download:
            write_message("INFO: The remote ontology '%s' is more recent "
                "than the local ontology." % onto_url, stream=sys.stderr,
                verbose=3)

    if download:
        return _download_ontology(onto_url, local_file)
    else:
        return False
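# Editor's sketch of the freshness test used above: the remote copy is
# fetched only when it is newer than the local copy by more than the grace
# period. The timestamps below are made up for illustration.
def _example_freshness_check():
    """Illustrative only; never called by the module itself."""
    local_modif_time = datetime(2009, 1, 1, 12, 0, 0)
    remote_modif_time = datetime(2009, 1, 1, 14, 0, 0)
    grace_period = timedelta(hours=1, minutes=10)
    # True here: the remote file is more than 1h10 newer than the local one.
    return remote_modif_time > local_modif_time + grace_period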
def get_regular_expressions(ontology, rebuild=False, no_cache=False):
    """Returns a list of patterns compiled from the RDF/SKOS ontology.
    Uses cache if it exists and if the ontology hasn't changed."""
    # Translate the ontology name into a local path. Check if the name
    # relates to an existing ontology.
    ontology_names = _get_ontology_path(ontology)
    if ontology_names is not None:
        onto_long_name, onto_url = ontology_names
        onto_path = os.path.join(CFG_CACHEDIR, 'bibclassify', onto_long_name)
    else:
        write_message("ERROR: Unable to understand the ontology name "
            "provided: '%s'." % ontology, stream=sys.stderr, verbose=0)
        sys.exit(0)

    # If a new remote ontology can be found, then download it.
    new_ontology = _download_remote_ontology(onto_url)

    if new_ontology:
        # A new ontology has been downloaded. Rebuild the cache.
        return _build_cache(onto_path, no_cache=no_cache)

    if os.access(onto_path, os.R_OK):
        # Can read from the ontology.
        if rebuild or no_cache:
            write_message("INFO: Cache generation is manually forced.",
                stream=sys.stderr, verbose=3)
            return _build_cache(onto_path, no_cache=no_cache)

        if os.access(_get_cache_path(onto_path), os.R_OK):
            if (os.path.getmtime(_get_cache_path(onto_path)) >
                os.path.getmtime(onto_path)):
                # Cache is more recent than the ontology: use cache.
                return _get_cache(onto_path)
            else:
                # Ontology is more recent than the cache: rebuild cache.
                if not no_cache:
                    write_message("WARNING: The ontology '%s' has changed "
                        "since the last cache generation." % ontology,
                        stream=sys.stderr, verbose=2)
                return _build_cache(onto_path, no_cache=no_cache)
        else:
            # Cache does not exist. Build cache.
            return _build_cache(onto_path, no_cache=no_cache)
    else:
        if os.access(_get_cache_path(onto_path), os.R_OK):
            # Ontology file not found. Use the cache instead.
            write_message("WARNING: The ontology couldn't be located. However "
                "a cached version of it is available. Using it as a "
                "reference.", stream=sys.stderr, verbose=2)
            return _get_cache(onto_path)
        else:
            # Cannot access the ontology nor the cache. Exit.
            write_message("ERROR: Neither the ontology file nor a cached "
                "version of it could be found.", stream=sys.stderr, verbose=0)
            sys.exit(0)

    return None
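# Editor's sketch: get_regular_expressions() returns the two keyword
# databases that the matching functions below expect. The "HEP" ontology name
# is a hypothetical placeholder.
def _example_get_regular_expressions():
    """Illustrative only; never called by the module itself."""
    skw_db, ckw_db = get_regular_expressions("HEP", rebuild=False,
        no_cache=False)
    return len(skw_db), len(ckw_db)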
def _replace_greek_characters(line):
    """Replace greek characters in a string."""
    for greek_char, replacement in _GREEK_REPLACEMENTS.iteritems():
        try:
            line = line.replace(greek_char, replacement)
        except UnicodeDecodeError:
            write_message("WARNING: Unicode decoding error.",
                stream=sys.stderr, verbose=2)
            return ""

    return line
def get_author_keywords(skw_db, ckw_db, fulltext):
    """Finds out human defined keywords in a text string. Searches for
    the string "Keywords:" and its variations and matches the following
    words."""
    timer_start = time.clock()

    split_string = CFG_BIBCLASSIFY_AUTHOR_KW_START.split(fulltext, 1)
    if len(split_string) == 1:
        write_message("INFO: Matching author keywords... No keywords found.",
            stream=sys.stderr, verbose=3)
        return None

    kw_string = split_string[1]

    for regex in CFG_BIBCLASSIFY_AUTHOR_KW_END:
        parts = regex.split(kw_string, 1)
        kw_string = parts[0]

    # We separate the keywords.
    author_keywords = CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION.split(kw_string)

    write_message("INFO: Matching author keywords... %d keywords found in "
        "%.1f sec." % (len(author_keywords), time.clock() - timer_start),
        stream=sys.stderr, verbose=3)

    out = {}
    for kw in author_keywords:
        # If the author keyword is an acronym with capital letters
        # separated by points, remove the points.
        if re.match('([A-Z].)+$', kw):
            kw = kw.replace('.', '')

        # First try with the keyword as such, then lower it.
        kw_with_spaces = ' %s ' % kw
        matching_skw = get_single_keywords(skw_db, kw_with_spaces,
            verbose=False)
        matching_ckw = get_composite_keywords(ckw_db, kw_with_spaces,
            matching_skw, verbose=False)
        if matching_skw or matching_ckw:
            out[kw] = (matching_skw, matching_ckw)
            continue

        lowkw = kw.lower()
        matching_skw = get_single_keywords(skw_db, ' %s ' % lowkw,
            verbose=False)
        matching_ckw = get_composite_keywords(ckw_db, ' %s ' % lowkw,
            matching_skw, verbose=False)
        out[kw] = (matching_skw, matching_ckw)

    return out
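# Editor's sketch of the splitting performed above, assuming
# CFG_BIBCLASSIFY_AUTHOR_KW_START matches a "Keywords:" marker,
# CFG_BIBCLASSIFY_AUTHOR_KW_END marks the end of the list and
# CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION matches commas or semicolons. The
# regexes below are simplified stand-ins, not the module's configuration.
def _example_author_keyword_split():
    """Illustrative only; never called by the module itself."""
    fulltext = "...body... Keywords: dark matter; neutralino, LHC\nPACS: 95.35"
    start = re.compile(r"Keywords\s*:", re.I)   # stand-in for *_KW_START
    end = re.compile(r"\n")                     # stand-in for *_KW_END
    separation = re.compile(r"[,;]\s*")         # stand-in for *_KW_SEPARATION
    tail = start.split(fulltext, 1)[1]
    tail = end.split(tail, 1)[0]
    return [kw.strip() for kw in separation.split(tail) if kw.strip()]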
class CompositeKeyword:
    """Represents a composite keyword of the taxonomy."""

    def __init__(self, store, namespace, subject):
        small_subject = subject.split("#Composite.")[-1]

        try:
            self.concept = store.value(subject, namespace["prefLabel"],
                any=True)
        except KeyError:
            # Keyword has no prefLabel. We can discard that error.
            write_message("WARNING: Keyword with subject %s has no prefLabel" %
                small_subject, stream=sys.stderr, verbose=2)

        component_positions = []
        for label in store.objects(subject, namespace["compositeOf"]):
            strlabel = str(label).split("#")[-1]
            component_name = label.split("#")[-1]
            component_positions.append((small_subject.find(component_name),
                strlabel))

        self.compositeof = []
        component_positions.sort()
        try:
            for position in component_positions:
                self.compositeof.append(
                    single_keywords_by_subject[position[1]])
        except KeyError:
            # One single keyword is not present in the taxonomy. This
            # is due to an error in the taxonomy description.
            self.compositeof = []

        self.core = False
        for note in map(lambda s: str(s).lower().strip(),
            store.objects(subject, namespace["note"])):
            if note == 'core':
                self.core = True

        self.spires = store.value(subject, namespace["spiresLabel"], any=True)

        self.regex = []
        for label in store.objects(subject, namespace["altLabel"]):
            pattern = _get_regex_pattern(label)
            self.regex.append(re.compile(CFG_BIBCLASSIFY_WORD_WRAP % pattern))

        self.fieldcodes = []
        for code in store.objects(subject, namespace["field"]):
            self.fieldcodes.append(str(code))
def cut_references(text_lines):
    """Returns the text lines with the references cut."""
    ref_sect_start = find_reference_section(text_lines)
    if ref_sect_start is not None:
        start = ref_sect_start["start_line"]
        end = find_end_of_reference_section(text_lines, start,
            ref_sect_start["marker"], ref_sect_start["marker_pattern"])
        del text_lines[start:end + 1]
    else:
        write_message("WARNING: No references could be found.",
            stream=sys.stderr, verbose=2)

    return text_lines
def text_lines_from_local_file(document, remote=False):
    """Returns the fulltext of the local file."""
    try:
        if is_pdf(document):
            if not executable_exists("pdftotext"):
                write_message("ERROR: pdftotext is not available on the "
                    "system.", stream=sys.stderr, verbose=1)
            cmd = "pdftotext -q -enc UTF-8 %s -" % re.escape(document)
            filestream = os.popen(cmd)
        else:
            filestream = open(document, "r")
    except IOError, ex1:
        write_message("ERROR: Unable to read from file %s. (%s)" %
            (document, ex1.strerror), stream=sys.stderr, verbose=1)
        return None

    lines = [line.decode("utf-8") for line in filestream]
    filestream.close()

    if not _is_english_text('\n'.join(lines)):
        write_message("WARNING: It seems the file '%s' is invalid and doesn't "
            "contain text. Please communicate this file to the Invenio "
            "team." % document, stream=sys.stderr, verbose=0)

    line_nb = len(lines)
    word_nb = 0
    for line in lines:
        word_nb += len(re.findall("\S+", line))

    # Discard lines that do not contain at least one word.
    lines = [line for line in lines if _ONE_WORD.search(line) is not None]

    if not remote:
        write_message("INFO: Local file has %d lines and %d words." %
            (line_nb, word_nb), stream=sys.stderr, verbose=3)

    return lines
def get_single_keywords(skw_db, fulltext, verbose=True):
    """Returns a dictionary of single keywords bound with the positions
    of the matches in the fulltext.
    Format of the output dictionary is (single keyword: positions)."""
    timer_start = time.clock()

    # Matched span -> single keyword
    records = []

    for single_keyword in skw_db:
        for regex in single_keyword.regex:
            for match in regex.finditer(fulltext):
                # Modify the right index to put it on the last letter
                # of the word.
                span = (match.span()[0], match.span()[1] - 1)

                # Remove the previous records contained by this span
                records = [record for record in records
                           if not _contains_span(span, record[0])]

                add = True
                for previous_record in records:
                    if ((span, single_keyword) == previous_record or
                        _contains_span(previous_record[0], span)):
                        # Match is contained by a previous match.
                        add = False
                        break

                if add:
                    records.append((span, single_keyword))

    # List of single_keywords: {spans: single keyword}
    single_keywords = {}
    for span, single_keyword in records:
        single_keywords.setdefault(single_keyword, []).append(span)

    if verbose:
        write_message("INFO: Matching single keywords... %d keywords found "
            "in %.1f sec." % (len(single_keywords),
            time.clock() - timer_start), stream=sys.stderr, verbose=3)

    return single_keywords
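# Editor's sketch of the span-containment test relied on above. The real
# helper is _contains_span(), defined elsewhere in the module; the version
# below is an assumption about its semantics (span0 strictly contains span1)
# and is kept under a different name so it does not shadow the original.
def _example_contains_span(span0, span1):
    """Illustrative only; never called by the module itself."""
    if (span0 == span1 or
        span1[0] < span0[0] or
        span1[1] > span0[1]):
        return False
    return True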
def _get_cache_path(source_file):
    """Returns the file name of the cached taxonomy."""
    global _cache_location
    relative_dir = "bibclassify"
    cache_name = os.path.basename(source_file) + ".db"

    if _cache_location is not None:
        # The location of the cache has been previously found.
        return _cache_location
    else:
        # Find the most probable location of the cache. First consider
        # Invenio's temp directory then the system temp directory.
        if os.access(CFG_CACHEDIR, os.W_OK):
            tmp_dir = CFG_CACHEDIR
        else:
            tmp_dir = tempfile.gettempdir()

        absolute_dir = os.path.join(tmp_dir, relative_dir)

        # Test bibclassify's directory in the temp directory.
        if not os.path.exists(absolute_dir):
            try:
                os.mkdir(absolute_dir)
            except:
                write_message("WARNING: Impossible to write in the temp "
                    "directory %s." % tmp_dir, stream=sys.stderr, verbose=2)
                _cache_location = ""
                return _cache_location

        # At that time, the bibclassify's directory should exist. Test if it's
        # readable and writable.
        if os.access(absolute_dir, os.R_OK) and os.access(absolute_dir,
            os.W_OK):
            _cache_location = os.path.join(absolute_dir, cache_name)
            return _cache_location
        else:
            write_message("WARNING: Cache directory does exist but is not "
                "accessible. Check your permissions.", stream=sys.stderr,
                verbose=2)
            _cache_location = ""
            return _cache_location
def _get_cache(source_file):
    """Get the cached taxonomy using the cPickle module. No check is done at
    that stage."""
    timer_start = time.clock()

    cache_file = _get_cache_path(source_file)
    filestream = open(cache_file, "r")
    try:
        cached_data = cPickle.load(filestream)
    except (cPickle.UnpicklingError, AttributeError, DeprecationWarning):
        write_message("WARNING: The existing cache in %s is not readable. "
            "Rebuilding it." % cache_file, stream=sys.stderr, verbose=3)
        filestream.close()
        os.remove(cache_file)
        return _build_cache(source_file)
    filestream.close()

    single_keywords = cached_data["single"]
    composite_keywords = cached_data["composite"]

    write_message("INFO: Found ontology cache created on %s." %
        time.asctime(cached_data["creation_time"]), stream=sys.stderr,
        verbose=3)

    write_message("INFO: Retrieved cache... %d terms read in %.1f sec." %
        (len(single_keywords) + len(composite_keywords),
        time.clock() - timer_start), stream=sys.stderr, verbose=3)

    return (single_keywords, composite_keywords)
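# Editor's sketch: the pickled cache read above is a plain dictionary written
# by _build_cache() below with exactly these keys. The placeholder values
# only document the expected layout.
def _example_cache_layout():
    """Illustrative only; never called by the module itself."""
    return {
        "single": [],                    # list of SingleKeyword objects
        "composite": [],                 # list of CompositeKeyword objects
        "creation_time": time.gmtime(),  # struct_time of the cache build
    }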
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
        output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None, only_core_tags=False,
        extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
        no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry,
            stream=sys.stderr, verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                filepath = os.path.join(entry, filename)
                if os.path.isfile(filepath):
                    text_lines = text_lines_from_local_file(filepath)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                output_mode=output_mode,
                output_limit=output_limit,
                spires=spires,
                match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                            for expansion in expansions])
                        acronyms_str.append("%s %s" % (acronym,
                            expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."
                print keywords + acronyms_str + "\n"
            else:
                print keywords
def get_composite_keywords(ckw_db, fulltext, skw_spans, verbose=True):
    """Returns a list of composite keywords bound with the number of
    occurrences found in the text string.
    Format of the output list is (composite keyword, count, component
    counts)."""
    timer_start = time.clock()

    # Build the list of composite candidates
    ckw_list = []
    skw_as_components = []

    for composite_keyword in ckw_db:
        # Counters for the composite keyword. First count is for the
        # number of occurrences in the whole document and second count
        # is for the human defined keywords.
        ckw_count = 0
        matched_spans = []

        # Check the alternative labels.
        for regex in composite_keyword.regex:
            for match in regex.finditer(fulltext):
                span = list(match.span())
                span[1] -= 1
                span = tuple(span)
                if not span in matched_spans:
                    ckw_count += 1
                    matched_spans.append(span)

        # Get the single keywords locations.
        try:
            components = composite_keyword.compositeof
        except AttributeError:
            print >> sys.stderr, ("Cached ontology is corrupted. Please "
                "remove the cached ontology in your temporary file.")
            sys.exit(1)

        try:
            spans = [skw_spans[component] for component in components]
        except KeyError:
            # The keyword components are not to be found in the text.
            # This is not a dramatic exception and we can safely ignore
            # it.
            pass
        else:
            ckw_spans = []
            for index in range(len(spans) - 1):
                if ckw_spans:
                    previous_spans = ckw_spans
                else:
                    previous_spans = spans[index]

                ckw_spans = []
                for new_span in [(span0, span1) for span0 in previous_spans
                                 for span1 in spans[index + 1]]:
                    span = _get_ckw_span(fulltext, new_span)
                    if span is not None:
                        ckw_spans.append(span)

            for span in [span for span in ckw_spans
                         if not span in matched_spans]:
                ckw_count += 1
                matched_spans.append(span)

        if ckw_count:
            # Gather the component counts.
            component_counts = []
            for component in components:
                skw_as_components.append(component)
                # Get the single keyword count.
                try:
                    component_counts.append(len(skw_spans[component]))
                except KeyError:
                    component_counts.append(0)

            # Store the composite keyword
            ckw_list.append((composite_keyword, ckw_count, component_counts))

    # Remove the single keywords that appear as components from the list
    # of single keywords.
    for skw in skw_as_components:
        try:
            del skw_spans[skw]
        except KeyError:
            pass

    if verbose:
        write_message("INFO: Matching composite keywords... %d keywords found "
            "in %.1f sec." % (len(ckw_list), time.clock() - timer_start),
            stream=sys.stderr, verbose=3)

    return ckw_list
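# Editor's sketch of how a caller might consume the (composite keyword,
# count, component counts) tuples returned above; purely illustrative.
def _example_consume_composite_matches(ckw_list):
    """Illustrative only; never called by the module itself."""
    for composite_keyword, count, component_counts in ckw_list:
        print "%s: %d occurrence(s), components seen %s times" % (
            composite_keyword.concept, count, component_counts)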
    return datetime(*(parsed)[0:6])


def _download_ontology(url, local_file):
    """Downloads the ontology and stores it in CFG_CACHEDIR."""
    write_message("INFO: Copying remote ontology '%s' to file '%s'." %
        (url, local_file), stream=sys.stderr, verbose=3)
    try:
        url_desc = urllib2.urlopen(url)
        file_desc = open(local_file, 'w')
        file_desc.write(url_desc.read())
        file_desc.close()
    except IOError, e:
        print e
        return False
    except:
        write_message("WARNING: Unable to download the ontology. '%s'" %
            sys.exc_info()[0], stream=sys.stderr, verbose=2)
        return False
    else:
        write_message("INFO: Done copying.", stream=sys.stderr, verbose=3)
        return True


def _get_searchable_regex(basic=None, hidden=None):
    """Returns the searchable regular expressions for the single
    keyword."""
    # Hidden labels are used to store regular expressions.
    basic = basic or []
    hidden = hidden or []

    hidden_regex_dict = {}
    for hidden_label in hidden:
        if _is_regex(hidden_label):
def check_taxonomy(taxonomy):
    """Checks the consistency of the taxonomy and outputs a list of
    errors and warnings."""
    write_message("INFO: Building graph with Python RDFLib version %s" %
        rdflib.__version__, stream=sys.stdout, verbose=0)

    if rdflib.__version__ >= '2.3.2':
        store = rdflib.ConjunctiveGraph()
    else:
        store = rdflib.Graph()

    try:
        store.parse(taxonomy)
    except:
        write_message("ERROR: The taxonomy is not a valid RDF file. Are you "
            "trying to check a controlled vocabulary?", stream=sys.stdout,
            verbose=0)
        sys.exit(0)

    write_message("INFO: Graph was successfully built.", stream=sys.stdout,
        verbose=0)

    prefLabel = "prefLabel"
    hiddenLabel = "hiddenLabel"
    altLabel = "altLabel"
    composite = "composite"
    compositeOf = "compositeOf"
    note = "note"

    both_skw_and_ckw = []

    # Build a dictionary we will reason on later.
    uniq_subjects = {}
    for subject in store.subjects():
        uniq_subjects[subject] = None

    subjects = {}
    for subject in uniq_subjects:
        strsubject = str(subject).split("#Composite.")[-1]
        strsubject = strsubject.split("#")[-1]
        if (strsubject == "http://cern.ch/thesauri/HEPontology.rdf" or
            strsubject == "compositeOf"):
            continue
        components = {}
        for predicate, value in store.predicate_objects(subject):
            strpredicate = str(predicate).split("#")[-1]
            strobject = str(value).split("#Composite.")[-1]
            strobject = strobject.split("#")[-1]
            components.setdefault(strpredicate, []).append(strobject)
        if strsubject in subjects:
            both_skw_and_ckw.append(strsubject)
        else:
            subjects[strsubject] = components

    write_message("INFO: Taxonomy contains %s concepts." % len(subjects),
        stream=sys.stdout, verbose=0)

    no_prefLabel = []
    multiple_prefLabels = []
    bad_notes = []
    # Subjects with no composite or compositeOf predicate
    lonely = []
    both_composites = []
    bad_hidden_labels = {}
    bad_alt_labels = {}
    # Problems with composite keywords
    composite_problem1 = []
    composite_problem2 = []
    composite_problem3 = []
    composite_problem4 = {}
    composite_problem5 = []
    composite_problem6 = []

    stemming_collisions = []
    interconcept_collisions = {}

    for subject, predicates in subjects.iteritems():
        # No prefLabel or multiple prefLabels
        try:
            if len(predicates[prefLabel]) > 1:
                multiple_prefLabels.append(subject)
        except KeyError:
            no_prefLabel.append(subject)

        # Lonely and both composites.
        if not composite in predicates and not compositeOf in predicates:
            lonely.append(subject)
        elif composite in predicates and compositeOf in predicates:
            both_composites.append(subject)

        # Multiple or bad notes
        if note in predicates:
            bad_notes += [(subject, n) for n in predicates[note]
                          if n not in ('nostandalone', 'core')]

        # Bad hidden labels
        if hiddenLabel in predicates:
            for lbl in predicates[hiddenLabel]:
                if lbl.startswith("/") ^ lbl.endswith("/"):
                    bad_hidden_labels.setdefault(subject, []).append(lbl)

        # Bad alt labels
        if altLabel in predicates:
            for lbl in predicates[altLabel]:
                if len(re.findall("/", lbl)) >= 2 or ":" in lbl:
                    bad_alt_labels.setdefault(subject, []).append(lbl)

        # Check composite
        if composite in predicates:
            for ckw in predicates[composite]:
                if ckw in subjects:
                    if compositeOf in subjects[ckw]:
                        if not subject in subjects[ckw][compositeOf]:
                            composite_problem3.append((subject, ckw))
                    else:
                        if not ckw in both_skw_and_ckw:
                            composite_problem2.append((subject, ckw))
                else:
                    composite_problem1.append((subject, ckw))

        # Check compositeOf
        if compositeOf in predicates:
            for skw in predicates[compositeOf]:
                if skw in subjects:
                    if composite in subjects[skw]:
                        if not subject in subjects[skw][composite]:
                            composite_problem6.append((subject, skw))
                    else:
                        if not skw in both_skw_and_ckw:
                            composite_problem5.append((subject, skw))
                else:
                    composite_problem4.setdefault(skw, []).append(subject)

        # Check for stemmed labels
        if compositeOf in predicates:
            labels = (altLabel, hiddenLabel)
        else:
            labels = (prefLabel, altLabel, hiddenLabel)

        patterns = {}
        for label in [lbl for lbl in labels if lbl in predicates]:
            for expression in [expr for expr in predicates[label]
                               if not _is_regex(expr)]:
                pattern = _get_regex_pattern(expression)
                interconcept_collisions.setdefault(pattern,
                    []).append((subject, label))
                if pattern in patterns:
                    stemming_collisions.append((subject, patterns[pattern],
                        (label, expression)))
                else:
                    patterns[pattern] = (label, expression)

    print "\n==== ERRORS ===="

    if no_prefLabel:
        print "\nConcepts with no prefLabel: %d" % len(no_prefLabel)
        print "\n".join([" %s" % subj for subj in no_prefLabel])
    if multiple_prefLabels:
        print ("\nConcepts with multiple prefLabels: %d" %
            len(multiple_prefLabels))
        print "\n".join([" %s" % subj for subj in multiple_prefLabels])
    if both_composites:
        print ("\nConcepts with both composite properties: %d" %
            len(both_composites))
        print "\n".join([" %s" % subj for subj in both_composites])
    if bad_hidden_labels:
        print "\nConcepts with bad hidden labels: %d" % len(bad_hidden_labels)
        for kw, lbls in bad_hidden_labels.iteritems():
            print " %s:" % kw
            print "\n".join([" '%s'" % lbl for lbl in lbls])
    if bad_alt_labels:
        print "\nConcepts with bad alt labels: %d" % len(bad_alt_labels)
        for kw, lbls in bad_alt_labels.iteritems():
            print " %s:" % kw
            print "\n".join([" '%s'" % lbl for lbl in lbls])
    if both_skw_and_ckw:
        print ("\nKeywords that are both skw and ckw: %d" %
            len(both_skw_and_ckw))
        print "\n".join([" %s" % subj for subj in both_skw_and_ckw])

    print

    if composite_problem1:
        print "\n".join(["SKW '%s' references a nonexistent CKW '%s'." %
            (skw, ckw) for skw, ckw in composite_problem1])
    if composite_problem2:
        print "\n".join(["SKW '%s' references a SKW '%s'." %
            (skw, ckw) for skw, ckw in composite_problem2])
    if composite_problem3:
        print "\n".join(["SKW '%s' is not composite of CKW '%s'." %
            (skw, ckw) for skw, ckw in composite_problem3])
    if composite_problem4:
        for skw, ckws in composite_problem4.iteritems():
            print "SKW '%s' does not exist but is referenced by:" % skw
            print "\n".join([" %s" % ckw for ckw in ckws])
    if composite_problem5:
        print "\n".join(["CKW '%s' references a CKW '%s'." % kw
            for kw in composite_problem5])
    if composite_problem6:
        print "\n".join(["CKW '%s' is not composed by SKW '%s'." % kw
            for kw in composite_problem6])

    print "\n==== WARNINGS ===="

    if bad_notes:
        print ("\nConcepts with bad notes: %d" % len(bad_notes))
        print "\n".join([" '%s': '%s'" % note for note in bad_notes])
    if stemming_collisions:
        print ("\nFollowing keywords have unnecessary labels that have "
            "already been generated by BibClassify.")
        for subj in stemming_collisions:
            print " %s:\n %s\n and %s" % subj

    print "\nFinished."
    sys.exit(0)
def _ckw_matches_comparator(ckw0_match, ckw1_match):
    """Compares 2 composite keywords matches (composite_keyword, spans,
    components). First compare the occurrences, then the length of the
    word, at last the component counts."""
    count_comparison = cmp(ckw1_match[1], ckw0_match[1])
    if count_comparison:
        return count_comparison

    component_avg0 = sum(ckw0_match[2]) / len(ckw0_match[2])
    component_avg1 = sum(ckw1_match[2]) / len(ckw1_match[2])
    component_comparison = cmp(component_avg1, component_avg0)
    if component_comparison:
        return component_comparison
    else:
        return cmp(len(ckw1_match[0].concept), len(ckw0_match[0].concept))


def _get_sorted_skw_matches(skw_matches, limit=20):
    """Returns a resized version of data structures of keywords to the
    given length."""
    sorted_keywords = list(skw_matches.items())
    sorted_keywords.sort(_skw_matches_comparator)
    return limit and sorted_keywords[:limit] or sorted_keywords


def _resize_ckw_matches(keywords, limit=20):
    """Returns a resized version of the composite_keywords list."""
    keywords.sort(_ckw_matches_comparator)
    return limit and keywords[:limit] or keywords


if __name__ == "__main__":
    write_message("ERROR: Please use bibclassify_cli from now on.",
        stream=sys.stderr, verbose=0)
def _build_cache(source_file, no_cache=False):
    """Builds the cached data by parsing the RDF taxonomy file or a
    vocabulary file."""
    if rdflib.__version__ >= '2.3.2':
        store = rdflib.ConjunctiveGraph()
    else:
        store = rdflib.Graph()

    timer_start = time.clock()

    global single_keywords_by_subject
    global composite_keywords_by_subject

    single_keywords, composite_keywords = [], []

    try:
        write_message("INFO: Building RDFLib's conjunctive graph.",
            stream=sys.stderr, verbose=3)
        store.parse(source_file)
    except:
        # File is not a RDF file. We assume it is a controlled vocabulary.
        write_message("INFO: The ontology file is not a valid RDF file. "
            "Assuming it is a controlled vocabulary file.", stream=sys.stderr,
            verbose=3)
        filestream = open(source_file, "r")
        for line in filestream:
            keyword = line.strip()
            single_keywords.append(SingleKeyword(keyword))
    else:
        write_message("INFO: Building cache from RDF file %s." % source_file,
            stream=sys.stderr, verbose=3)
        # File is a RDF file.
        namespace = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")

        single_count = 0
        composite_count = 0

        for subject_object in store.subject_objects(namespace["prefLabel"]):
            # Keep only the single keywords.
            # FIXME: Remove or alter that condition in order to allow using
            # other ontologies that do not have this composite notion (such
            # as NASA-subjects.rdf)
            if not store.value(subject_object[0], namespace["compositeOf"],
                any=True):
                skw = SingleKeyword(subject_object[0], store=store,
                    namespace=namespace)
                single_keywords.append(skw)
                subject = str(subject_object[0]).split("#")[-1]
                single_keywords_by_subject[subject] = skw
                single_count += 1

        # Let's go through the composite keywords.
        for subject, pref_label in \
            store.subject_objects(namespace["prefLabel"]):
            # Keep only the composite keywords.
            if store.value(subject, namespace["compositeOf"], any=True):
                strsubject = str(subject).split("#")[-1]
                composite_keywords.append(CompositeKeyword(store, namespace,
                    subject))
                composite_count += 1

        store.close()

    cached_data = {}
    cached_data["single"] = single_keywords
    cached_data["composite"] = composite_keywords
    cached_data["creation_time"] = time.gmtime()

    write_message("INFO: Building taxonomy... %d terms built in %.1f sec." %
        (len(single_keywords) + len(composite_keywords),
        time.clock() - timer_start), stream=sys.stderr, verbose=3)

    if not no_cache:
        # Serialize.
        try:
            filestream = open(_get_cache_path(source_file), "w")
        except IOError:
            # Impossible to write the cache.
            write_message("ERROR: Impossible to write cache to %s." %
                _get_cache_path(source_file), stream=sys.stderr, verbose=1)
            return (single_keywords, composite_keywords)
        else:
            write_message("INFO: Writing cache to file %s." %
                _get_cache_path(source_file), stream=sys.stderr, verbose=3)
            cPickle.dump(cached_data, filestream, 1)
            filestream.close()

    return (single_keywords, composite_keywords)
"system.", stream=sys.stderr, verbose=1) cmd = "pdftotext -q -enc UTF-8 %s -" % re.escape(document) filestream = os.popen(cmd) else: filestream = open(document, "r") except IOError, ex1: write_message("ERROR: Unable to read from file %s. (%s)" % (document, ex1.strerror), stream=sys.stderr, verbose=1) return None lines = [line.decode("utf-8") for line in filestream] filestream.close() if not _is_english_text('\n'.join(lines)): write_message("WARNING: It seems the file '%s' is unvalid and doesn't " "contain text. Please communicate this file to the Invenio " "team." % document, stream=sys.stderr, verbose=0) line_nb = len(lines) word_nb = 0 for line in lines: word_nb += len(re.findall("\S+", line)) # Discard lines that do not contain at least one word. lines = [line for line in lines if _ONE_WORD.search(line) is not None] if not remote: write_message("INFO: Local file has %d lines and %d words." % (line_nb, word_nb), stream=sys.stderr, verbose=3) return lines
"""Downloads the ontology and stores it in CFG_CACHEDIR.""" write_message("INFO: Copying remote ontology '%s' to file '%s'." % (url, local_file), stream=sys.stderr, verbose=3) try: url_desc = urllib2.urlopen(url) file_desc = open(local_file, 'w') file_desc.write(url_desc.read()) file_desc.close() except IOError, e: print e return False except: write_message("WARNING: Unable to download the ontology. '%s'" % sys.exc_info()[0], stream=sys.stderr, verbose=2) return False else: write_message("INFO: Done copying.", stream=sys.stderr, verbose=3) return True def _get_searchable_regex(basic=None, hidden=None): """Returns the searchable regular expressions for the single keyword.""" # Hidden labels are used to store regular expressions. basic = basic or [] hidden = hidden or [] hidden_regex_dict = {}