def is_pdf(document):
    """Checks if a document is a PDF file. Returns True if is is."""
    if not executable_exists:
        write_message(
            "WARNING: GNU file was not found on the system. "
            "Switching to a weak file extension test.",
            stream=sys.stderr,
            verbose=2)
        if document.lower().endswith(".pdf"):
            return True
        return False
    # Tested with file version >= 4.10. First test is secure and works
    # with file version 4.25. Second condition is tested for file
    # version 4.10.
    file_output = os.popen('file ' + re.escape(document)).read()
    try:
        filetype = file_output.split(":")[1]
    except IndexError:
        write_message(
            "WARNING: Your version of the 'file' utility seems to "
            "be unsupported. Please report this to [email protected].",
            stream=sys.stderr,
            verbose=2)
        sys.exit(1)

    pdf = filetype.find("PDF") > -1
    # This is how it should be done however this is incompatible with
    # file version 4.10.
    #os.popen('file -bi ' + document).read().find("application/pdf")
    return pdf
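# Illustrative usage sketch (not part of the original module); the path below
# is hypothetical:
#
#     if is_pdf("/tmp/article.pdf"):
#         print "'file' (or the extension fallback) says this is a PDF."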
def output_keywords_for_local_file(
        local_file,
        taxonomy,
        rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full",
        no_cache=False,
        with_author_keywords=False,
        spires=False,
        verbose=None):
    """Outputs the keywords for a local file."""
    if verbose is not None:
        set_verbose_level(verbose)

    write_message("INFO: Analyzing keywords for local file %s." % local_file,
                  stream=sys.stderr,
                  verbose=3)
    text_lines = text_lines_from_local_file(local_file)

    return get_keywords_from_text(text_lines,
                                  output_mode=output_mode,
                                  output_limit=output_limit,
                                  taxonomy=taxonomy,
                                  spires=spires,
                                  match_mode=match_mode,
                                  with_author_keywords=with_author_keywords,
                                  rebuild_cache=rebuild_cache,
                                  no_cache=no_cache)
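# Illustrative usage sketch; the file name and taxonomy below are hypothetical:
#
#     print output_keywords_for_local_file("article.pdf", "HEP.rdf",
#                                          output_limit=10,
#                                          with_author_keywords=True,
#                                          verbose=3)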
def text_lines_from_url(url, user_agent=""):
    """Returns the fulltext of the file found at the URL."""
    request = urllib2.Request(url)
    if user_agent:
        request.add_header("User-Agent", user_agent)
    try:
        distant_stream = urllib2.urlopen(request)
        # Write the URL content to a temporary file.
        tmpfd, local_file = tempfile.mkstemp(prefix="bibclassify.")
        os.close(tmpfd)
        local_stream = open(local_file, "w")
        local_stream.write(distant_stream.read())
        local_stream.close()
    except:
        write_message("ERROR: Unable to read from URL %s." % url,
                      stream=sys.stderr,
                      verbose=1)
        return None
    else:
        # Read lines from the temporary file.
        lines = text_lines_from_local_file(local_file, remote=True)
        os.remove(local_file)

        line_nb = len(lines)
        word_nb = 0
        for line in lines:
            word_nb += len(re.findall("\S+", line))

        write_message("INFO: Remote file has %d lines and %d words." %
                      (line_nb, word_nb),
                      stream=sys.stderr,
                      verbose=3)

        return lines
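# Illustrative usage sketch; the URL and user agent are hypothetical:
#
#     lines = text_lines_from_url("http://example.org/paper.pdf",
#                                 user_agent="BibClassify-example")
#     if lines is None:
#         print "Could not fetch the remote document."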
def get_keywords_from_text(text_lines, taxonomy=None, output_mode="text",
    output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
    match_mode="full", no_cache=False, with_author_keywords=False,
    rebuild_cache=False, only_core_tags=False):
    """Returns a formatted string containing the keywords for a single
    document."""
    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return _get_keywords_output(single_keywords, composite_keywords, taxonomy,
        author_keywords, output_mode, output_limit, spires, only_core_tags)
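# Illustrative usage sketch: text_lines normally comes from
# text_lines_from_local_file() or text_lines_from_url(); the taxonomy name is
# hypothetical:
#
#     formatted = get_keywords_from_text(text_lines, taxonomy="HEP.rdf",
#                                        output_mode="text", output_limit=20)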
def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
    match_mode="full", no_cache=False, with_author_keywords=False):

    text_lines = text_lines_from_local_file(local_file)

    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return (single_keywords, composite_keywords)
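# Illustrative usage sketch; unlike output_keywords_for_local_file() this
# returns raw data structures (file name and taxonomy are hypothetical):
#
#     single, composite = get_keywords_from_local_file("article.pdf", "HEP.rdf")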
def _download_remote_ontology(onto_url, time_difference=None):
    """Checks if the online ontology is more recent than the local ontology. If
    yes, try to download and store it in Invenio's cache directory. Return a
    boolean describing the success of the operation."""
    if onto_url is None:
        return False

    dl_dir = ((CFG_CACHEDIR or tempfile.gettempdir()) + os.sep +
        "bibclassify" + os.sep)
    if not os.path.exists(dl_dir):
        os.mkdir(dl_dir)

    local_file = dl_dir + os.path.basename(onto_url)
    remote_modif_time = _get_last_modification_date(onto_url)
    try:
        local_modif_seconds = os.path.getmtime(local_file)
    except OSError:
        # The local file does not exist. Download the ontology.
        download = True
        write_message("INFO: The local ontology could not be found.",
            stream=sys.stderr, verbose=3)
    else:
        local_modif_time = datetime(*time.gmtime(local_modif_seconds)[0:6])
        # Let's set a time delta of 1 hour and 10 minutes.
        time_difference = time_difference or timedelta(hours=1, minutes=10)
        download = remote_modif_time > local_modif_time + time_difference
        if download:
            write_message("INFO: The remote ontology '%s' is more recent "
                "than the local ontology." % onto_url, stream=sys.stderr,
                verbose=3)

    if download:
        return _download_ontology(onto_url, local_file)
    else:
        return False
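# Illustrative usage sketch (hypothetical URL): only re-download when the
# remote copy is at least one day newer than the local one.
#
#     refreshed = _download_remote_ontology("http://example.org/HEP.rdf",
#                                           time_difference=timedelta(days=1))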
def get_regular_expressions(ontology, rebuild=False, no_cache=False):
    """Returns a list of patterns compiled from the RDF/SKOS ontology.
    Uses cache if it exists and if the ontology hasn't changed."""

    # Translate the ontology name into a local path. Check if the name
    # relates to an existing ontology.
    ontology_names = _get_ontology_path(ontology)
    if ontology_names is not None:
        onto_long_name, onto_url = ontology_names
        onto_path = os.path.join(CFG_CACHEDIR, 'bibclassify', onto_long_name)
    else:
        write_message("ERROR: Unable to understand the ontology name "
            "provided: '%s'." % ontology, stream=sys.stderr, verbose=0)
        sys.exit(0)

    # If a new remote ontology can be found, then download it.
    new_ontology = _download_remote_ontology(onto_url)

    if new_ontology:
        # A new ontology has been downloaded. Rebuild the cache.
        return _build_cache(onto_path, no_cache=no_cache)

    if os.access(onto_path, os.R_OK):
        # Can read from the ontology.
        if rebuild or no_cache:
            write_message("INFO: Cache generation is manually forced.",
                stream=sys.stderr, verbose=3)
            return _build_cache(onto_path, no_cache=no_cache)

        if os.access(_get_cache_path(onto_path), os.R_OK):
            if (os.path.getmtime(_get_cache_path(onto_path)) >
                os.path.getmtime(onto_path)):
                # Cache is more recent than the ontology: use cache.
                return _get_cache(onto_path)
            else:
                # Ontology is more recent than the cache: rebuild cache.
                if not no_cache:
                    write_message("WARNING: The ontology '%s' has changed "
                        "since the last cache generation." % ontology,
                        stream=sys.stderr, verbose=2)
                return _build_cache(onto_path, no_cache=no_cache)
        else:
            # Cache does not exist. Build cache.
            return _build_cache(onto_path, no_cache=no_cache)
    else:
        if os.access(_get_cache_path(onto_path), os.R_OK):
            # ontology file not found. Use the cache instead.
            write_message("WARNING: The ontology couldn't be located. However "
                "a cached version of it is available. Using it as a "
                "reference.", stream=sys.stderr, verbose=2)
            return _get_cache(onto_path)
        else:
            # Cannot access the ontology nor the cache. Exit.
            write_message("ERROR: Neither the ontology file nor a cached "
                "version of it could be found.", stream=sys.stderr, verbose=0)
            sys.exit(0)
            return None
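# Illustrative usage sketch (hypothetical ontology name): the returned pair is
# what the module stores in the _SKWS/_CKWS globals.
#
#     skw_db, ckw_db = get_regular_expressions("HEP.rdf", rebuild=False)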
def _replace_greek_characters(line):
    """Replace greek characters in a string."""
    for greek_char, replacement in _GREEK_REPLACEMENTS.iteritems():
        try:
            line = line.replace(greek_char, replacement)
        except UnicodeDecodeError:
            write_message("WARNING: Unicode decoding error.",
                stream=sys.stderr, verbose=2)
            return ""

    return line
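# Illustrative sketch, assuming _GREEK_REPLACEMENTS maps e.g. u"\u03b3" to
# "gamma" (the exact mapping lives elsewhere in the module):
#
#     clean_line = _replace_greek_characters(u"\u03b3-ray burst")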
def get_author_keywords(skw_db, ckw_db, fulltext):
    """Finds out human defined keyowrds in a text string. Searches for
    the string "Keywords:" and its declinations and matches the
    following words."""
    timer_start = time.clock()

    split_string = CFG_BIBCLASSIFY_AUTHOR_KW_START.split(fulltext, 1)
    if len(split_string) == 1:
        write_message("INFO: Matching author keywords... No keywords found.",
        stream=sys.stderr, verbose=3)
        return None

    kw_string = split_string[1]

    for regex in CFG_BIBCLASSIFY_AUTHOR_KW_END:
        parts = regex.split(kw_string, 1)
        kw_string = parts[0]

    # We separate the keywords.
    author_keywords = CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION.split(kw_string)

    write_message("INFO: Matching author keywords... %d keywords found in "
        "%.1f sec." % (len(author_keywords), time.clock() - timer_start),
        stream=sys.stderr, verbose=3)

    out = {}
    for kw in author_keywords:
        # If the author keyword is an acronym with capital letters
        # separated by points, remove the points.
        if re.match(r'([A-Z]\.)+$', kw):
            kw = kw.replace('.', '')

        # First try with the keyword as such, then lower it.
        kw_with_spaces = ' %s ' % kw
        matching_skw = get_single_keywords(skw_db, kw_with_spaces,
            verbose=False)
        matching_ckw = get_composite_keywords(ckw_db, kw_with_spaces,
            matching_skw, verbose=False)

        if matching_skw or matching_ckw:
            out[kw] = (matching_skw, matching_ckw)
            continue

        lowkw = kw.lower()

        matching_skw = get_single_keywords(skw_db, ' %s ' % lowkw, verbose=False)
        matching_ckw = get_composite_keywords(ckw_db, ' %s ' % lowkw,
            matching_skw, verbose=False)

        out[kw] = (matching_skw, matching_ckw)

    return out
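# Illustrative sketch of the output shape: a dictionary mapping each raw
# author keyword to its (single keyword matches, composite keyword matches)
# pair.
#
#     author_kws = get_author_keywords(_SKWS, _CKWS, fulltext)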
class CompositeKeyword(object):
    """Represents one composite keyword read from the RDF/SKOS taxonomy."""

    def __init__(self, store, namespace, subject):
        small_subject = subject.split("#Composite.")[-1]

        try:
            self.concept = store.value(subject,
                                       namespace["prefLabel"],
                                       any=True)
        except KeyError:
            # Keyword has no prefLabel. We can discard that error.
            write_message("WARNING: Keyword with subject %s has no prefLabel" %
                          small_subject,
                          stream=sys.stderr,
                          verbose=2)

        component_positions = []
        for label in store.objects(subject, namespace["compositeOf"]):
            strlabel = str(label).split("#")[-1]
            component_name = label.split("#")[-1]
            component_positions.append(
                (small_subject.find(component_name), strlabel))

        self.compositeof = []
        component_positions.sort()
        try:
            for position in component_positions:
                self.compositeof.append(
                    single_keywords_by_subject[position[1]])
        except KeyError:
            # One single keyword is not present in the taxonomy. This
            # is due to an error in the taxonomy description.
            self.compositeof = []

        self.core = False
        for note in map(lambda s: str(s).lower().strip(),
                        store.objects(subject, namespace["note"])):
            if note == 'core':
                self.core = True

        self.spires = store.value(subject, namespace["spiresLabel"], any=True)
        if self.spires is not None:
            self.spires = self.spires

        self.regex = []
        for label in store.objects(subject, namespace["altLabel"]):
            pattern = _get_regex_pattern(label)
            self.regex.append(re.compile(CFG_BIBCLASSIFY_WORD_WRAP % pattern))

        self.fieldcodes = []
        for code in store.objects(subject, namespace["field"]):
            self.fieldcodes.append(str(code))
def cut_references(text_lines):
    """Returns the text lines with the references cut."""
    ref_sect_start = find_reference_section(text_lines)
    if ref_sect_start is not None:
        start = ref_sect_start["start_line"]
        end = find_end_of_reference_section(text_lines, start,
            ref_sect_start["marker"], ref_sect_start["marker_pattern"])
        del text_lines[start:end + 1]
    else:
        write_message("WARNING: No references could be found.",
            stream=sys.stderr, verbose=2)
        return text_lines

    return text_lines
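# Illustrative usage sketch (hypothetical path):
#
#     text_lines = text_lines_from_local_file("article.pdf")
#     text_lines = cut_references(text_lines)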
def text_lines_from_local_file(document, remote=False):
    """Returns the fulltext of the local file."""
    try:
        if is_pdf(document):
            if not executable_exists("pdftotext"):
                write_message("ERROR: pdftotext is not available on the "
                    "system.", stream=sys.stderr, verbose=1)
            cmd = "pdftotext -q -enc UTF-8 %s -" % re.escape(document)
            filestream = os.popen(cmd)
        else:
            filestream = open(document, "r")
    except IOError, ex1:
        write_message("ERROR: Unable to read from file %s. (%s)" % (document,
            ex1.strerror), stream=sys.stderr, verbose=1)
        return None

    lines = [line.decode("utf-8") for line in filestream]
    filestream.close()

    if not _is_english_text('\n'.join(lines)):
        write_message("WARNING: It seems the file '%s' is invalid and doesn't "
            "contain text. Please communicate this file to the Invenio "
            "team." % document, stream=sys.stderr, verbose=0)

    line_nb = len(lines)
    word_nb = 0
    for line in lines:
        word_nb += len(re.findall("\S+", line))

    # Discard lines that do not contain at least one word.
    lines = [line for line in lines if _ONE_WORD.search(line) is not None]

    if not remote:
        write_message("INFO: Local file has %d lines and %d words." % (line_nb,
            word_nb), stream=sys.stderr, verbose=3)

    return lines
def get_single_keywords(skw_db, fulltext, verbose=True):
    """Returns a dictionary of single keywords bound with the positions
    of the matches in the fulltext.
    Format of the output dictionary is (single keyword: positions)."""
    timer_start = time.clock()

    # Matched span -> single keyword
    records = []

    for single_keyword in skw_db:
        for regex in single_keyword.regex:
            for match in regex.finditer(fulltext):
                # Modify the right index to put it on the last letter
                # of the word.
                span = (match.span()[0], match.span()[1] - 1)

                # Remove the previous records contained by this span
                records = [
                    record for record in records
                    if not _contains_span(span, record[0])
                ]

                add = True
                for previous_record in records:
                    if ((span, single_keyword) == previous_record
                            or _contains_span(previous_record[0], span)):
                        # Match is contained by a previous match.
                        add = False
                        break

                if add:
                    records.append((span, single_keyword))

    # List of single_keywords: {spans: single keyword}
    single_keywords = {}
    for span, single_keyword in records:
        single_keywords.setdefault(single_keyword, []).append(span)

    if verbose:
        write_message("INFO: Matching single keywords... %d keywords found "
                      "in %.1f sec." %
                      (len(single_keywords), time.clock() - timer_start),
                      stream=sys.stderr,
                      verbose=3)

    return single_keywords
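# Illustrative sketch of the output shape:
#
#     skw_spans = get_single_keywords(_SKWS, fulltext)
#     # e.g. {<single keyword object>: [(start, end), ...], ...}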
def _get_cache_path(source_file):
    """Returns the file name of the cached taxonomy."""
    global _cache_location

    relative_dir = "bibclassify"
    cache_name = os.path.basename(source_file) + ".db"

    if _cache_location is not None:
        # The location of the cache has been previously found.
        return _cache_location
    else:
        # Find the most probable location of the cache. First consider
        # Invenio's temp directory then the system temp directory.
        if os.access(CFG_CACHEDIR, os.W_OK):
            tmp_dir = CFG_CACHEDIR
        else:
            tmp_dir = tempfile.gettempdir()

        absolute_dir = os.path.join(tmp_dir, relative_dir)
        # Test bibclassify's directory in the temp directory.
        if not os.path.exists(absolute_dir):
            try:
                os.mkdir(absolute_dir)
            except:
                write_message("WARNING: Impossible to write in the temp "
                              "directory %s." % tmp_dir,
                              stream=sys.stderr,
                              verbose=2)
                _cache_location = ""
                return _cache_location

        # At that time, the bibclassify's directory should exist. Test if it's
        # readable and writable.
        if os.access(absolute_dir, os.R_OK) and os.access(
                absolute_dir, os.W_OK):
            _cache_location = os.path.join(absolute_dir, cache_name)
            return _cache_location
        else:
            write_message(
                "WARNING: Cache directory does exist but is not "
                "accessible. Check your permissions.",
                stream=sys.stderr,
                verbose=2)
            _cache_location = ""
            return _cache_location
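# Illustrative sketch (the directory depends on CFG_CACHEDIR or the system
# temp directory; the ontology name is hypothetical):
#
#     cache_file = _get_cache_path("HEP.rdf")
#     # e.g. <CFG_CACHEDIR>/bibclassify/HEP.rdf.db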
def _get_cache(source_file):
    """Get the cached taxonomy using the cPickle module. No check is done at
    that stage."""
    timer_start = time.clock()

    cache_file = _get_cache_path(source_file)
    filestream = open(cache_file, "r")
    try:
        cached_data = cPickle.load(filestream)
    except (cPickle.UnpicklingError, AttributeError, DeprecationWarning):
        write_message("WARNING: The existing cache in %s is not readable. "
                      "Rebuilding it." % cache_file,
                      stream=sys.stderr,
                      verbose=3)
        filestream.close()
        os.remove(cache_file)
        return _build_cache(source_file)
    filestream.close()

    single_keywords = cached_data["single"]
    composite_keywords = cached_data["composite"]

    write_message("INFO: Found ontology cache created on %s." %
                  time.asctime(cached_data["creation_time"]),
                  stream=sys.stderr,
                  verbose=3)

    write_message("INFO: Retrieved cache... %d terms read in %.1f sec." %
                  (len(single_keywords) + len(composite_keywords),
                   time.clock() - timer_start),
                  stream=sys.stderr,
                  verbose=3)

    return (single_keywords, composite_keywords)
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
    output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
    match_mode="full", no_cache=False, with_author_keywords=False,
    spires=False, verbose=None, only_core_tags=False, extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
        no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry, stream=sys.stderr,
            verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if os.path.isfile(entry + filename):
                    text_lines = text_lines_from_local_file(entry + filename)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                output_mode=output_mode,
                output_limit=output_limit,
                spires=spires,
                match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                                                    for expansion in expansions])

                        acronyms_str.append("%s  %s" % (acronym, expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."

                print keywords + acronyms_str + "\n"
            else:
                print keywords
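# Illustrative usage sketch; the sources and taxonomy below are hypothetical:
#
#     output_keywords_for_sources(["article.pdf", "http://example.org/p.pdf"],
#                                 "HEP.rdf", output_limit=10,
#                                 extract_acronyms=True)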
def get_composite_keywords(ckw_db, fulltext, skw_spans, verbose=True):
    """Returns a list of composite keywords bound with the number of
    occurrences found in the text string.
    Format of the output list is (composite keyword, count, component counts)."""
    timer_start = time.clock()

    # Build the list of composite candidates
    ckw_list = []
    skw_as_components = []

    for composite_keyword in ckw_db:
        # Counters for the composite keyword. First count is for the
        # number of occurrences in the whole document and second count
        # is for the human defined keywords.
        ckw_count = 0
        matched_spans = []

        # Check the alternative labels.
        for regex in composite_keyword.regex:
            for match in regex.finditer(fulltext):
                span = list(match.span())
                span[1] -= 1
                span = tuple(span)
                if not span in matched_spans:
                    ckw_count += 1
                    matched_spans.append(span)

        # Get the single keywords locations.
        try:
            components = composite_keyword.compositeof
        except AttributeError:
            print >> sys.stderr, ("Cached ontology is corrupted. Please "
                "remove the cached ontology in your temporary file.")
            sys.exit(1)
        try:
            spans = [skw_spans[component] for component in components]
        except KeyError:
            # The keyword components are not to be found in the text.
            # This is not a dramatic exception and we can safely ignore
            # it.
            pass
        else:
            ckw_spans = []
            for index in range(len(spans) - 1):
                if ckw_spans:
                    previous_spans = ckw_spans
                else:
                    previous_spans = spans[index]

                ckw_spans = []
                for new_span in [(span0, span1) for span0 in previous_spans
                                                for span1 in spans[index + 1]]:
                    span = _get_ckw_span(fulltext, new_span)
                    if span is not None:
                        ckw_spans.append(span)

            for span in [span for span in ckw_spans
                              if not span in matched_spans]:
                ckw_count += 1
                matched_spans.append(span)

        if ckw_count:
            # Gather the component counts.
            component_counts = []
            for component in components:
                skw_as_components.append(component)
                # Get the single keyword count.
                try:
                    component_counts.append(len(skw_spans[component]))
                except KeyError:
                    component_counts.append(0)

            # Store the composite keyword
            ckw_list.append((composite_keyword, ckw_count, component_counts))

    # Remove the single keywords that appear as components from the list
    # of single keywords.
    for skw in skw_as_components:
        try:
            del skw_spans[skw]
        except KeyError:
            pass

    if verbose:
        write_message("INFO: Matching composite keywords... %d keywords found "
            "in %.1f sec." % (len(ckw_list), time.clock() - timer_start),
            stream=sys.stderr, verbose=3)

    return ckw_list
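# Illustrative sketch of the output shape:
#
#     ckw_list = get_composite_keywords(_CKWS, fulltext, skw_spans)
#     # e.g. [(<composite keyword object>, count, [component counts]), ...]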

def _download_ontology(url, local_file):
    """Downloads the ontology and stores it in CFG_CACHEDIR."""
    write_message("INFO: Copying remote ontology '%s' to file '%s'." % (url,
        local_file), stream=sys.stderr, verbose=3)
    try:
        url_desc = urllib2.urlopen(url)
        file_desc = open(local_file, 'w')
        file_desc.write(url_desc.read())
        file_desc.close()
    except IOError, e:
        print e
        return False
    except:
        write_message("WARNING: Unable to download the ontology. '%s'" %
            sys.exc_info()[0], stream=sys.stderr, verbose=2)
        return False
    else:
        write_message("INFO: Done copying.", stream=sys.stderr, verbose=3)
        return True

def _get_searchable_regex(basic=None, hidden=None):
    """Returns the searchable regular expressions for the single
    keyword."""
    # Hidden labels are used to store regular expressions.
    basic = basic or []
    hidden = hidden or []

    hidden_regex_dict = {}
    for hidden_label in hidden:
        if _is_regex(hidden_label):
def check_taxonomy(taxonomy):
    """Checks the consistency of the taxonomy and outputs a list of
    errors and warnings."""
    write_message("INFO: Building graph with Python RDFLib version %s" %
                  rdflib.__version__,
                  stream=sys.stdout,
                  verbose=0)

    if rdflib.__version__ >= '2.3.2':
        store = rdflib.ConjunctiveGraph()
    else:
        store = rdflib.Graph()

    try:
        store.parse(taxonomy)
    except:
        write_message(
            "ERROR: The taxonomy is not a valid RDF file. Are you "
            "trying to check a controlled vocabulary?",
            stream=sys.stdout,
            verbose=0)
        sys.exit(0)

    write_message("INFO: Graph was successfully built.",
                  stream=sys.stdout,
                  verbose=0)

    prefLabel = "prefLabel"
    hiddenLabel = "hiddenLabel"
    altLabel = "altLabel"
    composite = "composite"
    compositeOf = "compositeOf"
    note = "note"

    both_skw_and_ckw = []

    # Build a dictionary we will reason on later.
    uniq_subjects = {}
    for subject in store.subjects():
        uniq_subjects[subject] = None

    subjects = {}
    for subject in uniq_subjects:
        strsubject = str(subject).split("#Composite.")[-1]
        strsubject = strsubject.split("#")[-1]
        if (strsubject == "http://cern.ch/thesauri/HEPontology.rdf"
                or strsubject == "compositeOf"):
            continue
        components = {}
        for predicate, value in store.predicate_objects(subject):
            strpredicate = str(predicate).split("#")[-1]
            strobject = str(value).split("#Composite.")[-1]
            strobject = strobject.split("#")[-1]
            components.setdefault(strpredicate, []).append(strobject)
        if strsubject in subjects:
            both_skw_and_ckw.append(strsubject)
        else:
            subjects[strsubject] = components

    write_message("INFO: Taxonomy contains %s concepts." % len(subjects),
                  stream=sys.stdout,
                  verbose=0)

    no_prefLabel = []
    multiple_prefLabels = []
    bad_notes = []
    # Subjects with no composite or compositeOf predicate
    lonely = []
    both_composites = []
    bad_hidden_labels = {}
    bad_alt_labels = {}
    # Problems with composite keywords
    composite_problem1 = []
    composite_problem2 = []
    composite_problem3 = []
    composite_problem4 = {}
    composite_problem5 = []
    composite_problem6 = []

    stemming_collisions = []
    interconcept_collisions = {}

    for subject, predicates in subjects.iteritems():
        # No prefLabel or multiple prefLabels
        try:
            if len(predicates[prefLabel]) > 1:
                multiple_prefLabels.append(subject)
        except KeyError:
            no_prefLabel.append(subject)

        # Lonely and both composites.
        if not composite in predicates and not compositeOf in predicates:
            lonely.append(subject)
        elif composite in predicates and compositeOf in predicates:
            both_composites.append(subject)

        # Multiple or bad notes
        if note in predicates:
            bad_notes += [(subject, n) for n in predicates[note]
                          if n not in ('nostandalone', 'core')]

        # Bad hidden labels
        if hiddenLabel in predicates:
            for lbl in predicates[hiddenLabel]:
                if lbl.startswith("/") ^ lbl.endswith("/"):
                    bad_hidden_labels.setdefault(subject, []).append(lbl)

        # Bad alt labels
        if altLabel in predicates:
            for lbl in predicates[altLabel]:
                if len(re.findall("/", lbl)) >= 2 or ":" in lbl:
                    bad_alt_labels.setdefault(subject, []).append(lbl)

        # Check composite
        if composite in predicates:
            for ckw in predicates[composite]:
                if ckw in subjects:
                    if compositeOf in subjects[ckw]:
                        if not subject in subjects[ckw][compositeOf]:
                            composite_problem3.append((subject, ckw))
                    else:
                        if not ckw in both_skw_and_ckw:
                            composite_problem2.append((subject, ckw))
                else:
                    composite_problem1.append((subject, ckw))

        # Check compositeOf
        if compositeOf in predicates:
            for skw in predicates[compositeOf]:
                if skw in subjects:
                    if composite in subjects[skw]:
                        if not subject in subjects[skw][composite]:
                            composite_problem6.append((subject, skw))
                    else:
                        if not skw in both_skw_and_ckw:
                            composite_problem5.append((subject, skw))
                else:
                    composite_problem4.setdefault(skw, []).append(subject)

        # Check for stemmed labels
        if compositeOf in predicates:
            labels = (altLabel, hiddenLabel)
        else:
            labels = (prefLabel, altLabel, hiddenLabel)

        patterns = {}
        for label in [lbl for lbl in labels if lbl in predicates]:
            for expression in [
                    expr for expr in predicates[label] if not _is_regex(expr)
            ]:
                pattern = _get_regex_pattern(expression)
                interconcept_collisions.setdefault(pattern, []).append(
                    (subject, label))
                if pattern in patterns:
                    stemming_collisions.append(
                        (subject, patterns[pattern], (label, expression)))
                else:
                    patterns[pattern] = (label, expression)

    print "\n==== ERRORS ===="

    if no_prefLabel:
        print "\nConcepts with no prefLabel: %d" % len(no_prefLabel)
        print "\n".join(["   %s" % subj for subj in no_prefLabel])
    if multiple_prefLabels:
        print("\nConcepts with multiple prefLabels: %d" %
              len(multiple_prefLabels))
        print "\n".join(["   %s" % subj for subj in multiple_prefLabels])
    if both_composites:
        print("\nConcepts with both composite properties: %d" %
              len(both_composites))
        print "\n".join(["   %s" % subj for subj in both_composites])
    if bad_hidden_labels:
        print "\nConcepts with bad hidden labels: %d" % len(bad_hidden_labels)
        for kw, lbls in bad_hidden_labels.iteritems():
            print "   %s:" % kw
            print "\n".join(["      '%s'" % lbl for lbl in lbls])
    if bad_alt_labels:
        print "\nConcepts with bad alt labels: %d" % len(bad_alt_labels)
        for kw, lbls in bad_alt_labels.iteritems():
            print "   %s:" % kw
            print "\n".join(["      '%s'" % lbl for lbl in lbls])
    if both_skw_and_ckw:
        print("\nKeywords that are both skw and ckw: %d" %
              len(both_skw_and_ckw))
        print "\n".join(["   %s" % subj for subj in both_skw_and_ckw])

    print

    if composite_problem1:
        print "\n".join([
            "SKW '%s' references an unexisting CKW '%s'." % (skw, ckw)
            for skw, ckw in composite_problem1
        ])
    if composite_problem2:
        print "\n".join([
            "SKW '%s' references a SKW '%s'." % (skw, ckw)
            for skw, ckw in composite_problem2
        ])
    if composite_problem3:
        print "\n".join([
            "SKW '%s' is not composite of CKW '%s'." % (skw, ckw)
            for skw, ckw in composite_problem3
        ])
    if composite_problem4:
        for skw, ckws in composite_problem4.iteritems():
            print "SKW '%s' does not exist but is " "referenced by:" % skw
            print "\n".join(["    %s" % ckw for ckw in ckws])
    if composite_problem5:
        print "\n".join([
            "CKW '%s' references a CKW '%s'." % kw for kw in composite_problem5
        ])
    if composite_problem6:
        print "\n".join([
            "CKW '%s' is not composed by SKW '%s'." % kw
            for kw in composite_problem6
        ])

    print "\n==== WARNINGS ===="

    if bad_notes:
        print("\nConcepts with bad notes: %d" % len(bad_notes))
        print "\n".join(["   '%s': '%s'" % note for note in bad_notes])
    if stemming_collisions:
        print(
            "\nFollowing keywords have unnecessary labels that have "
            "already been generated by BibClassify.")
        for subj in stemming_collisions:
            print "   %s:\n     %s\n     and %s" % subj

    print "\nFinished."
    sys.exit(0)
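# Illustrative usage sketch (hypothetical taxonomy file): check_taxonomy()
# prints its report and exits, so it is meant as the last call of a
# consistency-check run.
#
#     check_taxonomy("HEP.rdf")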
def _ckw_matches_comparator(ckw0_match, ckw1_match):
    """Compares 2 composite keyword matches (composite keyword, count,
    component counts). First compares the number of occurrences, then the
    average component count, and finally the length of the concept.
    """
    count_comparison = cmp(ckw1_match[1], ckw0_match[1])
    if count_comparison:
        return count_comparison
    component_avg0 = sum(ckw0_match[2]) / len(ckw0_match[2])
    component_avg1 = sum(ckw1_match[2]) / len(ckw1_match[2])
    component_comparison = cmp(component_avg1, component_avg0)
    if component_comparison:
        return component_comparison
    else:
        return cmp(len(ckw1_match[0].concept), len(ckw0_match[0].concept))

def _get_sorted_skw_matches(skw_matches, limit=20):
    """Returns a resized version of data structures of keywords to the
    given length."""
    sorted_keywords = list(skw_matches.items())
    sorted_keywords.sort(_skw_matches_comparator)
    return limit and sorted_keywords[:limit] or sorted_keywords

def _resize_ckw_matches(keywords, limit=20):
    """Returns a resized version of the composite_keywords list."""
    keywords.sort(_ckw_matches_comparator)
    return limit and keywords[:limit] or keywords

if __name__ == "__main__":
    write_message("ERROR: Please use bibclassify_cli from now on.",
        stream=sys.stderr, verbose=0)
def _build_cache(source_file, no_cache=False):
    """Builds the cached data by parsing the RDF taxonomy file or a
    vocabulary file."""
    if rdflib.__version__ >= '2.3.2':
        store = rdflib.ConjunctiveGraph()
    else:
        store = rdflib.Graph()

    timer_start = time.clock()

    global single_keywords_by_subject
    global composite_keywords_by_subject
    single_keywords, composite_keywords = [], []

    try:
        write_message("INFO: Building RDFLib's conjunctive graph.",
            stream=sys.stderr, verbose=3)
        store.parse(source_file)
    except:
        # File is not a RDF file. We assume it is a controlled vocabulary.
        write_message("INFO: The ontology file is not a valid RDF file. "
            "Assuming it is a controlled vocabulary file.", stream=sys.stderr,
            verbose=3)
        filestream = open(source_file, "r")
        for line in filestream:
            keyword = line.strip()
            single_keywords.append(SingleKeyword(keyword))
    else:
        write_message("INFO: Building cache from RDF file %s." % source_file,
            stream=sys.stderr, verbose=3)
        # File is a RDF file.
        namespace = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")

        single_count = 0
        composite_count = 0

        for subject_object in store.subject_objects(namespace["prefLabel"]):
            # Keep only the single keywords.
            # FIXME: Remove or alter that condition in order to allow using
            # other ontologies that do not have this composite notion (such
            # as NASA-subjects.rdf)
            if not store.value(subject_object[0], namespace["compositeOf"],
                any=True):
                skw = SingleKeyword(subject_object[0], store=store,
                    namespace=namespace)
                single_keywords.append(skw)
                subject = str(subject_object[0]).split("#")[-1]
                single_keywords_by_subject[subject] = skw
                single_count += 1

        # Let's go through the composite keywords.
        for subject, pref_label in \
            store.subject_objects(namespace["prefLabel"]):
            # Keep only the composite keywords.
            if store.value(subject, namespace["compositeOf"], any=True):
                strsubject = str(subject).split("#")[-1]
                composite_keywords.append(CompositeKeyword(store,
                    namespace, subject))
                composite_count += 1

        store.close()

    cached_data = {}
    cached_data["single"] = single_keywords
    cached_data["composite"] = composite_keywords
    cached_data["creation_time"] = time.gmtime()

    write_message("INFO: Building taxonomy... %d terms built in %.1f sec." %
        (len(single_keywords) + len(composite_keywords),
        time.clock() - timer_start), stream=sys.stderr, verbose=3)

    if not no_cache:
        # Serialize.
        try:
            filestream = open(_get_cache_path(source_file), "w")
        except IOError:
            # Impossible to write the cache.
            write_message("ERROR: Impossible to write cache to %s." %
                _get_cache_path(source_file), stream=sys.stderr, verbose=1)
            return (single_keywords, composite_keywords)
        else:
            write_message("INFO: Writing cache to file %s." %
                _get_cache_path(source_file), stream=sys.stderr, verbose=3)
            cPickle.dump(cached_data, filestream, 1)
            filestream.close()

    return (single_keywords, composite_keywords)
def _ckw_matches_comparator(ckw0_match, ckw1_match):
    """Compares 2 composite keyword matches (composite_keyword, count,
    component_counts). Compares first the number of occurrences, then
    the average component count and finally the length of the keyword's
    concept, so that the best matches sort first.
    """
    count_comparison = cmp(ckw1_match[1], ckw0_match[1])
    if count_comparison:
        return count_comparison
    component_avg0 = sum(ckw0_match[2]) / len(ckw0_match[2])
    component_avg1 = sum(ckw1_match[2]) / len(ckw1_match[2])
    component_comparison =  cmp(component_avg1, component_avg0)
    if component_comparison:
        return component_comparison
    else:
        return cmp(len(ckw1_match[0].concept), len(ckw0_match[0].concept))

def _get_sorted_skw_matches(skw_matches, limit=20):
    """Returns a resized version of data structures of keywords to the
    given length."""
    sorted_keywords = list(skw_matches.items())
    sorted_keywords.sort(_skw_matches_comparator)
    return limit and sorted_keywords[:limit] or sorted_keywords

def _resize_ckw_matches(keywords, limit=20):
    """Returns a resized version of the composite_keywords list."""
    keywords.sort(_ckw_matches_comparator)
    return limit and keywords[:limit] or keywords
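# Usage sketch (hypothetical variables): the two helpers above take the match
# structures produced by the keyword extraction and return at most `limit`
# entries, best matches first.
#
#   top_skws = _get_sorted_skw_matches(skw_matches, limit=10)
#   top_ckws = _resize_ckw_matches(ckw_matches, limit=10)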

if __name__ == "__main__":
    write_message("ERROR: Please use bibclassify_cli from now on.",
        stream=sys.stderr, verbose=0)
                    "system.", stream=sys.stderr, verbose=1)
            cmd = "pdftotext -q -enc UTF-8 %s -" % re.escape(document)
            filestream = os.popen(cmd)
        else:
            filestream = open(document, "r")
    except IOError, ex1:
        write_message("ERROR: Unable to read from file %s. (%s)" % (document,
            ex1.strerror), stream=sys.stderr, verbose=1)
        return None

    lines = [line.decode("utf-8") for line in filestream]
    filestream.close()

    if not _is_english_text('\n'.join(lines)):
        write_message("WARNING: It seems the file '%s' is unvalid and doesn't "
            "contain text. Please communicate this file to the Invenio "
            "team." % document, stream=sys.stderr, verbose=0)

    line_nb = len(lines)
    word_nb = 0
    for line in lines:
        word_nb += len(re.findall("\S+", line))

    # Discard lines that do not contain at least one word.
    lines = [line for line in lines if _ONE_WORD.search(line) is not None]

    if not remote:
        write_message("INFO: Local file has %d lines and %d words." % (line_nb,
            word_nb), stream=sys.stderr, verbose=3)

    return lines
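# Usage sketch (hypothetical path): read and clean the text of a local
# document; PDF files are converted with pdftotext first.
#
#   lines = text_lines_from_local_file("/path/to/paper.pdf")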
def _build_cache(source_file, no_cache=False):
    """Builds the cached data by parsing the RDF taxonomy file or a
    vocabulary file."""
    if rdflib.__version__ >= '2.3.2':
        store = rdflib.ConjunctiveGraph()
    else:
        store = rdflib.Graph()

    timer_start = time.clock()

    global single_keywords_by_subject
    global composite_keywords_by_subject
    single_keywords, composite_keywords = [], []

    try:
        write_message("INFO: Building RDFLib's conjunctive graph.",
                      stream=sys.stderr,
                      verbose=3)
        store.parse(source_file)
    except:
        # The file is not an RDF file. We assume it is a controlled vocabulary.
        write_message(
            "INFO: The ontology file is not a valid RDF file. "
            "Assuming it is a controlled vocabulary file.",
            stream=sys.stderr,
            verbose=3)
        filestream = open(source_file, "r")
        for line in filestream:
            keyword = line.strip()
            single_keywords.append(SingleKeyword(keyword))
    else:
        write_message("INFO: Building cache from RDF file %s." % source_file,
                      stream=sys.stderr,
                      verbose=3)
        # The file is an RDF file.
        namespace = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")

        single_count = 0
        composite_count = 0

        for subject_object in store.subject_objects(namespace["prefLabel"]):
            # Keep only the single keywords.
            # FIXME: Remove or alter that condition in order to allow using
            # other ontologies that do not have this composite notion (such
            # as NASA-subjects.rdf)
            if not store.value(
                    subject_object[0], namespace["compositeOf"], any=True):
                skw = SingleKeyword(subject_object[0],
                                    store=store,
                                    namespace=namespace)
                single_keywords.append(skw)
                subject = str(subject_object[0]).split("#")[-1]
                single_keywords_by_subject[subject] = skw
                single_count += 1

        # Let's go through the composite keywords.
        for subject, pref_label in \
            store.subject_objects(namespace["prefLabel"]):
            # Keep only the composite keywords.
            if store.value(subject, namespace["compositeOf"], any=True):
                strsubject = str(subject).split("#")[-1]
                composite_keywords.append(
                    CompositeKeyword(store, namespace, subject))
                composite_count += 1

        store.close()

    cached_data = {}
    cached_data["single"] = single_keywords
    cached_data["composite"] = composite_keywords
    cached_data["creation_time"] = time.gmtime()

    write_message("INFO: Building taxonomy... %d terms built in %.1f sec." %
                  (len(single_keywords) + len(composite_keywords),
                   time.clock() - timer_start),
                  stream=sys.stderr,
                  verbose=3)

    if not no_cache:
        # Serialize.
        try:
            filestream = open(_get_cache_path(source_file), "wb")
        except IOError:
            # Impossible to write the cache.
            write_message("ERROR: Impossible to write cache to %s." %
                          _get_cache_path(source_file),
                          stream=sys.stderr,
                          verbose=1)
            return (single_keywords, composite_keywords)
        else:
            write_message("INFO: Writing cache to file %s." %
                          _get_cache_path(source_file),
                          stream=sys.stderr,
                          verbose=3)
            cPickle.dump(cached_data, filestream, 1)
            filestream.close()

    return (single_keywords, composite_keywords)
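# Usage sketch (hypothetical path): parse a SKOS taxonomy (or a plain
# controlled vocabulary) and, unless no_cache is set, serialize the result
# to the path returned by _get_cache_path().
#
#   single_kws, composite_kws = _build_cache("/path/to/HEP.rdf")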
def check_taxonomy(taxonomy):
    """Checks the consistency of the taxonomy and outputs a list of
    errors and warnings."""
    write_message("INFO: Building graph with Python RDFLib version %s" %
        rdflib.__version__, stream=sys.stdout, verbose=0)

    if rdflib.__version__ >= '2.3.2':
        store = rdflib.ConjunctiveGraph()
    else:
        store = rdflib.Graph()

    try:
        store.parse(taxonomy)
    except:
        write_message("ERROR: The taxonomy is not a valid RDF file. Are you "
            "trying to check a controlled vocabulary?", stream=sys.stdout,
            verbose=0)
        sys.exit(0)

    write_message("INFO: Graph was successfully built.", stream=sys.stdout,
        verbose=0)

    prefLabel = "prefLabel"
    hiddenLabel = "hiddenLabel"
    altLabel = "altLabel"
    composite = "composite"
    compositeOf = "compositeOf"
    note = "note"

    both_skw_and_ckw = []

    # Build a dictionary we will reason on later.
    uniq_subjects = {}
    for subject in store.subjects():
        uniq_subjects[subject] = None

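    # Map each concept name (the part after the "#") to a dictionary of its
    # predicates and their values; the consistency checks below reason on
    # this structure.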
    subjects = {}
    for subject in uniq_subjects:
        strsubject = str(subject).split("#Composite.")[-1]
        strsubject = strsubject.split("#")[-1]
        if (strsubject == "http://cern.ch/thesauri/HEPontology.rdf" or
            strsubject == "compositeOf"):
            continue
        components = {}
        for predicate, value in store.predicate_objects(subject):
            strpredicate = str(predicate).split("#")[-1]
            strobject = str(value).split("#Composite.")[-1]
            strobject = strobject.split("#")[-1]
            components.setdefault(strpredicate, []).append(strobject)
        if strsubject in subjects:
            both_skw_and_ckw.append(strsubject)
        else:
            subjects[strsubject] = components

    write_message("INFO: Taxonomy contains %s concepts." % len(subjects),
        stream=sys.stdout, verbose=0)

    no_prefLabel = []
    multiple_prefLabels = []
    bad_notes = []
    # Subjects with no composite or compositeOf predicate
    lonely = []
    both_composites = []
    bad_hidden_labels = {}
    bad_alt_labels = {}
    # Problems with composite keywords
    composite_problem1 = []
    composite_problem2 = []
    composite_problem3 = []
    composite_problem4 = {}
    composite_problem5 = []
    composite_problem6 = []

    stemming_collisions = []
    interconcept_collisions = {}

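    # Walk through every concept and record the problems found in the lists
    # and dictionaries initialized above.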
    for subject, predicates in subjects.iteritems():
        # No prefLabel or multiple prefLabels
        try:
            if len(predicates[prefLabel]) > 1:
                multiple_prefLabels.append(subject)
        except KeyError:
            no_prefLabel.append(subject)

        # Lonely and both composites.
        if not composite in predicates and not compositeOf in predicates:
            lonely.append(subject)
        elif composite in predicates and compositeOf in predicates:
            both_composites.append(subject)

        # Multiple or bad notes
        if note in predicates:
            bad_notes += [(subject, n) for n in predicates[note]
                          if n not in ('nostandalone', 'core')]

        # Bad hidden labels
        if hiddenLabel in predicates:
            for lbl in predicates[hiddenLabel]:
                if lbl.startswith("/") ^ lbl.endswith("/"):
                    bad_hidden_labels.setdefault(subject, []).append(lbl)

        # Bad alt labels
        if altLabel in predicates:
            for lbl in predicates[altLabel]:
                if len(re.findall("/", lbl)) >= 2 or ":" in lbl:
                    bad_alt_labels.setdefault(subject, []).append(lbl)

        # Check composite
        if composite in predicates:
            for ckw in predicates[composite]:
                if ckw in subjects:
                    if compositeOf in subjects[ckw]:
                        if not subject in subjects[ckw][compositeOf]:
                            composite_problem3.append((subject, ckw))
                    else:
                        if not ckw in both_skw_and_ckw:
                            composite_problem2.append((subject, ckw))
                else:
                    composite_problem1.append((subject, ckw))

        # Check compositeOf
        if compositeOf in predicates:
            for skw in predicates[compositeOf]:
                if skw in subjects:
                    if composite in subjects[skw]:
                        if not subject in subjects[skw][composite]:
                            composite_problem6.append((subject, skw))
                    else:
                        if not skw in both_skw_and_ckw:
                            composite_problem5.append((subject, skw))
                else:
                    composite_problem4.setdefault(skw, []).append(subject)

        # Check for stemmed labels
        if compositeOf in predicates:
            labels = (altLabel, hiddenLabel)
        else:
            labels = (prefLabel, altLabel, hiddenLabel)

        patterns = {}
        for label in [lbl for lbl in labels if lbl in predicates]:
            for expression in [expr for expr in predicates[label]
                                    if not _is_regex(expr)]:
                pattern = _get_regex_pattern(expression)
                interconcept_collisions.setdefault(pattern,
                    []).append((subject, label))
                if pattern in patterns:
                    stemming_collisions.append((subject,
                        patterns[pattern],
                        (label, expression)
                        ))
                else:
                    patterns[pattern] = (label, expression)

    print "\n==== ERRORS ===="

    if no_prefLabel:
        print "\nConcepts with no prefLabel: %d" % len(no_prefLabel)
        print "\n".join(["   %s" % subj for subj in no_prefLabel])
    if multiple_prefLabels:
        print ("\nConcepts with multiple prefLabels: %d" %
            len(multiple_prefLabels))
        print "\n".join(["   %s" % subj for subj in multiple_prefLabels])
    if both_composites:
        print ("\nConcepts with both composite properties: %d" %
            len(both_composites))
        print "\n".join(["   %s" % subj for subj in both_composites])
    if bad_hidden_labels:
        print "\nConcepts with bad hidden labels: %d" % len(bad_hidden_labels)
        for kw, lbls in bad_hidden_labels.iteritems():
            print "   %s:" % kw
            print "\n".join(["      '%s'" % lbl for lbl in lbls])
    if bad_alt_labels:
        print "\nConcepts with bad alt labels: %d" % len(bad_alt_labels)
        for kw, lbls in bad_alt_labels.iteritems():
            print "   %s:" % kw
            print "\n".join(["      '%s'" % lbl for lbl in lbls])
    if both_skw_and_ckw:
        print ("\nKeywords that are both skw and ckw: %d" %
            len(both_skw_and_ckw))
        print "\n".join(["   %s" % subj for subj in both_skw_and_ckw])

    print

    if composite_problem1:
        print "\n".join(["SKW '%s' references an unexisting CKW '%s'." %
            (skw, ckw) for skw, ckw in composite_problem1])
    if composite_problem2:
        print "\n".join(["SKW '%s' references a SKW '%s'." %
            (skw, ckw) for skw, ckw in composite_problem2])
    if composite_problem3:
        print "\n".join(["SKW '%s' is not composite of CKW '%s'." %
            (skw, ckw) for skw, ckw in composite_problem3])
    if composite_problem4:
        for skw, ckws in composite_problem4.iteritems():
            print "SKW '%s' does not exist but is " "referenced by:" % skw
            print "\n".join(["    %s" % ckw for ckw in ckws])
    if composite_problem5:
        print "\n".join(["CKW '%s' references a CKW '%s'." % kw
            for kw in composite_problem5])
    if composite_problem6:
        print "\n".join(["CKW '%s' is not composed by SKW '%s'." % kw
            for kw in composite_problem6])

    print "\n==== WARNINGS ===="

    if bad_notes:
        print ("\nConcepts with bad notes: %d" % len(bad_notes))
        print "\n".join(["   '%s': '%s'" % note for note in bad_notes])
    if stemming_collisions:
        print ("\nFollowing keywords have unnecessary labels that have "
            "already been generated by BibClassify.")
        for subj in stemming_collisions:
            print "   %s:\n     %s\n     and %s" % subj

    print "\nFinished."
    sys.exit(0)
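# Usage sketch (hypothetical path): print the consistency report of a
# taxonomy and exit.
#
#   check_taxonomy("/path/to/HEP.rdf")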
def get_author_keywords(skw_db, ckw_db, fulltext):
    """Finds out human defined keyowrds in a text string. Searches for
    the string "Keywords:" and its declinations and matches the
    following words."""
    timer_start = time.clock()

    split_string = CFG_BIBCLASSIFY_AUTHOR_KW_START.split(fulltext, 1)
    if len(split_string) == 1:
        write_message("INFO: Matching author keywords... No keywords found.",
                      stream=sys.stderr,
                      verbose=3)
        return None

    kw_string = split_string[1]

    for regex in CFG_BIBCLASSIFY_AUTHOR_KW_END:
        parts = regex.split(kw_string, 1)
        kw_string = parts[0]

    # We separate the keywords.
    author_keywords = CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION.split(kw_string)

    write_message("INFO: Matching author keywords... %d keywords found in "
                  "%.1f sec." %
                  (len(author_keywords), time.clock() - timer_start),
                  stream=sys.stderr,
                  verbose=3)

    out = {}
    for kw in author_keywords:
        # If the author keyword is an acronym with capital letters
        # separated by points, remove the points.
        if re.match(r'([A-Z]\.)+$', kw):
            kw = kw.replace('.', '')

        # First try with the keyword as such, then lower it.
        kw_with_spaces = ' %s ' % kw
        matching_skw = get_single_keywords(skw_db,
                                           kw_with_spaces,
                                           verbose=False)
        matching_ckw = get_composite_keywords(ckw_db,
                                              kw_with_spaces,
                                              matching_skw,
                                              verbose=False)

        if matching_skw or matching_ckw:
            out[kw] = (matching_skw, matching_ckw)
            continue

        lowkw = kw.lower()

        matching_skw = get_single_keywords(skw_db,
                                           ' %s ' % lowkw,
                                           verbose=False)
        matching_ckw = get_composite_keywords(ckw_db,
                                              ' %s ' % lowkw,
                                              matching_skw,
                                              verbose=False)

        out[kw] = (matching_skw, matching_ckw)

    return out
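# Usage sketch (hypothetical variables): skw_db and ckw_db are the cached
# single and composite keyword databases, fulltext is the document text.
# The result maps each author keyword to its (single, composite) matches.
#
#   author_kws = get_author_keywords(skw_db, ckw_db, fulltext)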
def get_composite_keywords(ckw_db, fulltext, skw_spans, verbose=True):
    """Returns a list of composite keywords bound with the number of
    occurrences found in the text string.
    Format of the output list is (composite keyword, count, component counts)."""
    timer_start = time.clock()

    # Build the list of composite candidates
    ckw_list = []
    skw_as_components = []

    for composite_keyword in ckw_db:
        # Number of occurrences of this composite keyword in the document,
        # together with the spans already counted so that the same match is
        # not counted twice.
        ckw_count = 0
        matched_spans = []

        # Check the alternative labels.
        for regex in composite_keyword.regex:
            for match in regex.finditer(fulltext):
                span = list(match.span())
                span[1] -= 1
                span = tuple(span)
                if not span in matched_spans:
                    ckw_count += 1
                    matched_spans.append(span)

        # Get the single keywords locations.
        try:
            components = composite_keyword.compositeof
        except AttributeError:
            print >> sys.stderr, (
                "ERROR: The cached ontology is corrupted. Please remove the "
                "cached ontology from your temporary directory.")
            sys.exit(1)
        try:
            spans = [skw_spans[component] for component in components]
        except KeyError:
            # The keyword components are not to be found in the text.
            # This is not a dramatic exception and we can safely ignore
            # it.
            pass
        else:
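            # Combine the component spans pairwise, keeping only the
            # combinations that _get_ckw_span() considers close enough in
            # the text to form an occurrence of the composite keyword.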
            ckw_spans = []
            for index in range(len(spans) - 1):
                if ckw_spans:
                    previous_spans = ckw_spans
                else:
                    previous_spans = spans[index]

                ckw_spans = []
                for new_span in [(span0, span1) for span0 in previous_spans
                                 for span1 in spans[index + 1]]:
                    span = _get_ckw_span(fulltext, new_span)
                    if span is not None:
                        ckw_spans.append(span)

            for span in [span for span in ckw_spans
                         if not span in matched_spans]:
                ckw_count += 1
                matched_spans.append(span)

        if ckw_count:
            # Gather the component counts.
            component_counts = []
            for component in components:
                skw_as_components.append(component)
                # Get the single keyword count.
                try:
                    component_counts.append(len(skw_spans[component]))
                except KeyError:
                    component_counts.append(0)

            # Store the composite keyword
            ckw_list.append((composite_keyword, ckw_count, component_counts))

    # Remove the single keywords that appear as components from the list
    # of single keywords.
    for skw in skw_as_components:
        try:
            del skw_spans[skw]
        except KeyError:
            pass

    if verbose:
        write_message("INFO: Matching composite keywords... %d keywords found "
                      "in %.1f sec." %
                      (len(ckw_list), time.clock() - timer_start),
                      stream=sys.stderr,
                      verbose=3)

    return ckw_list
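# Usage sketch (hypothetical variables): skw_spans is the output of
# get_single_keywords(); the composite matches come back as
# (CompositeKeyword, count, component_counts) tuples.
#
#   ckw_matches = get_composite_keywords(ckw_db, fulltext, skw_spans)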
    """Downloads the ontology and stores it in CFG_CACHEDIR."""
    write_message("INFO: Copying remote ontology '%s' to file '%s'." %
                  (url, local_file),
                  stream=sys.stderr,
                  verbose=3)
    try:
        url_desc = urllib2.urlopen(url)
        file_desc = open(local_file, 'w')
        file_desc.write(url_desc.read())
        file_desc.close()
    except IOError, e:
        print e
        return False
    except:
        write_message("WARNING: Unable to download the ontology. '%s'" %
                      sys.exc_info()[0],
                      stream=sys.stderr,
                      verbose=2)
        return False
    else:
        write_message("INFO: Done copying.", stream=sys.stderr, verbose=3)
        return True
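# Usage sketch (hypothetical URL and path):
#
#   _download_ontology("http://example.org/HEP.rdf",
#                      os.path.join(CFG_CACHEDIR, "HEP.rdf"))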


def _get_searchable_regex(basic=None, hidden=None):
    """Returns the searchable regular expressions for the single
    keyword."""
    # Hidden labels are used to store regular expressions.
    basic = basic or []
    hidden = hidden or []

    hidden_regex_dict = {}