def solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Solr, preserving fulltext information.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    @param tags_to_index: mapping passed through to
        get_field_content_in_utf8 to resolve each logical field
    @param next_commit_counter: running counter used by
        solr_commit_if_necessary to decide when to issue a commit
    @return: the updated commit counter
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            abstract = get_field_content_in_utf8(recid, 'abstract', tags_to_index)
            author = get_field_content_in_utf8(recid, 'author', tags_to_index)
            keyword = get_field_content_in_utf8(recid, 'keyword', tags_to_index)
            title = get_field_content_in_utf8(recid, 'title', tags_to_index)
            try:
                # Fulltext extraction is best-effort: missing or broken
                # attached files must not prevent indexing the metadata.
                bibrecdocs = BibRecDocs(recid)
                fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
            except Exception:
                # Was a bare "except:"; narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                fulltext = ''
            solr_add(recid, abstract, author, fulltext, keyword, title)
            next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                          recid=recid)
    return next_commit_counter
def xapian_add_all(lower_recid, upper_recid):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Xapian, preserving fulltext information.

    Each field (abstract, author, fulltext, keyword, title) is fetched
    best-effort: any failure for a field results in an empty value being
    indexed for that field rather than aborting the record.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    """
    xapian_init_databases()
    for recid in range(lower_recid, upper_recid + 1):
        # All "except Exception" clauses below were bare "except:" — they
        # are narrowed so SystemExit/KeyboardInterrupt propagate.
        try:
            abstract = unicode(
                get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
        except Exception:
            abstract = ""
        xapian_add(recid, "abstract", abstract)
        try:
            first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
            # NOTE: reduce with an '' initializer yields a leading space
            # before the first additional author; kept as-is so the
            # indexed text is byte-identical to before.
            additional_authors = reduce(
                lambda x, y: x + " " + y,
                get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME), '')
            author = unicode(first_author + " " + additional_authors, 'utf-8')
        except Exception:
            author = ""
        xapian_add(recid, "author", author)
        try:
            bibrecdocs = BibRecDocs(recid)
            fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
        except Exception:
            fulltext = ""
        xapian_add(recid, "fulltext", fulltext)
        try:
            keyword = unicode(
                reduce(lambda x, y: x + " " + y,
                       get_fieldvalues(recid, CFG_MARC_KEYWORD), ''),
                'utf-8')
        except Exception:
            keyword = ""
        xapian_add(recid, "keyword", keyword)
        try:
            title = unicode(get_fieldvalues(recid, CFG_MARC_TITLE)[0], 'utf-8')
        except Exception:
            title = ""
        xapian_add(recid, "title", title)
def solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Solr, preserving fulltext information.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    @param tags_to_index: mapping passed through to
        get_field_content_in_utf8 to resolve each logical field
    @param next_commit_counter: running counter used by
        solr_commit_if_necessary to decide when to issue a commit
    @return: the updated commit counter
    """
    for recid in range(lower_recid, upper_recid + 1):
        if not record_exists(recid):
            continue
        abstract = get_field_content_in_utf8(recid, 'abstract', tags_to_index)
        author = get_field_content_in_utf8(recid, 'author', tags_to_index)
        keyword = get_field_content_in_utf8(recid, 'keyword', tags_to_index)
        title = get_field_content_in_utf8(recid, 'title', tags_to_index)
        try:
            # Best-effort fulltext extraction; broken or absent attached
            # files fall back to an empty fulltext.
            fulltext = unicode(BibRecDocs(recid).get_text(), 'utf-8')
        except Exception:
            # Previously a bare "except:" — narrowed so SystemExit and
            # KeyboardInterrupt are not swallowed.
            fulltext = ''
        solr_add(recid, abstract, author, fulltext, keyword, title)
        next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                       recid=recid)
    return next_commit_counter
def xapian_add_all(lower_recid, upper_recid):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Xapian, preserving fulltext information.

    Each field (abstract, author, fulltext, keyword, title) is fetched
    best-effort: a failure for one field indexes an empty value for it
    instead of aborting the record.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    """
    xapian_init_databases()
    for recid in range(lower_recid, upper_recid + 1):
        # Every "except Exception" below replaces a bare "except:" so
        # that SystemExit/KeyboardInterrupt propagate.
        try:
            abstract = unicode(
                get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
        except Exception:
            abstract = ""
        xapian_add(recid, "abstract", abstract)
        try:
            first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
            # reduce with an '' initializer produces a leading space ahead
            # of the first additional author; preserved for compatibility.
            additional_authors = reduce(
                lambda x, y: x + " " + y,
                get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME), '')
            author = unicode(first_author + " " + additional_authors, 'utf-8')
        except Exception:
            author = ""
        xapian_add(recid, "author", author)
        try:
            bibrecdocs = BibRecDocs(recid)
            fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
        except Exception:
            fulltext = ""
        xapian_add(recid, "fulltext", fulltext)
        try:
            keyword = unicode(
                reduce(lambda x, y: x + " " + y,
                       get_fieldvalues(recid, CFG_MARC_KEYWORD), ''),
                'utf-8')
        except Exception:
            keyword = ""
        xapian_add(recid, "keyword", keyword)
        try:
            title = unicode(get_fieldvalues(recid, CFG_MARC_TITLE)[0], 'utf-8')
        except Exception:
            title = ""
        xapian_add(recid, "title", title)
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Return all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT, with the words being split by the various
    SRE_SEPARATORS regexps set earlier.

    If FORCE_FILE_EXTENSION is set (e.g. to "pdf"), then treat
    URL_DIRECT_OR_INDIRECT as a PDF file.  (This is interesting to index
    Indico for example.)  Note also that URL_DIRECT_OR_INDIRECT may be
    either a direct URL to the fulltext file or an URL to a setlink-like
    page body that presents the links to be indexed.  In the latter case
    the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all known file extensions as specified by
    the global CONV_PROGRAMS config variable.

    Returns [] when the fulltext is dispatched to an external indexer
    (Solr/Xapian) or when extraction fails.
    """
    write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
            bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per record.
                    # NOTE(review): fulltext_added is not defined in this
                    # chunk — presumably a module-level set; confirm.
                    if not recid in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        text = bibrecdocs.get_text()
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            # Expand splash pages into the direct URLs they link to.
            for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                    html = urllib2.urlopen(url_direct_or_indirect).read()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # Not a splash page: index the URL itself.
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        # FIX: previously open(tmptext).read() leaked the
                        # file handle; "with" guarantees it is closed.
                        with open(tmptext) as tmptext_file:
                            text = tmptext_file.read()
                        os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text) # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        words.update(dict(map(lambda x: (x, 1), tmpwords)))
                    except Exception as e:
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    # Always clean up the downloaded temporary document.
                    os.remove(tmpdoc)
                    if self.verbose > 3:
                        file_converter_logger.setLevel(old_logging_level)
            return words.keys()
    except Exception as e:
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Return all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT, with the words being split by the various
    SRE_SEPARATORS regexps set earlier.

    If FORCE_FILE_EXTENSION is set (e.g. to "pdf"), then treat
    URL_DIRECT_OR_INDIRECT as a PDF file.  (This is interesting to index
    Indico for example.)  Note also that URL_DIRECT_OR_INDIRECT may be
    either a direct URL to the fulltext file or an URL to a setlink-like
    page body that presents the links to be indexed.  In the latter case
    the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all known file extensions as specified by
    the global CONV_PROGRAMS config variable.

    Returns [] when the fulltext is dispatched to an external indexer
    (Solr/Xapian), when the 8564 tag is outdated, or when extraction
    fails.
    """
    write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
            try:
                bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            except InvenioBibDocFileError:
                # Outdated 8564 tag
                return []
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per record.
                    # NOTE(review): fulltext_added is not defined in this
                    # chunk — presumably a module-level set; confirm.
                    if not recid in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        try:
                            text = bibrecdocs.get_text()
                        except InvenioBibDocFileError:
                            # Invalid PDF
                            continue
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            # Expand splash pages into the direct URLs they link to.
            for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                    html = urllib2.urlopen(url_direct_or_indirect).read()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # Not a splash page: index the URL itself.
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        # FIX: previously open(tmptext).read() leaked the
                        # file handle; "with" guarantees it is closed.
                        with open(tmptext) as tmptext_file:
                            text = tmptext_file.read()
                        os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text) # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        words.update(dict(map(lambda x: (x, 1), tmpwords)))
                    except Exception as e:
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    # Always clean up the downloaded temporary document.
                    os.remove(tmpdoc)
                    if self.verbose > 3:
                        file_converter_logger.setLevel(old_logging_level)
            return words.keys()
    except Exception as e:
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []