def output_keywords_for_sources(input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, **kwargs): """Outputs the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print "Input file: %s" % source output = get_keywords_from_text(text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms ) print output # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url(entry, user_agent=bconfig.CFG_BIBCLASSIFY_USER_AGENT) if text_lines: source = entry.split("/")[-1] process_lines()
def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
        match_mode="full", no_cache=False, with_author_keywords=False):
    """Extract the keywords of a local fulltext file.

    @param local_file: path of the file to analyze
    @param taxonomy: ontology used to build the keyword regular expressions
    @param rebuild_cache: force a rebuild of the regex cache
    @param match_mode: "full" or "partial" text matching
    @param no_cache: bypass the regex cache entirely
    @param with_author_keywords: also compute author keywords
    @return: tuple (single_keywords, composite_keywords)
    """
    text_lines = text_lines_from_local_file(local_file)
    global _SKWS
    global _CKWS
    # The single/composite keyword regexes are built once per process
    # and reused by later calls.
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)
            # FIX: previously execution fell through with uninitialized
            # caches and crashed further down; bail out with empty
            # results instead.  (Empty dicts assumed to match the shape
            # of get_single_keywords/get_composite_keywords results —
            # TODO confirm.)
            return ({}, {})
    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))
    author_keywords = None
    if with_author_keywords:
        # NOTE(review): author_keywords is computed but never returned
        # by this function — preserved as-is; confirm against callers.
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)
    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)
    single_keywords = get_single_keywords(_SKWS, fulltext)
    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)
    return (single_keywords, composite_keywords)
def get_keywords_from_local_file(local_file, taxonomy_name, output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False, only_core_tags=False,
        extract_acronyms=False, **kwargs):
    """Outputs keywords reading a local file.

    Thin wrapper: reads the text lines of the file and forwards every
    option unchanged.  Arguments and output are the same as for
    @see: get_keywords_from_text()
    """
    log.info("Analyzing keywords for local file %s." % local_file)
    lines = extractor.text_lines_from_local_file(local_file)
    # Collect the pass-through options once, then delegate.
    options = dict(
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms,
    )
    return get_keywords_from_text(lines, taxonomy_name, **options)
def output_keywords_for_local_file(local_file, taxonomy, rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None):
    """Outputs the keywords for a local file."""
    if verbose is not None:
        set_verbose_level(verbose)
    write_message("INFO: Analyzing keywords for local file %s." % local_file,
        stream=sys.stderr, verbose=3)
    # Read the fulltext, then hand everything over to the text-based
    # extraction routine.
    return get_keywords_from_text(
        text_lines_from_local_file(local_file),
        output_mode=output_mode,
        output_limit=output_limit,
        taxonomy=taxonomy,
        spires=spires,
        match_mode=match_mode,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        no_cache=no_cache)
def output_keywords_for_local_file(local_file, taxonomy, rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None):
    """Outputs the keywords for a local file.

    NOTE(review): this definition duplicates another identical
    `output_keywords_for_local_file` in this file; at import time the
    later definition wins.  Consider removing one of them.
    """
    if verbose is not None:
        set_verbose_level(verbose)
    write_message("INFO: Analyzing keywords for local file %s." % local_file,
        stream=sys.stderr, verbose=3)
    text_lines = text_lines_from_local_file(local_file)
    # Forwarded options, gathered in one place for readability.
    opts = {
        "output_mode": output_mode,
        "output_limit": output_limit,
        "taxonomy": taxonomy,
        "spires": spires,
        "match_mode": match_mode,
        "with_author_keywords": with_author_keywords,
        "rebuild_cache": rebuild_cache,
        "no_cache": no_cache,
    }
    return get_keywords_from_text(text_lines, **opts)
def get_keywords_from_local_file(local_file, taxonomy_name, output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False, only_core_tags=False,
        extract_acronyms=False, **kwargs):
    """Outputs keywords reading a local file.

    Arguments and output are the same as for
    @see: get_keywords_from_text()
    """
    log.info("Analyzing keywords for local file %s." % local_file)
    # Single-expression delegation: load the file's lines and forward
    # every option unchanged.
    return get_keywords_from_text(
        extractor.text_lines_from_local_file(local_file),
        taxonomy_name,
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms)
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False, output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, match_mode="full", no_cache=False, with_author_keywords=False, spires=False, verbose=None, only_core_tags=False, extract_acronyms=False): """Outputs the keywords for each source in sources.""" if verbose is not None: set_verbose_level(verbose) # Initialize cache global _SKWS global _CKWS _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache, no_cache=no_cache) # Get the fulltext for each source. for entry in input_sources: write_message("INFO: Trying input file %s." % entry, stream=sys.stderr, verbose=3) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): if os.path.isfile(entry + filename): text_lines = text_lines_from_local_file(entry + filename) if text_lines: source = filename elif os.path.isfile(entry): text_lines = text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) else: # Treat as a URL. text_lines = text_lines_from_url(entry, user_agent=CFG_BIBCLASSIFY_USER_AGENT) if text_lines: source = entry.split("/")[-1] if source: if output_mode == "text": print "Input file: %s" % source keywords = get_keywords_from_text(text_lines, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, with_author_keywords=with_author_keywords, only_core_tags=only_core_tags) if extract_acronyms: acronyms = get_acronyms("\n".join(text_lines)) if acronyms: acronyms_str = ["\nAcronyms:"] for acronym, expansions in acronyms.iteritems(): expansions_str = ", ".join(["%s (%d)" % expansion for expansion in expansions]) acronyms_str.append("%s %s" % (acronym, expansions_str)) acronyms_str = "\n".join(acronyms_str) else: acronyms_str = "\nNo acronyms." print keywords + acronyms_str + "\n" else: print keywords
def output_keywords_for_sources( input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, **kwargs): """Outputs the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print "Input file: %s" % source output = get_keywords_from_text( text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms) print output # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url( entry, user_agent=make_user_agent_string("BibClassify")) if text_lines: source = entry.split("/")[-1] process_lines()