Python build_document_line_distribution 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: lexnlp.nlp.en.segments.utils

메소드/함수: build_document_line_distribution

hotexamples.com에서의 예제들: 5

Python build_document_line_distribution - 예제 5개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 lexnlp.nlp.en.segments.utils.build_document_line_distribution의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def build_document_title_features(text: str, window_pre=3, window_post=3):
    """
    Build the per-line feature table used for document title detection.

    One feature row is produced for every line of ``text`` by
    ``build_title_features``; the result of
    ``build_document_line_distribution(text)`` is handed to each call through
    ``include_doc``.

    :param text: full document text; split into lines with ``str.splitlines``
    :param window_pre: forwarded to ``build_title_features`` (presumably the
        number of preceding context lines -- confirm against that helper)
    :param window_post: forwarded to ``build_title_features`` (presumably the
        number of following context lines -- confirm against that helper)
    :return: ``pandas.DataFrame`` cast to ``int`` with one row per line,
        columns in sorted order and ``-1`` wherever a row lacks a feature
    """
    # Document-wide distribution, shared by every row
    document_features = build_document_line_distribution(text)

    all_lines = text.splitlines()
    rows = [
        build_title_features(all_lines,
                             index,
                             window_pre,
                             window_post,
                             include_doc=document_features)
        for index in range(len(all_lines))
    ]

    # Union of the feature names seen in any row, sorted so that the column
    # order does not depend on set iteration order
    ordered_columns = sorted({name for row in rows for name in row})

    frame = pandas.DataFrame(rows, columns=ordered_columns)
    return frame.fillna(-1).astype(int)

예제 #2

파일 보기

파일: sections.py 프로젝트: suryak-cs/lexpredict-lexnlp

def get_sections(text,
                 window_pre=3,
                 window_post=3,
                 score_threshold=0.5) -> Generator:
    """
    Get sections from text.
    NLP-based detection of sections.

    Every line of ``text`` is turned into a feature row and scored by
    ``SECTION_SEGMENTER_MODEL``; a line whose ``prob_true`` reaches
    ``score_threshold`` is treated as the first line of a new section.

    :param text: document text; split into lines with ``str.splitlines``
    :param window_pre: forwarded to ``build_section_break_features``
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to ``build_section_break_features``
        (presumably the number of following context lines -- confirm)
    :param score_threshold: minimum ``prob_true`` for a line to count as a
        section break
    :return: generator of section strings (the section's lines re-joined
        with a newline). Sections that are empty after stripping are
        skipped. Nothing is yielded for empty ``text`` or when no line
        reaches the threshold.
    """
    # Empty input has no lines to score; stop here instead of handing an
    # empty feature frame to the model.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)
    lines = text.splitlines()
    test_feature_data = [
        build_section_break_features(lines,
                                     line_id,
                                     window_pre,
                                     window_post,
                                     include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict section breaks
    test_feature_df = pandas.DataFrame(test_feature_data).fillna(-1)
    test_predicted_lines = SECTION_SEGMENTER_MODEL.predict_proba(
        test_feature_df)
    # NOTE(review): assumes predict_proba returns exactly two columns with
    # the positive ("is a break") class second -- confirm against the model.
    predicted_df = pandas.DataFrame(test_predicted_lines,
                                    columns=["prob_false", "prob_true"])
    section_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not section_breaks:
        return

    # Walk consecutive boundaries: start of text -> first break, each break
    # -> next break, last break -> end of text.
    boundaries = [0, *section_breaks, len(lines)]
    for pos0, pos1 in zip(boundaries, boundaries[1:]):
        section = "\n".join(lines[pos0:pos1])
        if section.strip():
            yield section

예제 #3

파일 보기

파일: paragraphs.py 프로젝트: rdamarapati/lexpredict-lexnlp

def get_paragraphs(text: str, window_pre=3, window_post=3, score_threshold=0.5, return_spans: bool = False) \
        -> Generator:
    """
    Get paragraphs.

    Every line of ``text`` is turned into a feature row and scored by
    ``PARAGRAPH_SEGMENTER_MODEL``; a line whose ``prob_true`` reaches
    ``score_threshold`` is treated as the first line of a new paragraph.
    Each candidate range of lines is handed to ``_maybe_paragraph``, and
    whatever it returns is yielded unless it is ``None``.

    :param text: document text; split with ``splitlines_with_spans``
    :param window_pre: forwarded to ``build_paragraph_break_features``
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to ``build_paragraph_break_features``
        (presumably the number of following context lines -- confirm)
    :param score_threshold: minimum ``prob_true`` for a line to count as a
        paragraph break
    :param return_spans: forwarded to ``_maybe_paragraph`` (presumably makes
        it return span information along with the text -- confirm)
    :return: generator of the non-``None`` results of ``_maybe_paragraph``.
        Nothing is yielded for empty ``text`` or when no line reaches the
        threshold.
    """
    # Empty input has no lines to score; stop here instead of handing an
    # empty feature frame to the model.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)
    lines, line_spans = splitlines_with_spans(text)
    feature_data = [
        build_paragraph_break_features(lines,
                                       line_id,
                                       window_pre,
                                       window_post,
                                       include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict paragraph breaks
    feature_df = pandas.DataFrame(feature_data).fillna(-1).astype(int)
    predicted_lines = PARAGRAPH_SEGMENTER_MODEL.predict_proba(feature_df)
    # NOTE(review): assumes predict_proba returns exactly two columns with
    # the positive ("is a break") class second -- confirm against the model.
    predicted_df = pandas.DataFrame(predicted_lines,
                                    columns=["prob_false", "prob_true"])
    paragraph_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not paragraph_breaks:
        return

    # Walk consecutive boundaries: start of text -> first break, each break
    # -> next break, last break -> open end. The final end position is
    # passed as None, exactly as _maybe_paragraph has always received it.
    boundaries = [0, *paragraph_breaks, None]
    for pos0, pos1 in zip(boundaries, boundaries[1:]):
        maybe_paragraph = _maybe_paragraph(pos0, pos1, text, line_spans,
                                           return_spans)
        if maybe_paragraph is not None:
            yield maybe_paragraph

예제 #4

파일 보기

def get_paragraphs(text,
                   window_pre=3,
                   window_post=3,
                   score_threshold=0.5) -> Generator:
    """
    Get paragraphs.

    Every line of ``text`` is turned into a feature row and scored by
    ``PARAGRAPH_SEGMENTER_MODEL``; a line whose ``prob_true`` reaches
    ``score_threshold`` is treated as the first line of a new paragraph.

    :param text: document text; split into lines with ``str.splitlines``
    :param window_pre: forwarded to ``build_paragraph_break_features``
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to ``build_paragraph_break_features``
        (presumably the number of following context lines -- confirm)
    :param score_threshold: minimum ``prob_true`` for a line to count as a
        paragraph break
    :return: generator of paragraph strings (the paragraph's lines re-joined
        with a newline). Paragraphs that are empty after stripping are
        skipped. Only the final paragraph is yielded stripped. Nothing is
        yielded for empty ``text`` or when no line reaches the threshold.
    """
    # Empty input has no lines to score; stop here instead of handing an
    # empty feature frame to the model.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)
    lines = text.splitlines()
    feature_data = [
        build_paragraph_break_features(lines,
                                       line_id,
                                       window_pre,
                                       window_post,
                                       include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict paragraph breaks
    feature_df = pandas.DataFrame(feature_data).fillna(-1).astype(int)
    predicted_lines = PARAGRAPH_SEGMENTER_MODEL.predict_proba(feature_df)
    # NOTE(review): assumes predict_proba returns exactly two columns with
    # the positive ("is a break") class second -- confirm against the model.
    predicted_df = pandas.DataFrame(predicted_lines,
                                    columns=["prob_false", "prob_true"])
    paragraph_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not paragraph_breaks:
        return

    # Start of text -> first break, then each break -> next break
    boundaries = [0, *paragraph_breaks]
    for pos0, pos1 in zip(boundaries, boundaries[1:]):
        paragraph = "\n".join(lines[pos0:pos1])
        if paragraph.strip():
            yield paragraph

    # Final paragraph: last break -> end of text.
    # NOTE(review): only this paragraph is yielded stripped; the earlier
    # ones keep their surrounding whitespace. Kept as-is for backward
    # compatibility -- confirm whether the asymmetry is intended.
    paragraph = "\n".join(lines[paragraph_breaks[-1]:]).strip()
    if paragraph:
        yield paragraph

예제 #5

파일 보기

def build_document_document_year_features(text, window_pre=3, window_post=3):
    """
    Build the per-line feature table used for document year detection.

    One feature row is produced for every line of ``text`` by
    ``build_document_year_features``; the result of
    ``build_document_line_distribution(text)`` is handed to each call through
    ``include_doc``.

    :param text: full document text; split into lines with ``str.splitlines``
    :param window_pre: forwarded to ``build_document_year_features``
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to ``build_document_year_features``
        (presumably the number of following context lines -- confirm)
    :return: ``pandas.DataFrame`` cast to ``int`` with one row per line and
        ``-1`` wherever a row lacks a feature
    """
    # Document-wide distribution, shared by every row
    document_features = build_document_line_distribution(text)

    all_lines = text.splitlines()
    rows = [
        build_document_year_features(all_lines,
                                     index,
                                     window_pre,
                                     window_post,
                                     include_doc=document_features)
        for index in range(len(all_lines))
    ]

    # Missing features become -1 before the integer cast
    frame = pandas.DataFrame(rows)
    filled = frame.fillna(-1)
    return filled.astype(int)