예제 #1
0
def build_document_title_features(text: str, window_pre=3, window_post=3):
    """
    Build the per-line title feature table for a document.

    One feature row is produced for every line of ``text`` by
    ``build_title_features``, which also receives the document-level line
    distribution.  The rows are assembled into a DataFrame whose columns are
    the sorted union of all feature keys; missing values become -1 and every
    column is cast to int.

    :param text: full document text
    :param window_pre: forwarded to ``build_title_features``
    :param window_post: forwarded to ``build_title_features``
    :return: pandas.DataFrame with one row per line of ``text``
    """
    # Document-level distribution shared by every row
    line_distribution = build_document_line_distribution(text)

    # One feature row per line
    doc_lines = text.splitlines()
    rows = [
        build_title_features(doc_lines,
                             idx,
                             window_pre,
                             window_post,
                             include_doc=line_distribution)
        for idx in range(len(doc_lines))
    ]

    # Sorted union of feature keys gives a stable column order
    ordered_columns = sorted({key for row in rows for key in row})

    frame = pandas.DataFrame(rows, columns=ordered_columns)
    frame = frame.fillna(-1)
    return frame.astype(int)
예제 #2
0
def get_sections(text,
                 window_pre=3,
                 window_post=3,
                 score_threshold=0.5) -> Generator:
    """
    Yield the sections of a document, split at model-predicted break lines.

    A feature row is built for every line of ``text`` and scored with
    ``SECTION_SEGMENTER_MODEL.predict_proba``.  Every line whose ``prob_true``
    score is at least ``score_threshold`` starts a new section.  Sections that
    are empty or whitespace-only are skipped.  If no line reaches the
    threshold, nothing is yielded.

    :param text: document text; empty (or None) text yields nothing
    :param window_pre: forwarded to ``build_section_break_features``
    :param window_post: forwarded to ``build_section_break_features``
    :param score_threshold: minimum ``prob_true`` score for a line to be
        treated as a section break
    :return: generator of section strings (their lines re-joined with newlines)
    """
    # Nothing to segment: stop before any features are built or scored.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)
    lines = text.splitlines()
    feature_rows = [
        build_section_break_features(lines,
                                     line_id,
                                     window_pre,
                                     window_post,
                                     include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict section breaks (row index == line index)
    feature_df = pandas.DataFrame(feature_rows).fillna(-1)
    probabilities = SECTION_SEGMENTER_MODEL.predict_proba(feature_df)
    predicted_df = pandas.DataFrame(probabilities,
                                    columns=["prob_false", "prob_true"])
    section_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not section_breaks:
        return

    # Consecutive boundary pairs cover the leading text, every span between
    # two breaks, and the trailing text after the last break.
    boundaries = [0] + section_breaks + [len(lines)]
    for start, end in zip(boundaries, boundaries[1:]):
        section = "\n".join(lines[start:end])
        if section.strip():
            yield section
예제 #3
0
def get_paragraphs(text: str,
                   window_pre=3,
                   window_post=3,
                   score_threshold=0.5,
                   return_spans: bool = False) -> Generator:
    """
    Yield the paragraphs of a document, split at model-predicted break lines.

    A feature row is built for every line of ``text`` and scored with
    ``PARAGRAPH_SEGMENTER_MODEL.predict_proba``.  Every line whose
    ``prob_true`` score is at least ``score_threshold`` starts a new
    paragraph.  Each candidate line range is handed to ``_maybe_paragraph``;
    ``None`` results are skipped.  If no line reaches the threshold, nothing
    is yielded.

    :param text: document text; empty (or None) text yields nothing
    :param window_pre: forwarded to ``build_paragraph_break_features``
    :param window_post: forwarded to ``build_paragraph_break_features``
    :param score_threshold: minimum ``prob_true`` score for a line to be
        treated as a paragraph break
    :param return_spans: forwarded to ``_maybe_paragraph``; presumably selects
        whether span information is returned with the text -- confirm there
    :return: generator of whatever ``_maybe_paragraph`` produces
    """
    # Nothing to segment: stop before any features are built or scored.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)
    lines, line_spans = splitlines_with_spans(text)
    feature_rows = [
        build_paragraph_break_features(lines,
                                       line_id,
                                       window_pre,
                                       window_post,
                                       include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict paragraph breaks (row index == line index)
    feature_df = pandas.DataFrame(feature_rows).fillna(-1).astype(int)
    probabilities = PARAGRAPH_SEGMENTER_MODEL.predict_proba(feature_df)
    predicted_df = pandas.DataFrame(probabilities,
                                    columns=["prob_false", "prob_true"])
    paragraph_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not paragraph_breaks:
        return

    # Consecutive boundary pairs cover the leading text, every span between
    # two breaks, and the trailing text; the trailing range is open-ended,
    # which is signalled to _maybe_paragraph with an end of None.
    boundaries = [0] + paragraph_breaks + [None]
    for start, end in zip(boundaries, boundaries[1:]):
        maybe_paragraph = _maybe_paragraph(start, end, text, line_spans,
                                           return_spans)
        if maybe_paragraph is not None:
            yield maybe_paragraph
예제 #4
0
def get_paragraphs(text,
                   window_pre=3,
                   window_post=3,
                   score_threshold=0.5) -> Generator:
    """
    Yield the paragraphs of a document, split at model-predicted break lines.

    A feature row is built for every line of ``text`` and scored with
    ``PARAGRAPH_SEGMENTER_MODEL.predict_proba``.  Every line whose
    ``prob_true`` score is at least ``score_threshold`` starts a new
    paragraph.  Paragraphs that are empty or whitespace-only are skipped.
    If no line reaches the threshold, nothing is yielded.

    :param text: document text; empty (or None) text yields nothing
    :param window_pre: forwarded to ``build_paragraph_break_features``
    :param window_post: forwarded to ``build_paragraph_break_features``
    :param score_threshold: minimum ``prob_true`` score for a line to be
        treated as a paragraph break
    :return: generator of paragraph strings (their lines re-joined with
        newlines); only the final paragraph is stripped of surrounding
        whitespace
    """
    # Nothing to segment: stop before any features are built or scored.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)
    lines = text.splitlines()
    feature_rows = [
        build_paragraph_break_features(lines,
                                       line_id,
                                       window_pre,
                                       window_post,
                                       include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict paragraph breaks (row index == line index)
    feature_df = pandas.DataFrame(feature_rows).fillna(-1).astype(int)
    probabilities = PARAGRAPH_SEGMENTER_MODEL.predict_proba(feature_df)
    predicted_df = pandas.DataFrame(probabilities,
                                    columns=["prob_false", "prob_true"])
    paragraph_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not paragraph_breaks:
        return

    # Leading text plus every span between two consecutive breaks
    boundaries = [0] + paragraph_breaks
    for start, end in zip(boundaries, boundaries[1:]):
        paragraph = "\n".join(lines[start:end])
        if paragraph.strip():
            yield paragraph

    # Trailing text after the last break.
    # NOTE(review): only this final paragraph is yielded stripped, unlike the
    # ones above; kept as-is so existing callers see unchanged output.
    last_paragraph = "\n".join(lines[paragraph_breaks[-1]:]).strip()
    if last_paragraph:
        yield last_paragraph
예제 #5
0
def build_document_document_year_features(text, window_pre=3, window_post=3):
    """
    Build the per-line document-year feature table for a document.

    One feature row is produced for every line of ``text`` by
    ``build_document_year_features``, which also receives the document-level
    line distribution.  The rows are assembled into a DataFrame; missing
    values become -1 and every column is cast to int.

    :param text: full document text
    :param window_pre: forwarded to ``build_document_year_features``
    :param window_post: forwarded to ``build_document_year_features``
    :return: pandas.DataFrame with one row per line of ``text``
    """
    # Document-level distribution shared by every row
    line_distribution = build_document_line_distribution(text)

    # One feature row per line
    doc_lines = text.splitlines()
    rows = [
        build_document_year_features(doc_lines,
                                     idx,
                                     window_pre,
                                     window_post,
                                     include_doc=line_distribution)
        for idx in range(len(doc_lines))
    ]

    frame = pandas.DataFrame(rows)
    frame = frame.fillna(-1)
    return frame.astype(int)