def build_document_title_features(text: str, window_pre=3, window_post=3):
    """
    Build the title feature table for every line of a document.

    :param text: full document text
    :param window_pre: forwarded to build_title_features
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to build_title_features
        (presumably the number of following context lines -- confirm)
    :return: integer DataFrame with one row per line of text; columns are
        the union of all feature keys in sorted order, and missing values
        are filled with -1
    """
    # Document-level distribution, shared by every per-line feature record
    char_distribution = build_document_line_distribution(text)

    # One feature record per line
    all_lines = text.splitlines()
    records = [
        build_title_features(all_lines, index, window_pre, window_post,
                             include_doc=char_distribution)
        for index in range(len(all_lines))
    ]

    # Rows may expose different keys, so use the sorted union as the columns
    ordered_columns = sorted({name for record in records for name in record})

    frame = pandas.DataFrame(records, columns=ordered_columns)
    return frame.fillna(-1).astype(int)
def get_sections(text, window_pre=3, window_post=3, score_threshold=0.5) -> Generator:
    """
    Get sections from text using model-based detection of section breaks.

    Every line is scored by SECTION_SEGMENTER_MODEL; lines whose positive
    class probability reaches score_threshold are treated as the first line
    of a new section.

    :param text: full document text
    :param window_pre: forwarded to build_section_break_features
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to build_section_break_features
        (presumably the number of following context lines -- confirm)
    :param score_threshold: minimum "prob_true" value for a line to count
        as a section break
    :return: generator of section strings (lines re-joined with "\\n");
        sections that are blank after stripping are skipped.  Nothing is
        yielded for empty text or when no line reaches the threshold.
    """
    # Empty input has no lines to score, so skip feature extraction and the
    # model call entirely instead of handing the model an empty frame.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)

    lines = text.splitlines()
    feature_data = [
        build_section_break_features(lines, line_id, window_pre, window_post,
                                     include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict section breaks
    feature_df = pandas.DataFrame(feature_data).fillna(-1)
    predicted_lines = SECTION_SEGMENTER_MODEL.predict_proba(feature_df)
    predicted_df = pandas.DataFrame(predicted_lines,
                                    columns=["prob_false", "prob_true"])
    section_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not section_breaks:
        return

    # Consecutive boundaries delimit the sections: the text before the first
    # break, each break-to-break span, and the tail after the last break
    # (a None end index slices to the end of the line list).
    boundaries = [0] + section_breaks + [None]
    for pos0, pos1 in zip(boundaries, boundaries[1:]):
        section = "\n".join(lines[pos0:pos1])
        if section.strip():
            yield section
def get_paragraphs(text: str, window_pre=3, window_post=3, score_threshold=0.5,
                   return_spans: bool = False) -> Generator:
    """
    Get paragraphs from text using model-based detection of paragraph breaks.

    Every line is scored by PARAGRAPH_SEGMENTER_MODEL; lines whose positive
    class probability reaches score_threshold are treated as the first line
    of a new paragraph.

    :param text: full document text
    :param window_pre: forwarded to build_paragraph_break_features
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to build_paragraph_break_features
        (presumably the number of following context lines -- confirm)
    :param score_threshold: minimum "prob_true" value for a line to count
        as a paragraph break
    :param return_spans: forwarded unchanged to _maybe_paragraph
        (presumably switches the yielded items to include spans -- confirm)
    :return: generator of whatever _maybe_paragraph returns for each
        candidate line range, skipping ranges for which it returns None.
        Nothing is yielded for empty text or when no line reaches the
        threshold.
    """
    # Empty input has no lines to score, so skip feature extraction and the
    # model call entirely instead of handing the model an empty frame.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)

    lines, line_spans = splitlines_with_spans(text)
    feature_data = [
        build_paragraph_break_features(lines, line_id, window_pre, window_post,
                                       include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict paragraph breaks
    feature_df = pandas.DataFrame(feature_data).fillna(-1).astype(int)
    predicted_lines = PARAGRAPH_SEGMENTER_MODEL.predict_proba(feature_df)
    predicted_df = pandas.DataFrame(predicted_lines,
                                    columns=["prob_false", "prob_true"])
    paragraph_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not paragraph_breaks:
        return

    # Consecutive boundaries delimit the candidates: the lines before the
    # first break, each break-to-break range, and the tail after the last
    # break (signalled to _maybe_paragraph by an end position of None).
    boundaries = [0] + paragraph_breaks + [None]
    for pos0, pos1 in zip(boundaries, boundaries[1:]):
        maybe_paragraph = _maybe_paragraph(pos0, pos1, text, line_spans,
                                           return_spans)
        if maybe_paragraph is not None:
            yield maybe_paragraph
def get_paragraphs(text, window_pre=3, window_post=3, score_threshold=0.5) -> Generator:
    """
    Get paragraphs from text using model-based detection of paragraph breaks.

    Every line is scored by PARAGRAPH_SEGMENTER_MODEL; lines whose positive
    class probability reaches score_threshold are treated as the first line
    of a new paragraph.

    :param text: full document text
    :param window_pre: forwarded to build_paragraph_break_features
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to build_paragraph_break_features
        (presumably the number of following context lines -- confirm)
    :param score_threshold: minimum "prob_true" value for a line to count
        as a paragraph break
    :return: generator of paragraph strings (lines re-joined with "\\n");
        paragraphs that are blank after stripping are skipped.  Nothing is
        yielded for empty text or when no line reaches the threshold.
    """
    # Empty input has no lines to score, so skip feature extraction and the
    # model call entirely instead of handing the model an empty frame.
    if not text:
        return

    # Get document character distribution
    doc_distribution = build_document_line_distribution(text)

    lines = text.splitlines()
    feature_data = [
        build_paragraph_break_features(lines, line_id, window_pre, window_post,
                                       include_doc=doc_distribution)
        for line_id in range(len(lines))
    ]

    # Predict paragraph breaks
    feature_df = pandas.DataFrame(feature_data).fillna(-1).astype(int)
    predicted_lines = PARAGRAPH_SEGMENTER_MODEL.predict_proba(feature_df)
    predicted_df = pandas.DataFrame(predicted_lines,
                                    columns=["prob_false", "prob_true"])
    paragraph_breaks = predicted_df.loc[
        predicted_df["prob_true"] >= score_threshold, :].index.tolist()

    if not paragraph_breaks:
        return

    # Text before the first break, then every break-to-break range; these
    # paragraphs are yielded exactly as joined (not stripped).
    boundaries = [0] + paragraph_breaks
    for pos0, pos1 in zip(boundaries, boundaries[1:]):
        paragraph = "\n".join(lines[pos0:pos1])
        if paragraph.strip():
            yield paragraph

    # Tail after the last break.
    # NOTE(review): only this final paragraph is yielded stripped, unlike the
    # ones above; kept as-is for backward compatibility -- confirm intent.
    paragraph = "\n".join(lines[paragraph_breaks[-1]:]).strip()
    if paragraph:
        yield paragraph
def build_document_document_year_features(text, window_pre=3, window_post=3):
    """
    Build the document-year feature table for every line of a document.

    :param text: full document text
    :param window_pre: forwarded to build_document_year_features
        (presumably the number of preceding context lines -- confirm)
    :param window_post: forwarded to build_document_year_features
        (presumably the number of following context lines -- confirm)
    :return: integer DataFrame with one row per line of text; missing
        values are filled with -1
    """
    # Document-level distribution, shared by every per-line feature record
    char_distribution = build_document_line_distribution(text)

    # One feature record per line
    all_lines = text.splitlines()
    records = [
        build_document_year_features(all_lines, index, window_pre, window_post,
                                     include_doc=char_distribution)
        for index, _ in enumerate(all_lines)
    ]

    # Assemble the frame, then normalise gaps and dtype
    frame = pandas.DataFrame(records)
    frame = frame.fillna(-1)
    return frame.astype(int)