def flush_block(self) -> None:
    """Turn the buffered text into a ``TextBlock`` and register it.

    While ``self.in_body`` is 0 no block is produced: the text buffer is only
    used as the document title when the last start tag was ``title``, and is
    then discarded.  Otherwise the token buffer is tokenized, words and
    anchor-text words are counted, and line wrapping at 80 characters is
    simulated to obtain the wrapped-line statistics passed to ``TextBlock``.
    Nothing is emitted when the buffer is blank or holds only anchor markers.
    """
    if self.in_body == 0:
        if self.last_start_tag.lower() == "title":
            self.set_title(self.text_buffer.strip())
        self.clear_text_buffer()
        return

    if not self.token_buffer.strip():
        self.clear_text_buffer()
        return

    wrap_width = 80
    word_count = 0
    linked_word_count = 0
    wrapped_line_count = 0
    token_count = 0
    words_on_open_line = 0
    line_width = -1  # the first word on a line has no preceding space

    for token in self.tokenize(self.token_buffer):
        # Anchor markers only toggle the "inside link text" state.
        if token == SpecialTokens.ANCHOR_TEXT_START:
            self.in_anchor_text = True
            continue
        if token == SpecialTokens.ANCHOR_TEXT_END:
            self.in_anchor_text = False
            continue

        # Every other token counts as a token, but only words feed the
        # word / line-wrapping statistics.
        token_count += 1
        if not self.is_word(token):
            continue

        word_count += 1
        words_on_open_line += 1
        if self.in_anchor_text:
            linked_word_count += 1

        line_width += len(token) + 1
        if line_width > wrap_width:
            # The word did not fit: it opens a new simulated line.
            wrapped_line_count += 1
            line_width = len(token)
            words_on_open_line = 1

    # Only anchor markers were seen (they are excluded from token_count).
    if token_count == 0:
        self.clear_text_buffer()
        return

    if wrapped_line_count:
        # Words on the still-open last line are not part of a wrapped line.
        words_in_wrapped_lines = word_count - words_on_open_line
    else:
        words_in_wrapped_lines = word_count
        wrapped_line_count = 1

    # Build the block before the element set and offset are reset below.
    block = TextBlock(
        self.text_buffer.strip(),
        self.current_contained_text_elements,
        word_count,
        linked_word_count,
        words_in_wrapped_lines,
        wrapped_line_count,
        self.offset_blocks,
    )
    self.current_contained_text_elements = set()
    self.offset_blocks += 1
    self.clear_text_buffer()
    block.tag_level = self.block_tag_level
    self.add_text_block(block)
    self.block_tag_level = -1
def add_text_block(self, tb: TextBlock) -> None:
    """Label ``tb`` from the current parser state and append it to ``self.text_blocks``.

    The most recently pushed non-``None`` entry of ``self.font_size_stack``
    (if any) is added as a ``font-<size>`` label, then every label action in
    every stack of ``self.label_stacks`` is applied to the block.
    """
    # Walk the stack from its end so the latest non-None size wins.
    active_size = next(
        (size for size in self.font_size_stack[::-1] if size is not None),
        None,
    )
    if active_size is not None:
        tb.add_label(f"font-{active_size}")

    pending_actions = (action for stack in self.label_stacks for action in stack)
    for action in pending_actions:
        action.add_to(tb)

    self.text_blocks.append(tb)
def test_merge():
    """``merge_next`` folds the following block's text, counts, labels and offsets into the first."""
    head = TextBlock("AA BB CC ", {0}, 3, 3, 3, 1, 0)
    head.add_labels(DefaultLabels.MIGHT_BE_CONTENT)

    tail = TextBlock("DD EE FF GG HH II JJ .", {1}, 6, 0, 6, 2, 1)
    tail.add_labels(DefaultLabels.ARTICLE_METADATA)

    head.merge_next(tail)

    # Texts are joined with a newline; the word counts (3 + 6) add up.
    assert head.text == "AA BB CC \nDD EE FF GG HH II JJ ."
    assert head.num_words == 9
    assert head.num_words_in_anchor_text == 3

    # 3 anchor words out of 9 words in total.
    expected_link_density = 1.0 / 3.0
    assert round(abs(head.link_density - expected_link_density), 7) == 0
    assert head.text_density == 3

    expected_labels = {DefaultLabels.MIGHT_BE_CONTENT, DefaultLabels.ARTICLE_METADATA}
    assert head.labels == expected_labels

    # The merged block spans the offsets of both inputs.
    assert head.offset_blocks_start == 0
    assert head.offset_blocks_end == 1
def classify(self, prev_block: TextBlock, curr_block: TextBlock, next_block: TextBlock) -> bool:
    """Classify ``curr_block`` as content or not from link and text densities.

    The verdict comes from a fixed tree of threshold tests on the link
    density of the current and previous block and the text density of the
    previous, current and next block.  The verdict is stored in
    ``curr_block.is_content``.

    Args:
        prev_block: Block preceding ``curr_block``.
        curr_block: Block being classified; its ``is_content`` is overwritten.
        next_block: Block following ``curr_block``.

    Returns:
        True if ``curr_block.is_content`` was changed by this call, False if
        it already held the computed value.
    """
    if curr_block.link_density <= 0.333333:
        if prev_block.link_density <= 0.555556:
            if curr_block.text_density <= 9:
                if next_block.text_density <= 10:
                    if prev_block.text_density <= 4:
                        is_content = False
                    else:
                        is_content = True
                else:
                    is_content = True
            elif next_block.text_density == 0:
                is_content = False
            else:
                is_content = True
        elif next_block.text_density <= 11:
            is_content = False
        else:
            is_content = True
    else:
        is_content = False

    # A change happened only when the stored flag differs from the verdict.
    changed = curr_block.is_content != is_content
    curr_block.is_content = is_content
    return changed
def classify(self, prev_block: TextBlock, curr_block: TextBlock, next_block: TextBlock) -> bool:
    """Classify ``curr_block`` as content or not from link density and word counts.

    The verdict comes from a fixed tree of threshold tests on the link
    density of the current and previous block and the number of words in
    the previous, current and next block.  The verdict is stored in
    ``curr_block.is_content``.

    Args:
        prev_block: Block preceding ``curr_block``.
        curr_block: Block being classified; its ``is_content`` is overwritten.
        next_block: Block following ``curr_block``.

    Returns:
        True if ``curr_block.is_content`` was changed by this call, False if
        it already held the computed value.
    """
    if curr_block.link_density <= 0.333333:
        if prev_block.link_density <= 0.555556:
            if curr_block.num_words <= 16:
                if next_block.num_words <= 15:
                    if prev_block.num_words <= 4:
                        is_content = False
                    else:
                        is_content = True
                else:
                    is_content = True
            else:
                is_content = True
        elif curr_block.num_words <= 40:
            if next_block.num_words <= 17:
                is_content = False
            else:
                is_content = True
        else:
            is_content = True
    else:
        is_content = False

    # A change happened only when the stored flag differs from the verdict.
    changed = curr_block.is_content != is_content
    curr_block.is_content = is_content
    return changed
def classify(self, prev_block: TextBlock, curr_block: TextBlock, next_block: TextBlock) -> bool:
    """Classify ``curr_block`` as content when any of three rules holds.

    The block is content if

    * it contains links and the next block has more than 11 words, or
    * it has more than 19 words, or
    * the previous and next block are link-free, the next block has more
      than 6 words, and at least one of the three blocks is long enough
      (current > 6, previous > 7 or next > 19 words).

    The verdict is stored in ``curr_block.is_content``.

    Args:
        prev_block: Block preceding ``curr_block``.
        curr_block: Block being classified; its ``is_content`` is overwritten.
        next_block: Block following ``curr_block``.

    Returns:
        True if ``curr_block.is_content`` was changed by this call, False if
        it already held the computed value.
    """
    linked_before_long_block = curr_block.link_density > 0 and next_block.num_words > 11
    long_block = curr_block.num_words > 19
    inside_plain_text = (
        next_block.num_words > 6
        and next_block.link_density == 0
        and prev_block.link_density == 0
        and (curr_block.num_words > 6 or prev_block.num_words > 7 or next_block.num_words > 19)
    )
    is_content = linked_before_long_block or long_block or inside_plain_text

    # A change happened only when the stored flag differs from the verdict.
    changed = curr_block.is_content != is_content
    curr_block.is_content = is_content
    return changed
def process(self, doc: TextDocument) -> bool:
    """Split every text block of ``doc`` whose text spans several paragraphs.

    A block's text is split with ``self.NEWLINE_REGEX``.  Blocks that yield
    fewer than two pieces are kept as they are; any other block is replaced
    by one new ``TextBlock`` per piece, each inheriting the original block's
    ``is_content`` flag and labels.

    Returns:
        True if at least one block was split (``doc.text_blocks`` is then
        replaced by the new list), otherwise False and ``doc`` is untouched.
    """
    rebuilt = []
    did_split = False

    for block in doc.text_blocks:
        pieces = self.NEWLINE_REGEX.split(block.text)
        if len(pieces) >= 2:
            keep_flag = block.is_content
            inherited_labels = block.labels
            for piece in pieces:
                part = TextBlock(piece)
                part.is_content = keep_flag
                part.add_labels(inherited_labels)
                rebuilt.append(part)
                did_split = True
        else:
            rebuilt.append(block)

    # Only swap in the new list when something actually changed.
    if did_split:
        doc.text_blocks = rebuilt
    return did_split
def make_doc(words_arr, num_anchor_words_arr=None, is_content_arr=None, label_arr=None):
    """Build a ``TextDocument`` containing one ``TextBlock`` per entry of ``words_arr``.

    An ``int`` entry produces a block made of that many leading items of
    ``default_words`` joined by spaces.  Any other entry is used as the block
    text itself, and its word count is taken as the number of space
    characters it contains.

    The optional arrays are looked up by block index; a missing array
    (``None``) or a too-short one is silently ignored:

    * ``num_anchor_words_arr`` -- anchor word count, defaulting to 0.
    * ``is_content_arr`` -- value assigned to the block's ``is_content``.
    * ``label_arr`` -- a label, a list of labels, or ``None`` for no label.
    """
    built = []
    for position, spec in enumerate(words_arr):
        if not isinstance(spec, int):
            content = spec
            word_total = content.count(' ')
        else:
            word_total = spec
            content = ' '.join(default_words[:word_total])

        try:
            anchor_total = num_anchor_words_arr[position]
        except (TypeError, IndexError):
            anchor_total = 0

        block = TextBlock(content, set(), word_total, anchor_total, 0, 0, position)

        try:
            block.is_content = is_content_arr[position]
        except (TypeError, IndexError):
            pass

        try:
            entry = label_arr[position]
            if isinstance(entry, list):
                for single_label in entry:
                    block.add_label(single_label)
            elif entry is not None:
                block.add_label(entry)
        except (TypeError, IndexError):
            pass

        built.append(block)

    return TextDocument(built)
def add_labels_to(self, text_block: TextBlock):
    """Pass ``self.labels`` to ``text_block.add_labels``."""
    attach = text_block.add_labels
    attach(self.labels)