def is_olist_start(
    parser_state,
    line_to_parse,
    start_index,
    extracted_whitespace,
    skip_whitespace_check=False,
    adj_ws=None,
):
    """
    Determine if we have the start of an numbered or ordered list.

    Args:
        parser_state: Parser state; only the top two entries of its
            ``token_stack`` are inspected.
        line_to_parse: The line being examined.
        start_index: Index in ``line_to_parse`` where the list number would begin.
        extracted_whitespace: Leading whitespace already removed from the line.
        skip_whitespace_check: When true, the "at most 3 characters of leading
            whitespace" requirement is not enforced.
        adj_ws: Whitespace to measure instead of ``extracted_whitespace``;
            defaults to ``extracted_whitespace`` when ``None``.

    Returns:
        A tuple ``(is_start, index, my_count, end_whitespace_index)``.  ``index``
        is the position just past the run of digits and ``my_count`` is the
        number of digits; both are ``None`` when no digit was found at
        ``start_index`` (or the whitespace check failed).  ``end_whitespace_index``
        is ``-1`` in that case, otherwise whatever
        ``ParserHelper.extract_whitespace`` reported for ``index + 1``.
    """
    is_start = False
    end_whitespace_index = -1
    index = None
    my_count = None
    if adj_ws is None:
        adj_ws = extracted_whitespace

    if (
        ParserHelper.is_length_less_than_or_equal_to(adj_ws, 3)
        or skip_whitespace_check
    ) and ParserHelper.is_character_at_index_one_of(
        line_to_parse, start_index, string.digits
    ):
        # Walk past the run of digits that forms the list number.
        index = start_index
        while ParserHelper.is_character_at_index_one_of(
            line_to_parse, index, string.digits
        ):
            index += 1
        my_count = index - start_index
        olist_index_number = line_to_parse[start_index:index]

        LOGGER.debug("olist?%s<<count>>%s<<", olist_index_number, my_count)
        # The digits may run to the very end of the line, in which case
        # `index == len(line_to_parse)`.  Slice instead of indexing so that
        # this diagnostic can never raise an IndexError.
        LOGGER.debug("olist>>%s", line_to_parse[index : index + 1])
        LOGGER.debug("index+1>>%s>>len>>%s", index + 1, len(line_to_parse))

        end_whitespace_index, _ = ParserHelper.extract_whitespace(
            line_to_parse, index + 1
        )
        LOGGER.debug(
            "end_whitespace_index>>%s>>len>>%s>>%s",
            end_whitespace_index,
            len(line_to_parse),
            olist_index_number,
        )

        if (
            # No more than nine digits in the list number.
            my_count <= 9
            # The digits must be followed by one of the list start characters.
            and ParserHelper.is_character_at_index_one_of(
                line_to_parse, index, ListBlockProcessor.__olist_start_characters
            )
            # When the innermost open block is a paragraph that is not directly
            # inside a list, reject an item with nothing after the marker or
            # a number other than "1".
            and not (
                parser_state.token_stack[-1].is_paragraph
                and not parser_state.token_stack[-2].is_list
                and (
                    (end_whitespace_index == len(line_to_parse))
                    or olist_index_number != "1"
                )
            )
            # The marker must be followed by whitespace or the end of the line.
            and (
                ParserHelper.is_character_at_index_whitespace(
                    line_to_parse, index + 1
                )
                or ((index + 1) == len(line_to_parse))
            )
        ):
            is_start = True

    LOGGER.debug("is_olist_start>>result>>%s", is_start)
    return is_start, index, my_count, end_whitespace_index
def __parse_raw_tag_name(text_to_parse, start_index):
    """
    Parse a HTML tag name from the string.

    The character at `start_index` must be a valid tag-name start character;
    any following valid tag-name characters are then consumed.  On success the
    returned value is the prefix of `text_to_parse` that ends with the tag
    name (the slice starts at index 0, not at `start_index`).  An empty string
    is returned when no tag name starts at `start_index`.
    """
    if not ParserHelper.is_character_at_index_one_of(
        text_to_parse, start_index, HtmlHelper.__valid_tag_name_start
    ):
        return ""

    end_of_name = start_index + 1
    while ParserHelper.is_character_at_index_one_of(
        text_to_parse, end_of_name, HtmlHelper.__valid_tag_name_characters
    ):
        end_of_name += 1
    return text_to_parse[:end_of_name]
def parse_setext_headings(
    parser_state,
    position_marker,
    extracted_whitespace,
    this_bq_count,
    stack_bq_count,
):
    """
    Handle the parsing of an setext heading.

    When the line qualifies as a setext underline for the currently open
    paragraph, the most recent paragraph token in the document is replaced by
    a setext heading token, the paragraph is removed from the token stack, and
    a list holding the matching end token is returned.  Otherwise an empty
    list is returned and nothing is modified.
    """
    new_tokens = []
    line = position_marker.text_to_parse
    marker_index = position_marker.index_number

    is_candidate = (
        ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
        and ParserHelper.is_character_at_index_one_of(
            line, marker_index, LeafBlockProcessor.__setext_characters
        )
        and parser_state.token_stack[-1].is_paragraph
        and (this_bq_count == stack_bq_count)
    )
    if not is_candidate:
        return new_tokens

    setext_character = line[marker_index]
    _, collected_to_index = ParserHelper.collect_while_character(
        line, marker_index, setext_character
    )
    (
        after_whitespace_index,
        extra_whitespace_after_setext,
    ) = ParserHelper.extract_whitespace(line, collected_to_index)

    # Only whitespace may follow the run of setext characters.
    if after_whitespace_index != len(line):
        return new_tokens

    # This is unusual.  Normally, close_open_blocks is used to close off
    # blocks based on the stack token.  However, since the setext takes the
    # last paragraph of text (see case 61) and translates it into a heading,
    # this has to be done separately, as there is no stack token to close.
    new_tokens.append(
        EndMarkdownToken(
            MarkdownToken.token_setext_heading,
            extracted_whitespace,
            extra_whitespace_after_setext,
            None,
        )
    )

    # Search backwards through the document for the paragraph to convert.
    token_index = len(parser_state.token_document) - 1
    while not parser_state.token_document[token_index].is_paragraph:
        token_index -= 1

    paragraph_token = parser_state.token_document[token_index]
    parser_state.token_document[token_index] = SetextHeadingMarkdownToken(
        setext_character,
        collected_to_index - marker_index,
        paragraph_token.extra_data,
        position_marker,
        paragraph_token,
    )
    del parser_state.token_stack[-1]
    return new_tokens
def parse_setext_headings(
    parser_state: ParserState,
    position_marker: PositionMarker,
    extracted_whitespace: Optional[str],
    block_quote_data: BlockQuoteData,
) -> List[MarkdownToken]:
    """
    Handle the parsing of an setext heading.

    Returns the list of new tokens; it is populated (through
    `__create_setext_token`) only when the line qualifies as a setext
    underline for the currently open paragraph and the line is not treated
    as a paragraph continuation.
    """
    new_tokens: List[MarkdownToken] = []

    assert extracted_whitespace is not None
    could_be_setext = (
        ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
        and ParserHelper.is_character_at_index_one_of(
            position_marker.text_to_parse,
            position_marker.index_number,
            LeafBlockProcessor.__setext_characters,
        )
        and parser_state.token_stack[-1].is_paragraph
        and (block_quote_data.current_count == block_quote_data.stack_count)
    )
    if not could_be_setext:
        return new_tokens

    is_paragraph_continuation = (
        LeafBlockProcessor.__adjust_continuation_for_active_list(
            parser_state, position_marker
        )
    )

    # Consume the run of the setext character, then any trailing whitespace.
    _, collected_to_index = ParserHelper.collect_while_character(
        position_marker.text_to_parse,
        position_marker.index_number,
        position_marker.text_to_parse[position_marker.index_number],
    )
    assert collected_to_index is not None
    (
        after_whitespace_index,
        extra_whitespace_after_setext,
    ) = ParserHelper.extract_whitespace(
        position_marker.text_to_parse, collected_to_index
    )

    reached_end_of_line = after_whitespace_index == len(
        position_marker.text_to_parse
    )
    if is_paragraph_continuation or not reached_end_of_line:
        return new_tokens

    LeafBlockProcessor.__create_setext_token(
        parser_state,
        position_marker,
        collected_to_index,
        new_tokens,
        extracted_whitespace,
        extra_whitespace_after_setext,
    )
    return new_tokens
def __parse_raw_tag_name(text_to_parse: str, start_index: int) -> str:
    """
    Parse a HTML tag name from the string.

    Returns the prefix of `text_to_parse` that ends with the tag name
    beginning at `start_index` (the slice starts at index 0), or an empty
    string when the character at `start_index` cannot start a tag name.
    """
    starts_tag_name = ParserHelper.is_character_at_index_one_of(
        text_to_parse, start_index, HtmlHelper.__valid_tag_name_start
    )
    if not starts_tag_name:
        return ""

    end_of_name, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index + 1, HtmlHelper.__valid_tag_name_characters
    )
    return text_to_parse[:end_of_name]
def is_thematic_break(
    line_to_parse: str,
    start_index: int,
    extracted_whitespace: Optional[str],
    skip_whitespace_check: bool = False,
    whitespace_allowed_between_characters: bool = True,
) -> Tuple[Optional[str], Optional[int]]:
    """
    Determine whether or not we have a thematic break.

    Returns `(character, end_index)` when, starting at `start_index`, the
    rest of the line consists of at least three copies of one thematic break
    character (optionally interleaved with whitespace when
    `whitespace_allowed_between_characters` is set).  Returns `(None, None)`
    otherwise.
    """
    assert extracted_whitespace is not None

    is_thematic_character = ParserHelper.is_character_at_index_one_of(
        line_to_parse, start_index, LeafBlockProcessor.__thematic_break_characters
    )
    POGGER.debug("skip_whitespace_check>>$", skip_whitespace_check)
    POGGER.debug("is_thematic_character>>$", is_thematic_character)

    indent_is_acceptable = (
        ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
        or skip_whitespace_check
    )
    if not (indent_is_acceptable and is_thematic_character):
        return None, None

    break_character = line_to_parse[start_index]
    line_length = len(line_to_parse)
    scan_index = start_index
    matched_count = 0
    while scan_index < line_length:
        if (
            whitespace_allowed_between_characters
            and ParserHelper.is_character_at_index_whitespace(
                line_to_parse, scan_index
            )
        ):
            # Whitespace is skipped without being counted.
            pass
        elif line_to_parse[scan_index] == break_character:
            matched_count += 1
        else:
            break  # pragma: no cover
        scan_index += 1

    POGGER.debug("char_count>>$", matched_count)
    POGGER.debug("index>>$", scan_index)
    POGGER.debug("line_to_parse_size>>$", line_length)

    # The whole remainder of the line must have been consumed.
    if matched_count >= 3 and scan_index == line_length:
        return break_character, scan_index
    return None, None
def test_is_character_at_index_one_of_without_whitespace():
    """
    Make sure that no match is reported when the character at the index
    ("t" at index 0) is not one of the valid characters.
    """

    # Arrange
    text, index_to_check, candidates = "this is a test", 0, "abc"
    expected = False

    # Act
    actual = ParserHelper.is_character_at_index_one_of(
        text, index_to_check, candidates
    )

    # Assert
    assert expected == actual
def test_is_character_at_index_one_of_with_character_at_end():
    """
    Make sure that a match is reported when the last character of the string
    is one of the valid characters.
    """

    # Arrange
    text = "this is a test!"
    index_to_check = len(text) - 1
    candidates = "abc!"
    expected = True

    # Act
    actual = ParserHelper.is_character_at_index_one_of(
        text, index_to_check, candidates
    )

    # Assert
    assert expected == actual
def test_is_character_at_index_one_of_with_whitespace2():
    """
    Make sure that a match is reported for a single-character string whose
    only character is the last of the valid characters.
    """

    # Arrange
    text, index_to_check, candidates = "c", 0, "abc"
    expected = True

    # Act
    actual = ParserHelper.is_character_at_index_one_of(
        text, index_to_check, candidates
    )

    # Assert
    assert expected == actual
def test_is_character_at_index_one_of_with_empty_string():
    """
    Make sure that no match is reported when the string to check is empty.
    """

    # Arrange
    text, index_to_check, candidates = "", 0, "abc"
    expected = False

    # Act
    actual = ParserHelper.is_character_at_index_one_of(
        text, index_to_check, candidates
    )

    # Assert
    assert expected == actual
def is_ulist_start(
    parser_state,
    line_to_parse,
    start_index,
    extracted_whitespace,
    skip_whitespace_check=False,
    adj_ws=None,
):
    """
    Determine if we have the start of an un-numbered list.

    Returns `(is_start, after_all_whitespace_index)`.  The second element is
    -1 unless the character at `start_index` looked like a list marker, in
    which case it is the index reported by `ParserHelper.extract_whitespace`
    for the text following the marker.
    """
    LOGGER.debug("is_ulist_start>>pre>>")
    is_start = False
    after_all_whitespace_index = -1
    if adj_ws is None:
        adj_ws = extracted_whitespace

    after_marker_index = start_index + 1
    looks_like_marker = (
        (
            ParserHelper.is_length_less_than_or_equal_to(adj_ws, 3)
            or skip_whitespace_check
        )
        and ParserHelper.is_character_at_index_one_of(
            line_to_parse, start_index, ListBlockProcessor.__ulist_start_characters
        )
        # The marker must be followed by whitespace or the end of the line.
        and (
            ParserHelper.is_character_at_index_whitespace(
                line_to_parse, after_marker_index
            )
            or (after_marker_index == len(line_to_parse))
        )
    )

    if looks_like_marker:
        LOGGER.debug("is_ulist_start>>mid>>")
        after_all_whitespace_index, _ = ParserHelper.extract_whitespace(
            line_to_parse, after_marker_index
        )
        LOGGER.debug(
            "after_all_whitespace_index>>%s>>len>>%s",
            str(after_all_whitespace_index),
            str(len(line_to_parse)),
        )

        # A line that is a thematic break is never a list start.
        is_break, _ = LeafBlockProcessor.is_thematic_break(
            line_to_parse, start_index, extracted_whitespace
        )
        if not is_break:
            # When the innermost open block is a paragraph that is not
            # directly inside a list, an item with nothing after the marker
            # is rejected.
            is_rejected_empty_item = (
                parser_state.token_stack[-1].is_paragraph
                and not parser_state.token_stack[-2].is_list
                and (after_all_whitespace_index == len(line_to_parse))
            )
            if not is_rejected_empty_item:
                is_start = True

    LOGGER.debug("is_ulist_start>>result>>%s", str(is_start))
    return is_start, after_all_whitespace_index
def __is_link_reference_definition(
    parser_state: ParserState,
    line_to_parse: str,
    start_index: int,
    extracted_whitespace: Optional[str],
) -> bool:
    """
    Determine whether or not we have the start of a link reference definition.

    Args:
        parser_state: Parser state; only the top of its ``token_stack`` is
            inspected.
        line_to_parse: The line being examined.
        start_index: Index in ``line_to_parse`` of the candidate start character.
        extracted_whitespace: Leading whitespace already removed from the
            line; must not be ``None``.

    Returns:
        ``False`` if a paragraph is currently open, if the leading whitespace
        is longer than 3 characters, or if the character at ``start_index`` is
        not the link reference definition start character.  Otherwise ``True``,
        unless the text after the start character ends with a backslash that
        is not itself consumed as the second character of an earlier
        backslash pair, in which case ``False``.
    """
    if parser_state.token_stack[-1].is_paragraph:
        return False

    assert extracted_whitespace is not None
    if not (
        ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
        and ParserHelper.is_character_at_index_one_of(
            line_to_parse,
            start_index,
            LinkReferenceDefinitionHelper.__lrd_start_character,
        )
    ):
        return False

    remaining_line = line_to_parse[start_index + 1 :]
    continue_with_lrd = True
    if remaining_line and remaining_line[-1] == InlineHelper.backslash_character:
        remaining_line_size = len(remaining_line)

        # The scan must begin at the start of `remaining_line`.  A dedicated
        # local is used instead of reusing `start_index`: assigning 0 to
        # `start_index` in the same tuple assignment as the `find` call does
        # not take effect until after `find` has already been evaluated with
        # the caller's (line-relative) value, which skips leading backslashes.
        search_index = 0
        found_index = remaining_line.find(
            InlineHelper.backslash_character, search_index
        )
        POGGER.debug(">>$<<$", remaining_line, remaining_line_size)
        POGGER.debug(">>$<<$", remaining_line, search_index)
        POGGER.debug(">>$<<", found_index)

        # Step over each backslash together with the character it precedes,
        # until either no backslash remains or the one found is the final
        # character of the line.
        while found_index != -1 and found_index < (remaining_line_size - 1):
            search_index = found_index + 2
            POGGER.debug(">>$<<$", remaining_line, search_index)
            found_index = remaining_line.find(
                InlineHelper.backslash_character, search_index
            )
            POGGER.debug(">>$<<", found_index)
        POGGER.debug(">>>>>>>$<<", found_index)

        # Landing exactly on the last character means the trailing backslash
        # was not paired with a preceding one.
        continue_with_lrd = found_index != remaining_line_size - 1
    return continue_with_lrd
def __is_link_reference_definition(
    position_marker, line_to_parse, start_index, extracted_whitespace
):
    """
    Determine whether or not we have the start of a link reference definition.

    Returns False when the top of `position_marker.token_stack` is a
    paragraph or the leading whitespace is longer than 3 characters.
    Otherwise returns whether the character at `start_index` is the link
    reference definition start character.
    """
    if position_marker.token_stack[-1].is_paragraph:
        return False
    if not ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3):
        return False
    return bool(
        ParserHelper.is_character_at_index_one_of(
            line_to_parse,
            start_index,
            LinkReferenceDefinitionHelper.__lrd_start_character,
        )
    )
def is_fenced_code_block(
    line_to_parse: str,
    start_index: int,
    extracted_whitespace: Optional[str],
    skip_whitespace_check: bool = False,
) -> Tuple[bool, Optional[int], Optional[str], Optional[int]]:
    """
    Determine if we have the start of a fenced code block.

    On success, returns `(True, non_whitespace_index, whitespace, count)`
    where `count` is the number of repeated fence characters found (at least
    3), `whitespace` is the whitespace extracted after the fence, and
    `non_whitespace_index` is the index following that whitespace.  Returns
    `(False, None, None, None)` otherwise.
    """
    assert extracted_whitespace is not None

    indent_is_acceptable = (
        skip_whitespace_check
        or ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
    )
    if not (
        indent_is_acceptable
        and ParserHelper.is_character_at_index_one_of(
            line_to_parse,
            start_index,
            LeafBlockProcessor.__fenced_code_block_start_characters,
        )
    ):
        return False, None, None, None

    POGGER.debug("ifcb:collected_count>>$<<$<<", line_to_parse, start_index)
    fence_character = line_to_parse[start_index]
    collected_count, new_index = ParserHelper.collect_while_character(
        line_to_parse, start_index, fence_character
    )
    POGGER.debug("ifcb:collected_count:$", collected_count)
    assert collected_count is not None
    assert new_index is not None

    (
        non_whitespace_index,
        extracted_whitespace_before_info_string,
    ) = ParserHelper.extract_whitespace(line_to_parse, new_index)

    if collected_count < 3:
        return False, None, None, None

    POGGER.debug("ifcb:True")
    return (
        True,
        non_whitespace_index,
        extracted_whitespace_before_info_string,
        collected_count,
    )
def __parse_raw_open_tag(text_to_parse):
    """
    Parse the current line as if it is an open tag, and determine if it is valid.

    Args:
        text_to_parse: Candidate tag text, starting at the tag name.

    Returns:
        A tuple ``(valid_raw_html, end_parse_index)``.  On success,
        ``valid_raw_html`` is the portion of ``text_to_parse`` before the
        closing ``HtmlHelper.__html_tag_end`` character and ``end_parse_index``
        is the index just past that character.  On any failure (no tag name,
        an attribute that fails to parse, or a missing closing character) the
        result is ``(None, -1)``.
    """
    end_parse_index = -1
    valid_raw_html = None

    tag_name = HtmlHelper.__parse_raw_tag_name(text_to_parse, 0)
    if tag_name:
        parse_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
            text_to_parse, len(tag_name)
        )

        # Another attribute is parsed only when the previous element was
        # followed by whitespace and an attribute name starts here.
        while extracted_whitespace and ParserHelper.is_character_at_index_one_of(
            text_to_parse,
            parse_index,
            HtmlHelper.__tag_attribute_name_start,
        ):
            (
                parse_index,
                extracted_whitespace,
            ) = HtmlHelper.__parse_tag_attributes(text_to_parse, parse_index)
            if parse_index is None:
                # Report failure using this function's own contract instead
                # of forwarding whatever pair the attribute parser returned.
                return None, -1

        # An optional `__html_tag_start` character may precede the closing
        # character.
        if ParserHelper.is_character_at_index(
            text_to_parse, parse_index, HtmlHelper.__html_tag_start
        ):
            parse_index += 1

        if ParserHelper.is_character_at_index(
            text_to_parse, parse_index, HtmlHelper.__html_tag_end
        ):
            valid_raw_html = text_to_parse[0:parse_index]
            end_parse_index = parse_index + 1
    return valid_raw_html, end_parse_index
def __parse_raw_open_tag(text_to_parse: str) -> Tuple[Optional[str], int]:
    """
    Parse the current line as if it is an open tag, and determine if it is valid.

    Returns `(valid_raw_html, end_parse_index)`: the text before the closing
    `HtmlHelper.__html_tag_end` character and the index just past it, or
    `(None, -1)` when the text is not a valid open tag.
    """
    tag_name = HtmlHelper.__parse_raw_tag_name(text_to_parse, 0)
    if not tag_name:
        return None, -1

    parse_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, len(tag_name)
    )
    assert parse_index is not None

    # Keep parsing attributes while each one is preceded by whitespace.
    while extracted_whitespace and ParserHelper.is_character_at_index_one_of(
        text_to_parse,
        parse_index,
        HtmlHelper.__tag_attribute_name_start,
    ):
        parse_index, extracted_whitespace = HtmlHelper.__parse_tag_attributes(
            text_to_parse, parse_index
        )
        if parse_index is None:
            return None, -1

    # An optional `__html_tag_start` character may precede the closing one.
    if ParserHelper.is_character_at_index(
        text_to_parse, parse_index, HtmlHelper.__html_tag_start
    ):
        parse_index += 1

    if not ParserHelper.is_character_at_index(
        text_to_parse, parse_index, HtmlHelper.__html_tag_end
    ):
        return None, -1
    return text_to_parse[:parse_index], parse_index + 1
def __check_for_special_html_blocks(
    line_to_parse: str, character_index: int
) -> Optional[str]:
    """
    Check for the easy to spot special blocks: 2-5.

    Returns the matching `HtmlHelper.html_block_*` value, or None when
    `character_index` is past the end of the line or none of the special
    block openings is found there.
    """
    if character_index >= len(line_to_parse):
        return None

    if ParserHelper.is_character_at_index(
        line_to_parse, character_index, HtmlHelper.__html_block_2_to_5_start
    ):
        # Blocks 2, 4 and 5 share a first character and are told apart by
        # what follows it.
        following_index = character_index + 1
        if ParserHelper.are_characters_at_index(
            line_to_parse,
            following_index,
            HtmlHelper.__html_block_2_continued_start,
        ):
            return HtmlHelper.html_block_2
        if ParserHelper.is_character_at_index_one_of(
            line_to_parse,
            following_index,
            HtmlHelper.__html_block_4_continued_start,
        ):
            return HtmlHelper.html_block_4
        if ParserHelper.are_characters_at_index(
            line_to_parse,
            following_index,
            HtmlHelper.__html_block_5_continued_start,
        ):
            return HtmlHelper.html_block_5
        return None

    if ParserHelper.is_character_at_index(
        line_to_parse,
        character_index,
        HtmlHelper.__html_block_3_continued_start,
    ):
        return HtmlHelper.html_block_3
    return None
def __parse_raw_declaration(text_to_parse):
    """
    Parse a possible raw html declaration sequence, and return if it is valid.

    Returns `text_to_parse` unchanged when it starts with the declaration
    start character, followed by a non-empty declaration name, followed by at
    least one declaration whitespace character.  Returns None otherwise.
    """
    if not ParserHelper.is_character_at_index_one_of(
        text_to_parse, 0, HtmlHelper.__raw_declaration_start_character
    ):
        return None

    parse_index, declaration_name = ParserHelper.collect_while_one_of_characters(
        text_to_parse, 1, HtmlHelper.__html_block_4_continued_start
    )
    if not declaration_name:
        return None

    whitespace_count, _ = ParserHelper.collect_while_character(
        text_to_parse, parse_index, HtmlHelper.__raw_declaration_whitespace
    )
    return text_to_parse if whitespace_count else None
def is_thematic_break(
    line_to_parse,
    start_index,
    extracted_whitespace,
    skip_whitespace_check=False,
):
    """
    Determine whether or not we have a thematic break.

    Returns `(character, end_index)` when, starting at `start_index`, the
    rest of the line is made up only of whitespace and at least three copies
    of one thematic break character.  Returns `(None, None)` otherwise.
    """
    indent_is_acceptable = (
        ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
        or skip_whitespace_check
    )
    if not (
        indent_is_acceptable
        and ParserHelper.is_character_at_index_one_of(
            line_to_parse,
            start_index,
            LeafBlockProcessor.__thematic_break_characters,
        )
    ):
        return None, None

    break_character = line_to_parse[start_index]
    scan_index = start_index
    matched_count = 0
    while scan_index < len(line_to_parse):
        if ParserHelper.is_character_at_index_whitespace(line_to_parse, scan_index):
            # Whitespace is skipped without being counted.
            pass
        elif line_to_parse[scan_index] == break_character:
            matched_count += 1
        else:
            break
        scan_index += 1

    # The whole remainder of the line must have been consumed.
    if matched_count >= 3 and scan_index == len(line_to_parse):
        return break_character, scan_index
    return None, None
def is_fenced_code_block(
    line_to_parse,
    start_index,
    extracted_whitespace,
    skip_whitespace_check=False,
):
    """
    Determine if we have the start of a fenced code block.

    On success, returns `(True, non_whitespace_index, whitespace, count)`
    where `count` is the number of repeated fence characters found (at least
    3).  Returns `(False, None, None, None)` otherwise.
    """
    indent_is_acceptable = (
        ParserHelper.is_length_less_than_or_equal_to(extracted_whitespace, 3)
        or skip_whitespace_check
    )
    if not (
        indent_is_acceptable
        and ParserHelper.is_character_at_index_one_of(
            line_to_parse,
            start_index,
            LeafBlockProcessor.__fenced_code_block_start_characters,
        )
    ):
        return False, None, None, None

    LOGGER.debug("ifcb:collected_count>>%s<<%s<<", line_to_parse, str(start_index))
    fence_character = line_to_parse[start_index]
    collected_count, new_index = ParserHelper.collect_while_character(
        line_to_parse, start_index, fence_character
    )
    LOGGER.debug("ifcb:collected_count:%s", str(collected_count))

    (
        non_whitespace_index,
        extracted_whitespace_before_info_string,
    ) = ParserHelper.extract_whitespace(line_to_parse, new_index)

    if collected_count < 3:
        return False, None, None, None

    LOGGER.debug("ifcb:True")
    return (
        True,
        non_whitespace_index,
        extracted_whitespace_before_info_string,
        collected_count,
    )
def __parse_tag_attributes(
    text_to_parse: str, start_index: int
) -> Tuple[Optional[int], Optional[str]]:
    """
    Handle the parsing of the attributes for an open tag.

    Parses one attribute name starting at `start_index` and, when a
    name/value separator follows, its value (single-quoted, double-quoted or
    unquoted).  Returns the index after any trailing whitespace together with
    that whitespace, or `(None, None)` when a quoted value is not terminated
    by its closing quote.
    """
    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
    )
    assert name_end_index is not None
    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, name_end_index
    )
    assert end_name_index is not None

    # An attribute without a value ends right after its name.
    if not ParserHelper.is_character_at_index(
        text_to_parse,
        end_name_index,
        HtmlHelper.__html_attribute_name_value_separator,
    ):
        return end_name_index, extracted_whitespace

    value_start_index, _ = ParserHelper.extract_any_whitespace(
        text_to_parse, end_name_index + 1
    )
    assert value_start_index is not None

    value_end_index: Optional[int] = None
    for quote_characters in (
        HtmlHelper.__html_attribute_value_single,
        HtmlHelper.__html_attribute_value_double,
    ):
        if not ParserHelper.is_character_at_index_one_of(
            text_to_parse, value_start_index, quote_characters
        ):
            continue

        # Quoted value: collect up to the matching closing quote.
        value_end_index, _ = ParserHelper.collect_until_character(
            text_to_parse, value_start_index + 1, quote_characters
        )
        assert value_end_index is not None
        if not ParserHelper.is_character_at_index(
            text_to_parse, value_end_index, quote_characters
        ):
            return None, None
        value_end_index += 1
        break
    else:
        # Unquoted value: collect up to the first stop character.
        value_end_index, _ = ParserHelper.collect_until_one_of_characters(
            text_to_parse,
            value_start_index,
            HtmlHelper.__unquoted_attribute_value_stop,
        )
    assert value_end_index is not None

    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, value_end_index
    )
    return end_name_index, extracted_whitespace
def __parse_tag_attributes(text_to_parse, start_index):
    """
    Handle the parsing of the attributes for an open tag.

    Parses one attribute name starting at `start_index` and, when a
    name/value separator follows, its value (single-quoted, double-quoted or
    unquoted).  Returns the index after any trailing whitespace together with
    that whitespace, or `(None, -1)` when a quoted value is not terminated by
    its closing quote.
    """
    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
    )
    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, name_end_index
    )

    # An attribute without a value ends right after its name.
    if not ParserHelper.is_character_at_index(
        text_to_parse,
        end_name_index,
        HtmlHelper.__html_attribute_name_value_separator,
    ):
        return end_name_index, extracted_whitespace

    value_start_index, _ = ParserHelper.extract_any_whitespace(
        text_to_parse, end_name_index + 1
    )

    # Work out which quote set, if any, opens the value.
    opening_quote = None
    if ParserHelper.is_character_at_index_one_of(
        text_to_parse, value_start_index, HtmlHelper.__html_attribute_value_single
    ):
        opening_quote = HtmlHelper.__html_attribute_value_single
    elif ParserHelper.is_character_at_index_one_of(
        text_to_parse, value_start_index, HtmlHelper.__html_attribute_value_double
    ):
        opening_quote = HtmlHelper.__html_attribute_value_double

    if opening_quote is None:
        # Unquoted value: collect up to the first stop character.
        value_end_index, _ = ParserHelper.collect_until_one_of_characters(
            text_to_parse,
            value_start_index,
            HtmlHelper.__unquoted_attribute_value_stop,
        )
    else:
        # Quoted value: collect up to the matching closing quote.
        value_end_index, _ = ParserHelper.collect_until_character(
            text_to_parse, value_start_index + 1, opening_quote
        )
        if not ParserHelper.is_character_at_index(
            text_to_parse, value_end_index, opening_quote
        ):
            return None, -1
        value_end_index += 1

    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, value_end_index
    )
    return end_name_index, extracted_whitespace