def __handle_numeric_character_reference(source_text, new_index):
    """
    Handle a character reference that is numeric in nature.

    Parsing starts one character after `new_index`.  If that character is one
    of the hex start characters, up to 6 hexadecimal digits are accepted;
    otherwise up to 7 decimal digits are accepted.  The reference is only
    translated when it is terminated by the reference end character.

    :param source_text: Text being parsed.
    :param new_index: Index of the character that precedes the digits (or the
        hex start character) of the reference.
    :return: Tuple of (new_string, new_index, original_reference).  When the
        reference is translated, new_string is the resulting character and
        original_reference is the raw reference text including the trailing
        ";".  Otherwise new_string is the raw text consumed so far and
        original_reference is None.
    """
    original_reference = None
    new_index += 1
    translated_reference = -1
    source_text_size = len(source_text)

    if (
        new_index < source_text_size
        and source_text[new_index]
        in InlineHelper.__hex_character_reference_start_character
    ):
        hex_char = source_text[new_index]
        new_index += 1
        end_index, collected_string = ParserHelper.collect_while_one_of_characters(
            source_text, new_index, string.hexdigits
        )
        LOGGER.debug(
            "&#x>>a>>%s>>b>>%s>>%s",
            str(end_index),
            str(collected_string),
            str(source_text_size),
        )
        delta = end_index - new_index
        LOGGER.debug("delta>>%s>>", str(delta))
        if 1 <= delta <= 6:
            translated_reference = int(collected_string, 16)
        # The raw text is always recorded so that an untranslatable reference
        # can still be returned to the caller as plain text.
        new_string = (
            InlineHelper.character_reference_start_character
            + InlineHelper.__numeric_character_reference_start_character
            + hex_char
            + collected_string
        )
        new_index = end_index
    else:
        end_index, collected_string = ParserHelper.collect_while_one_of_characters(
            source_text, new_index, string.digits
        )
        LOGGER.debug(
            "&#>>a>>%s>>b>>%s>>%s",
            str(end_index),
            str(collected_string),
            str(source_text_size),
        )
        delta = end_index - new_index
        LOGGER.debug("delta>>%s>>", str(delta))
        if 1 <= delta <= 7:
            translated_reference = int(collected_string)
        new_string = (
            InlineHelper.character_reference_start_character
            + InlineHelper.__numeric_character_reference_start_character
            + collected_string
        )
        new_index = end_index

    if (
        translated_reference >= 0
        and new_index < source_text_size
        and source_text[new_index]
        == InlineHelper.__character_reference_end_character
    ):
        new_index += 1
        original_reference = new_string + ";"
        # chr() raises ValueError above U+10FFFF, and the digit limits above
        # admit such values, so they get the same substitute as code point 0.
        if translated_reference == 0 or translated_reference > 0x10FFFF:
            new_string = InlineHelper.__invalid_reference_character_substitute
        else:
            new_string = chr(translated_reference)
    return new_string, new_index, original_reference
def extract_html_attribute_name(string_to_parse: str, string_index: int) -> int:
    """
    Attempt to extract the attribute name from the provided string.

    The character at `string_index` must be a valid attribute start
    character, and the name must be followed by a name/value separator, an
    attribute separator, or a tag start/end character.

    :return: Index of the character following the attribute name, or -1 if
        no valid attribute name was found.
    """
    text_size = len(string_to_parse)
    if string_index >= text_size:
        return -1
    if string_to_parse[string_index] not in HtmlHelper.__attribute_start_characters:
        return -1

    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        string_to_parse, string_index + 1, HtmlHelper.__attribute_other_characters
    )
    assert name_end_index is not None
    if name_end_index >= text_size:
        return -1

    # Characters that are allowed to directly follow an attribute name.
    name_terminators = [
        HtmlHelper.__html_attribute_name_value_separator,
        HtmlHelper.__html_attribute_separator,
        HtmlHelper.__html_tag_start,
        HtmlHelper.__html_tag_end,
    ]
    if string_to_parse[name_end_index] in name_terminators:
        return name_end_index
    return -1
def __parse_valid_uri_autolink(
    text_to_parse: str, line_number: int, column_number: int
) -> Optional[UriAutolinkMarkdownToken]:
    """
    Parse a possible uri autolink and determine if it is valid.

    The text is accepted when it contains no start angle bracket, begins with
    an ASCII letter, has a scheme of 2 to 32 characters followed by the scheme
    end character, and the remainder contains no character with a code point
    of 32 or below.

    :param text_to_parse: Candidate autolink text.
    :param line_number: Line number passed through to the created token.
    :param column_number: Column number passed through to the created token.
    :return: A UriAutolinkMarkdownToken when the text is valid, else None.
    """
    # Guard against empty text: indexing [0] below would raise IndexError.
    if (
        not text_to_parse
        or InlineHelper.angle_bracket_start in text_to_parse
        or text_to_parse[0] not in string.ascii_letters
    ):
        return None

    path_index, scheme_remainder = ParserHelper.collect_while_one_of_characters(
        text_to_parse, 1, InlineHelper.__valid_scheme_characters
    )
    assert path_index is not None
    uri_scheme = f"{text_to_parse[0]}{scheme_remainder}"
    text_to_parse_size = len(text_to_parse)

    if not (
        2 <= len(uri_scheme) <= 32
        and path_index < text_to_parse_size
        and text_to_parse[path_index] == InlineHelper.__scheme_end_character
    ):
        return None

    # Everything after the scheme end character must be above code point 32.
    path_index += 1
    while path_index < text_to_parse_size and ord(text_to_parse[path_index]) > 32:
        path_index += 1
    if path_index == text_to_parse_size:
        return UriAutolinkMarkdownToken(text_to_parse, line_number, column_number)
    return None
def extract_link_destination(line_to_parse, new_index, is_blank_line):
    """
    Extract the link reference definition's link destination.

    Leading whitespace is skipped first.  If that consumes the rest of the
    line and the line is not flagged as blank, extraction fails.

    :return: A 6-tuple of (success, new_index, inline_link, pre_inline_link,
        prefix_whitespace, inline_raw_link).  On failure the first element is
        False and the last four elements are None.
    """
    new_index, prefix_whitespace = ParserHelper.collect_while_one_of_characters(
        line_to_parse, new_index, Constants.whitespace
    )
    ran_out_of_text = new_index == len(line_to_parse)
    if ran_out_of_text and not is_blank_line:
        return False, new_index, None, None, None, None

    LOGGER.debug("LD>>%s<<", line_to_parse[new_index:])
    parsed_destination = LinkHelper.__parse_link_destination(line_to_parse, new_index)
    inline_link, pre_inline_link, new_index, inline_raw_link = parsed_destination

    # The destination parser signals failure with an index of -1.
    if new_index == -1:
        return False, -1, None, None, None, None
    return (
        True,
        new_index,
        inline_link,
        pre_inline_link,
        prefix_whitespace,
        inline_raw_link,
    )
def handle_character_reference(inline_request):
    """
    Handle a generic character reference.

    If the character after the reference start is the numeric marker, the
    work is delegated to the numeric reference handler.  Otherwise a run of
    ASCII letters and digits is collected and, when it is terminated by the
    reference end character and present in the entity map, replaced by the
    mapped value.

    :param inline_request: Request supplying `source_text` and `next_index`.
    :return: An InlineResponse with `new_string` and `new_index` filled in,
        plus `original_string` / `new_string_unresolved` where applicable.
    """
    source_text = inline_request.source_text
    source_text_size = len(source_text)

    inline_response = InlineResponse()
    inline_response.new_index = inline_request.next_index + 1
    inline_response.new_string = ""
    reference_index = inline_response.new_index

    if (
        reference_index < source_text_size
        and source_text[reference_index]
        == InlineHelper.__numeric_character_reference_start_character
    ):
        LOGGER.debug("here")
        (
            inline_response.new_string,
            inline_response.new_index,
            inline_response.original_string,
        ) = InlineHelper.__handle_numeric_character_reference(
            source_text, reference_index
        )
        consumed_text = source_text[reference_index : inline_response.new_index]
        inline_response.new_string_unresolved = (
            InlineHelper.character_reference_start_character + consumed_text
        )
        LOGGER.debug("here-->%s<--", inline_response.new_string)
        LOGGER.debug("here-->%s<--", inline_response.new_string_unresolved)
        return inline_response

    LOGGER.debug("there")
    end_index, entity_text = ParserHelper.collect_while_one_of_characters(
        source_text,
        reference_index,
        string.ascii_letters + string.digits,
    )
    if not entity_text:
        # Nothing usable followed the start character; emit it as-is.
        inline_response.new_string = (
            InlineHelper.character_reference_start_character
        )
        return inline_response

    entity_text = InlineHelper.character_reference_start_character + entity_text
    if (
        end_index < source_text_size
        and source_text[end_index]
        == InlineHelper.__character_reference_end_character
    ):
        end_index += 1
        entity_text += InlineHelper.__character_reference_end_character
        if entity_text in InlineHelper.__entity_map:
            # Record the raw reference before swapping in its translation.
            inline_response.original_string = entity_text
            inline_response.new_string_unresolved = entity_text
            entity_text = InlineHelper.__entity_map[entity_text]

    inline_response.new_string = entity_text
    inline_response.new_index = end_index
    LOGGER.debug("there-->%s<--", inline_response.new_string)
    LOGGER.debug("there-->%s<--", inline_response.new_string_unresolved)
    return inline_response
def __parse_raw_tag_name(text_to_parse: str, start_index: int) -> str:
    """
    Parse a HTML tag name from the string.

    :param text_to_parse: Text containing the possible tag name.
    :param start_index: Index at which the tag name must begin.
    :return: The text from the beginning of `text_to_parse` up to the end of
        the tag name, or an empty string if no tag name starts at
        `start_index`.
    """
    starts_with_tag_character = ParserHelper.is_character_at_index_one_of(
        text_to_parse, start_index, HtmlHelper.__valid_tag_name_start
    )
    if not starts_with_tag_character:
        return ""

    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index + 1, HtmlHelper.__valid_tag_name_characters
    )
    # NOTE(review): the slice starts at 0, not start_index, so any text
    # before start_index is part of the result - confirm callers rely on it.
    return text_to_parse[:name_end_index]
def __is_front_matter_valid(
    collected_lines: List[str],
) -> Union[Dict[str, str], str]:
    """
    Validate the collected front-matter lines and build a keyword/value map.

    A line indented by 4 or more whitespace characters continues the value of
    the current keyword.  Any other line must be non-blank and start with a
    keyword made of ASCII letters, digits, "_" or "-", directly followed by
    ":".  Keywords are stored in lower case.

    :param collected_lines: The lines found between the front-matter fences.
    :return: The keyword/value dictionary when every line is valid, otherwise
        a string describing the first problem encountered.
    """
    keyword_characters = f"{string.ascii_letters}{string.digits}_-"
    current_title = ""
    current_value = ""
    value_map: Dict[str, str] = {}

    for next_line in collected_lines:
        POGGER.debug("Next fm:>$<", next_line)
        next_index, _ = ParserHelper.extract_whitespace(next_line, 0)
        assert next_index is not None

        if next_index >= 4:
            POGGER.debug("Indented line established.")
            if not current_title:
                return "Continuation line encountered before a keyword line."
            current_value += f"\n{next_line.strip()}"
            POGGER.debug("current_value>$<", current_value)
            continue

        if not next_line.strip():
            return "Blank line encountered before end of metadata."

        POGGER.debug("Non-indented line established.")
        if current_title:
            # Flush the previous keyword before starting a new one.  The key
            # is lower-cased to match how the final keyword is stored below.
            POGGER.debug("Adding '$' as '$'.", current_title, current_value)
            value_map[current_title.lower()] = current_value

        (
            next_index,
            collected_title,
        ) = ParserHelper.collect_while_one_of_characters(
            next_line, next_index, keyword_characters
        )
        assert next_index is not None
        assert collected_title is not None
        current_title = collected_title

        if next_index < len(next_line) and next_line[next_index] == ":":
            current_value = next_line[next_index + 1 :].strip()
        else:
            return "Newline did not start with `keyword:`."

    if current_title:
        POGGER.debug("Adding final '$' as '$'.", current_title, current_value)
        value_map[current_title.lower()] = current_value

        # This is specifically to trigger test_front_matter_20.
        assert current_title != "test" or current_value != "assert"

    if not value_map:
        return "No valid metadata header lines were found."
    return value_map
def test_simple_case_from_middle():
    """
    Verify that matching characters are collected when the starting index is
    in the middle of the string.
    """

    # Arrange
    source = "this!is!a!test"
    expected_output = (7, "is")

    # Act
    actual_output = ParserHelper.collect_while_one_of_characters(source, 5, "is")

    # Assert
    assert expected_output == actual_output
def test_empty_string_with_good_index():
    """
    Verify that an empty string with a valid starting index yields the same
    index and an empty collected string.
    """

    # Arrange
    source = ""
    expected_output = (0, "")

    # Act
    actual_output = ParserHelper.collect_while_one_of_characters(source, 0, " !")

    # Assert
    assert expected_output == actual_output
def test_simple_case_from_start():
    """
    Verify that matching characters are collected when the starting index is
    the beginning of the string.
    """

    # Arrange
    source = "tata is a test"
    expected_output = (4, "tata")

    # Act
    actual_output = ParserHelper.collect_while_one_of_characters(source, 0, "at")

    # Assert
    assert expected_output == actual_output
def test_empty_string_with_bad_left_index():
    """
    Verify that an empty string combined with a negative starting index is
    reported as a (None, None) result.
    """

    # Arrange
    source = ""
    expected_output = (None, None)

    # Act
    actual_output = ParserHelper.collect_while_one_of_characters(source, -1, " !")

    # Assert
    assert expected_output == actual_output
def test_already_on_whitespace():
    """
    Verify that nothing is collected when the character at the starting index
    is not one of the characters to match.
    """

    # Arrange
    source = "this!is!a!test"
    expected_output = (9, "")

    # Act
    actual_output = ParserHelper.collect_while_one_of_characters(source, 9, "xyz")

    # Assert
    assert expected_output == actual_output
def __parse_raw_declaration(text_to_parse):
    """
    Parse a possible raw html declaration sequence, and return if it is valid.

    The text must begin with the declaration start character, followed by a
    non-empty declaration name, followed by at least one declaration
    whitespace character.

    :return: `text_to_parse` unchanged when it is a valid declaration,
        otherwise None.
    """
    if not ParserHelper.is_character_at_index_one_of(
        text_to_parse, 0, HtmlHelper.__raw_declaration_start_character
    ):
        return None

    name_end_index, declaration_name = ParserHelper.collect_while_one_of_characters(
        text_to_parse, 1, HtmlHelper.__html_block_4_continued_start
    )
    if not declaration_name:
        return None

    whitespace_count, _ = ParserHelper.collect_while_character(
        text_to_parse, name_end_index, HtmlHelper.__raw_declaration_whitespace
    )
    return text_to_parse if whitespace_count else None
def __handle_non_numeric_character_reference(
    inline_request: InlineRequest,
    inline_response: InlineResponse,
    source_text_size: int,
) -> None:
    """
    Handle a character reference that is not numeric in nature.

    A run of ASCII letters and digits is collected from the response's
    current index.  If the run is terminated by the reference end character
    and the full reference is in the entity map, the mapped value replaces
    it.  The results are written into `inline_response`.

    :param inline_request: Request supplying the `source_text` to parse.
    :param inline_response: Response that is updated in place.
    :param source_text_size: Length of the request's source text.
    """
    POGGER.debug("there")
    assert inline_response.new_index is not None
    source_text = inline_request.source_text
    end_index, entity_text = ParserHelper.collect_while_one_of_characters(
        source_text,
        inline_response.new_index,
        InlineHelper.__ascii_letters_and_digits,
    )

    if not entity_text:
        # Nothing usable followed the start character; emit it as-is.
        inline_response.new_string = (
            InlineHelper.character_reference_start_character
        )
        return

    assert end_index is not None
    entity_text = f"{InlineHelper.character_reference_start_character}{entity_text}"
    if (
        end_index < source_text_size
        and source_text[end_index]
        == InlineHelper.__character_reference_end_character
    ):
        end_index += 1
        entity_text += InlineHelper.__character_reference_end_character
        if entity_text in InlineHelper.__entity_map:
            # Record the raw reference before swapping in its translation.
            inline_response.new_string_unresolved = entity_text
            inline_response.original_string = entity_text
            entity_text = InlineHelper.__entity_map[entity_text]

    inline_response.new_string = entity_text
    inline_response.new_index = end_index
    POGGER.debug("there-->$<--", inline_response.new_string)
    POGGER.debug("there-->$<--", inline_response.new_string_unresolved)
def __parse_valid_uri_autolink(text_to_parse):
    """
    Parse a possible uri autolink and determine if it is valid.

    The text is accepted when it contains no start angle bracket, begins with
    an ASCII letter, has a scheme of 2 to 32 characters followed by the scheme
    end character, and the remainder contains no character with a code point
    of 32 or below.

    :param text_to_parse: Candidate autolink text.
    :return: A UriAutolinkMarkdownToken when the text is valid, else None.
    """
    # Guard against empty text: indexing [0] below would raise IndexError.
    if (
        not text_to_parse
        or InlineHelper.angle_bracket_start in text_to_parse
        or text_to_parse[0] not in string.ascii_letters
    ):
        return None

    path_index, scheme_remainder = ParserHelper.collect_while_one_of_characters(
        text_to_parse, 1, InlineHelper.__valid_scheme_characters
    )
    uri_scheme = text_to_parse[0] + scheme_remainder
    text_to_parse_size = len(text_to_parse)

    if not (
        2 <= len(uri_scheme) <= 32
        and path_index < text_to_parse_size
        and text_to_parse[path_index] == InlineHelper.__scheme_end_character
    ):
        return None

    # Everything after the scheme end character must be above code point 32.
    path_index += 1
    while path_index < text_to_parse_size and ord(text_to_parse[path_index]) > 32:
        path_index += 1
    if path_index == text_to_parse_size:
        return UriAutolinkMarkdownToken(text_to_parse)
    return None
def handle_inline_backtick(inline_request):
    """
    Handle the inline case of backticks for code spans.

    The opening run of code span characters is collected, then the text is
    searched for a closing run of exactly the same length.  Without a match,
    the opening run is returned as plain text.  With a match, a code span
    token is produced; one leading and one trailing space or newline is
    split off when the remaining content is not solely whitespace.

    :param inline_request: Request supplying `source_text` and `next_index`.
    :return: An InlineResponse describing either the literal backticks or the
        new code span token.
    """
    source_text = inline_request.source_text
    LOGGER.debug("before_collect>%s", str(inline_request.next_index))
    new_index, extracted_start_backticks = ParserHelper.collect_while_one_of_characters(
        source_text,
        inline_request.next_index,
        InlineHelper.code_span_bounds,
    )
    LOGGER.debug("after_collect>%s>%s", str(new_index), extracted_start_backticks)

    # Look for a closing run whose length exactly matches the opening run.
    opening_size = len(extracted_start_backticks)
    end_backtick_start_index = source_text.find(extracted_start_backticks, new_index)
    while end_backtick_start_index != -1:
        run_end_index, candidate_run = ParserHelper.collect_while_one_of_characters(
            source_text,
            end_backtick_start_index,
            InlineHelper.code_span_bounds,
        )
        if len(candidate_run) == opening_size:
            break
        end_backtick_start_index = source_text.find(
            extracted_start_backticks, run_end_index
        )

    inline_response = InlineResponse()
    if end_backtick_start_index == -1:
        inline_response.new_string = extracted_start_backticks
        inline_response.new_index = new_index
        return inline_response

    between_text = source_text[new_index:end_backtick_start_index]
    LOGGER.debug(
        "after_collect>%s>>%s>>%s<<",
        between_text,
        str(end_backtick_start_index),
        source_text[end_backtick_start_index:],
    )

    leading_whitespace = ""
    trailing_whitespace = ""
    padding_characters = (" ", "\n")
    if (
        len(between_text) > 2
        and between_text[0] in padding_characters
        and between_text[-1] in padding_characters
    ):
        inner_text = between_text[1:-1]
        # Only strip the padding when something other than whitespace remains.
        if inner_text.strip():
            leading_whitespace = between_text[0]
            trailing_whitespace = between_text[-1]
            between_text = inner_text

    newline_replacement = "\a\n\a \a"
    between_text = between_text.replace("\n", newline_replacement)
    leading_whitespace = leading_whitespace.replace("\n", newline_replacement)
    trailing_whitespace = trailing_whitespace.replace("\n", newline_replacement)
    between_text = InlineHelper.append_text("", between_text)
    LOGGER.debug("between_text>>%s<<", between_text)

    end_backtick_start_index += opening_size
    inline_response.new_string = ""
    inline_response.new_index = end_backtick_start_index
    inline_response.new_tokens = [
        InlineCodeSpanMarkdownToken(
            between_text,
            extracted_start_backticks,
            leading_whitespace,
            trailing_whitespace,
        )
    ]
    return inline_response
def __handle_numeric_character_reference_inner(
    source_text: str, new_index: int
) -> Tuple[str, int, Optional[str]]:
    """
    Handle a character reference that is numeric in nature.

    Parsing starts one character after `new_index`.  If that character is one
    of the hex start characters, up to 6 hexadecimal digits are accepted;
    otherwise up to 7 decimal digits are accepted.  The reference is only
    translated when it is terminated by the reference end character.

    :param source_text: Text being parsed.
    :param new_index: Index of the character that precedes the digits (or the
        hex start character) of the reference.
    :return: Tuple of (new_string, new_index, original_reference).  When the
        reference is translated, new_string is the resulting character and
        original_reference is the raw reference text including the trailing
        ";".  Otherwise new_string is the raw text consumed so far and
        original_reference is None.
    """
    original_reference: Optional[str] = None
    new_index += 1
    translated_reference = -1
    source_text_size = len(source_text)

    if (
        new_index < source_text_size
        and source_text[new_index]
        in InlineHelper.__hex_character_reference_start_character
    ):
        hex_char = source_text[new_index]
        new_index += 1
        end_index, collected_string = ParserHelper.collect_while_one_of_characters(
            source_text, new_index, string.hexdigits
        )
        assert end_index is not None
        assert collected_string is not None
        POGGER.debug(
            "&#x>>a>>$>>b>>$>>$",
            end_index,
            collected_string,
            source_text_size,
        )
        delta = end_index - new_index
        POGGER.debug("delta>>$>>", delta)
        if 1 <= delta <= 6:
            translated_reference = int(collected_string, 16)
        # The raw text is always recorded so that an untranslatable reference
        # can still be returned to the caller as plain text.
        new_string = (
            f"{InlineHelper.character_reference_start_character}"
            + f"{InlineHelper.__numeric_character_reference_start_character}{hex_char}{collected_string}"
        )
        new_index = end_index
    else:
        end_index, collected_string = ParserHelper.collect_while_one_of_characters(
            source_text, new_index, string.digits
        )
        assert end_index is not None
        assert collected_string is not None
        POGGER.debug(
            "&#>>a>>$>>b>>$>>$",
            end_index,
            collected_string,
            source_text_size,
        )
        delta = end_index - new_index
        POGGER.debug("delta>>$>>", delta)
        if 1 <= delta <= 7:
            translated_reference = int(collected_string)
        new_string = (
            f"{InlineHelper.character_reference_start_character}"
            + f"{InlineHelper.__numeric_character_reference_start_character}{collected_string}"
        )
        new_index = end_index

    if (
        translated_reference >= 0
        and new_index < source_text_size
        and source_text[new_index]
        == InlineHelper.__character_reference_end_character
    ):
        new_index += 1
        original_reference = f"{new_string};"
        # chr() raises ValueError above U+10FFFF, and the digit limits above
        # admit such values, so they get the same substitute as code point 0.
        if translated_reference == 0 or translated_reference > 0x10FFFF:
            new_string = InlineHelper.__invalid_reference_character_substitute
        else:
            new_string = chr(translated_reference)
    return new_string, new_index, original_reference
def handle_inline_backtick(inline_request: InlineRequest) -> InlineResponse:
    """
    Handle the inline case of backticks for code spans.

    The opening run of code span characters is collected, then the text is
    searched for a closing run of exactly the same length.  Building the
    response is delegated to the backtick response builder.  If the builder
    leaves the line delta at -1, the deltas are set to describe a move along
    the current line only.

    :param inline_request: Request supplying `source_text` and `next_index`.
    :return: The InlineResponse produced for the backtick sequence.
    """
    source_text = inline_request.source_text
    POGGER.debug("before_collect>$", inline_request.next_index)
    new_index, extracted_start_backticks = ParserHelper.collect_while_one_of_characters(
        source_text,
        inline_request.next_index,
        InlineHelper.code_span_bounds,
    )
    POGGER.debug("after_collect>$>$", new_index, extracted_start_backticks)
    assert new_index is not None
    assert extracted_start_backticks is not None

    # Look for a closing run whose length exactly matches the opening run.
    opening_size = len(extracted_start_backticks)
    closing_start_index = source_text.find(extracted_start_backticks, new_index)
    while closing_start_index != -1:
        run_end_index, candidate_run = ParserHelper.collect_while_one_of_characters(
            source_text,
            closing_start_index,
            InlineHelper.code_span_bounds,
        )
        assert candidate_run is not None
        if len(candidate_run) == opening_size:
            break
        closing_start_index = source_text.find(
            extracted_start_backticks, run_end_index
        )

    inline_response = InlineHelper.__build_backtick_response(
        inline_request,
        closing_start_index,
        extracted_start_backticks,
        new_index,
        opening_size,
    )
    assert inline_response.new_index is not None
    POGGER.debug(
        ">>delta_line_number>>$<<",
        inline_response.delta_line_number,
    )
    POGGER.debug(
        ">>delta_column_number>>$<<",
        inline_response.delta_column_number,
    )

    if inline_response.delta_line_number == -1:
        # No line delta was supplied: stay on this line and advance the
        # column by the number of characters consumed.
        consumed_size = inline_response.new_index - inline_request.next_index
        inline_response.delta_line_number = 0
        inline_response.delta_column_number = consumed_size
    return inline_response
def __parse_tag_attributes(text_to_parse, start_index):
    """
    Handle the parsing of the attributes for an open tag.

    The attribute name characters are collected, followed by any whitespace.
    If a name/value separator follows, the value is parsed as single-quoted,
    double-quoted, or unquoted, and any whitespace after it is consumed.

    :param text_to_parse: Text containing the attribute.
    :param start_index: Index at which the attribute name starts.
    :return: Tuple of (index after the attribute and its trailing whitespace,
        that trailing whitespace), or (None, -1) when a quoted value is not
        closed.
    """
    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
    )
    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, name_end_index
    )
    if not ParserHelper.is_character_at_index(
        text_to_parse,
        end_name_index,
        HtmlHelper.__html_attribute_name_value_separator,
    ):
        # Attribute without a value.
        return end_name_index, extracted_whitespace

    value_start_index, _ = ParserHelper.extract_any_whitespace(
        text_to_parse, end_name_index + 1
    )

    # Quoted values: try the single quote form first, then the double quote.
    for quote_characters in (
        HtmlHelper.__html_attribute_value_single,
        HtmlHelper.__html_attribute_value_double,
    ):
        if not ParserHelper.is_character_at_index_one_of(
            text_to_parse, value_start_index, quote_characters
        ):
            continue
        value_end_index, _ = ParserHelper.collect_until_character(
            text_to_parse, value_start_index + 1, quote_characters
        )
        if not ParserHelper.is_character_at_index(
            text_to_parse, value_end_index, quote_characters
        ):
            return None, -1
        value_end_index += 1
        break
    else:
        # Unquoted value: runs until one of the stop characters.
        value_end_index, _ = ParserHelper.collect_until_one_of_characters(
            text_to_parse,
            value_start_index,
            HtmlHelper.__unquoted_attribute_value_stop,
        )

    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, value_end_index
    )
    return end_name_index, extracted_whitespace
def __parse_tag_attributes(
    text_to_parse: str, start_index: int
) -> Tuple[Optional[int], Optional[str]]:
    """
    Handle the parsing of the attributes for an open tag.

    The attribute name characters are collected, followed by any whitespace.
    If a name/value separator follows, the value is parsed as single-quoted,
    double-quoted, or unquoted, and any whitespace after it is consumed.

    :param text_to_parse: Text containing the attribute.
    :param start_index: Index at which the attribute name starts.
    :return: Tuple of (index after the attribute and its trailing whitespace,
        that trailing whitespace), or (None, None) when a quoted value is not
        closed.
    """
    name_end_index, _ = ParserHelper.collect_while_one_of_characters(
        text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
    )
    assert name_end_index is not None
    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, name_end_index
    )
    assert end_name_index is not None
    if not ParserHelper.is_character_at_index(
        text_to_parse,
        end_name_index,
        HtmlHelper.__html_attribute_name_value_separator,
    ):
        # Attribute without a value.
        return end_name_index, extracted_whitespace

    value_start_index, _ = ParserHelper.extract_any_whitespace(
        text_to_parse, end_name_index + 1
    )
    assert value_start_index is not None
    value_end_index: Optional[int] = None

    # Quoted values: try the single quote form first, then the double quote.
    for quote_characters in (
        HtmlHelper.__html_attribute_value_single,
        HtmlHelper.__html_attribute_value_double,
    ):
        if not ParserHelper.is_character_at_index_one_of(
            text_to_parse, value_start_index, quote_characters
        ):
            continue
        value_end_index, _ = ParserHelper.collect_until_character(
            text_to_parse, value_start_index + 1, quote_characters
        )
        assert value_end_index is not None
        if not ParserHelper.is_character_at_index(
            text_to_parse, value_end_index, quote_characters
        ):
            return None, None
        value_end_index += 1
        break
    else:
        # Unquoted value: runs until one of the stop characters.
        value_end_index, _ = ParserHelper.collect_until_one_of_characters(
            text_to_parse,
            value_start_index,
            HtmlHelper.__unquoted_attribute_value_stop,
        )
        assert value_end_index is not None

    end_name_index, extracted_whitespace = ParserHelper.extract_any_whitespace(
        text_to_parse, value_end_index
    )
    return end_name_index, extracted_whitespace