示例#1
0
 def fix_fields(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         For each line, store its first 16 characters as the token info.
         For open/close bracket tokens, also store the four characters
         that precede the line's last character as the bracket count.
         The line is then passed to the handler that self.__state_dict
         holds for self.__state. Once the input is exhausted, the
         temporary output is renamed onto the original file and removed.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             self.__token_info = line[:16]
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = line[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = line[-5:-1]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # Report the unknown state. NOTE(review): the call below
                 # still runs and raises TypeError when handler is None.
                 sys.stderr.write(
                     'No matching state in module fields_small.py\n')
                 sys.stderr.write(self.__state + '\n')
             handler(line)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "fields_small.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#2
0
 def process_pict(self):
     """
     Filter the file line by line, diverting picture data elsewhere.

     Logic:
         For each line, store its first 16 characters as the token info
         and update the bracket counts from open/close bracket tokens.
         While self.__in_pict is false the line goes to self.__default
         (which also receives the output file object); otherwise it goes
         to self.__in_pict_func. The line is written to the output only
         when the chosen handler returns a true value.
         If a picture was found, the picture output object gets a closing
         "}" line and is closed.
         Finally the temporary output is renamed onto the original file
         and removed, and the picture directory is removed when
         self.__pict_count is zero.
     Returns:
         nothing (changes the original file)
     """
     self.__make_dir()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as write_obj:
             for line in read_obj:
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                 if self.__in_pict:
                     to_print = self.__in_pict_func(line)
                 else:
                     to_print = self.__default(line, write_obj)
                 if to_print:
                     write_obj.write(line)
             if self.__already_found_pict:
                 self.__write_pic_obj.write("}\n")
                 self.__write_pic_obj.close()
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "pict.data")
         try:
             copy_obj.copy_file(self.__pict_file, "pict.rtf")
         except Exception:
             # Copying the picture file is best effort (it may not exist
             # when no picture was found). Catching Exception rather than
             # using a bare except lets KeyboardInterrupt and SystemExit
             # propagate.
             pass
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     if self.__pict_count == 0:
         try:
             os.rmdir(self.__dir_name)
         except OSError:
             # Leave the directory in place if it cannot be removed.
             pass
示例#3
0
    def tokenize(self):
        """Split the file into tokens and write them back one per line.

        The whole file is read, split into pieces by
        self.__sub_reg_split, and each piece is passed through
        self.__unicode_process. Pieces of length zero are dropped. The
        remaining tokens are joined with newlines into the temporary
        file, which is then renamed onto the original file and removed.
        """
        with open_for_read(self.__file) as source:
            raw_text = source.read()

        # Substitute and split first, then fix up unicode piece by piece;
        # empty pieces are discarded.
        processed = map(self.__unicode_process,
                        self.__sub_reg_split(raw_text))
        tokens = [piece for piece in processed if len(piece) > 0]

        with open_for_write(self.__write_to) as target:
            target.write('\n'.join(tokens))

        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            # Optional copy of this stage's output (presumably for debugging).
            copier.copy_file(self.__write_to, "tokenize.data")
        copier.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
示例#4
0
 def convert_fonts(self):
     """
     Run every line through the current state's handler and report fonts.

     Required:
         nothing
     Returns:
         self.__special_font_dict, after its 'default-font' entry has
         been set
     Logic:
         For each line, store its first 16 characters as the token info
         and pass the line to the handler that self.__state_dict holds
         for self.__state.
         Afterwards the default font name is looked up in
         self.__font_table by self.__default_font_num; a missing or empty
         name becomes 'Not Defined'. The temporary output is then renamed
         onto the original file and removed.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             self.__token_info = line[:16]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # NOTE(review): the call below still runs and raises
                 # TypeError when no handler is registered.
                 sys.stderr.write(
                     'no matching state in module fonts.py\n' +
                     self.__state + '\n')
             handler(line)
     self.__special_font_dict['default-font'] = (
         self.__font_table.get(self.__default_font_num) or 'Not Defined')
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "fonts.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__special_font_dict
 def make_paragraph_def(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         self.__body_style_strings (the original file is changed too)
     Logic:
         Read one line at a time with readline(). Store the first 16
         characters as the token info and pass the line to the handler
         that self.__state_dict holds for self.__state. The empty string
         that readline() returns at end of file is dispatched as well
         before the loop stops.
         Both files are managed by ``with`` so they are closed even if a
         handler raises.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # NOTE(review): the call below still runs and raises
                     # TypeError when no handler is registered.
                     sys.stderr.write(
                         'no no matching state in module sections.py\n')
                     sys.stderr.write(self.__state + '\n')
                 action(line)
                 if not line:
                     # End of file: the final '' was still dispatched
                     # above, matching the previous loop's behaviour.
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__body_style_strings
示例#6
0
 def fix_preamble(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         For each line, store its first 16 characters as the token info
         and pass the line to the handler that self.__state_dict holds
         for self.__state. Once the input is exhausted, the temporary
         output is renamed onto the original file and removed.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             self.__token_info = line[:16]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # NOTE(review): the call below still runs and raises
                 # TypeError when no handler is registered.
                 sys.stderr.write(
                     'no matching state in module preamble_rest.py\n' +
                     self.__state + '\n')
             handler(line)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "preamble_div.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#7
0
 def add_brackets(self):
     """
     Run every line through the current state's handler, then keep the
     result only if the bracket check on the output succeeds.

     Logic:
         For each line, store its first 16 characters as the token info
         and update the bracket counts from open/close bracket tokens.
         The line is then passed to the handler that self.__state_dict
         holds for self.__state.
         If self.__check_brackets returns a true value for the temporary
         output, that output is renamed onto the original file. Otherwise
         the original is left alone and, when self.__run_level > 0, a
         warning is written to stderr. The temporary file is removed in
         both cases.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             self.__token_info = line[:16]
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = line[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = line[-5:-1]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # NOTE(review): the call below still runs and raises
                 # TypeError when no handler is registered.
                 sys.stderr.write(
                     'No matching state in module add_brackets.py\n'
                     '%s\n' % self.__state)
             handler(line)
     if self.__check_brackets(self.__write_to):
         copier = copy.Copy(bug_handler=self.__bug_handler)
         if self.__copy:
             # Optional copy of this stage's output (presumably for
             # debugging).
             copier.copy_file(self.__write_to, "add_brackets.data")
         copier.rename(self.__write_to, self.__file)
     elif self.__run_level > 0:
         sys.stderr.write(
             'Sorry, but this files has a mix of old and new RTF.\n'
             'Some characteristics cannot be converted.\n')
     os.remove(self.__write_to)
示例#8
0
 def insert_info(self):
     """
     Insert opening and closing table tags around the table markers.

     Returns:
         nothing (changes the original file)
     Raises:
         self.__bug_handler: when a table start marker is met while
             self.__table_data is empty and self.__run_level > 3.
     Logic:
         Copy the file line by line. Before a 'mi<mk<tabl-start' line,
         write an opening table tag: with the next entry of
         self.__table_data as '<key>value' attributes when one is
         available (the entry is then dropped from the list), or a plain
         opening tag otherwise. Before a 'mi<mk<table-end_' line, write
         the closing table tag.
         Both files are managed by ``with`` so they are closed even when
         the bug handler exception is raised.
     """
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             for line in read_obj:
                 if line == 'mi<mk<tabl-start\n':
                     if len(self.__table_data) > 0:
                         table_dict = self.__table_data[0]
                         self.__write_obj.write('mi<tg<open-att__<table')
                         for key, value in table_dict.items():
                             self.__write_obj.write(
                                 '<%s>%s' % (key, value))
                         self.__write_obj.write('\n')
                         # Consume the entry just written.
                         self.__table_data = self.__table_data[1:]
                     else:
                         # this shouldn't happen!
                         if self.__run_level > 3:
                             msg = 'Not enough data for each table\n'
                             raise self.__bug_handler(msg)
                         self.__write_obj.write('mi<tg<open______<table\n')
                 elif line == 'mi<mk<table-end_\n':
                     self.__write_obj.write('mi<tg<close_____<table\n')
                 self.__write_obj.write(line)
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "table_info.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
 def make_preamble_divisions(self):
     """
     Run every line of the file through the handler for the current state.

     Returns:
         self.__all_lists (the original file is changed too)
     Logic:
         Read one line at a time with readline(). Store the first 16
         characters as the token info. An open bracket token updates
         self.__ob_count and increments self.__ob_group; a close bracket
         token updates self.__cb_count and decrements self.__ob_group.
         The line is then passed to the handler that self.__state_dict
         holds for self.__state. The empty string that readline()
         returns at end of file is dispatched as well before the loop
         stops.
         Both files are managed by ``with`` so they are closed even if a
         handler raises.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                     self.__ob_group += 1
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                     self.__ob_group -= 1
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # NOTE(review): the call below still runs and raises
                     # TypeError when no handler is registered.
                     print(self.__state)
                 action(line)
                 if not line:
                     # End of file: the final '' was still dispatched
                     # above, matching the previous loop's behaviour.
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "preamble_div.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__all_lists
示例#10
0
 def make_paragraphs(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         For each line, store its first 16 characters as the token info
         and pass the line to the handler that self.__state_dict holds
         for self.__state. Once the input is exhausted, the temporary
         output is renamed onto the original file and removed.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             for line in read_obj:
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     try:
                         sys.stderr.write('no matching state in module paragraphs.py\n')
                         sys.stderr.write(self.__state + '\n')
                     except Exception:
                         # Reporting is best effort. Catching Exception
                         # rather than using a bare except lets
                         # KeyboardInterrupt and SystemExit propagate.
                         pass
                 # NOTE(review): this call raises TypeError when no handler
                 # is registered for the state.
                 action(line)
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "paragraphs.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#11
0
 def group_styles(self):
     """
     Run every line of the file through the handler for the current state.

     Required:
         nothing
     Returns:
         nothing (the original file will be changed)
     Logic:
         Read one line at a time with readline(). Store the first 16
         characters as the token info, let self.__get_style_name inspect
         the line, and pass the line to the handler that
         self.__state_dict holds for self.__state. The empty string that
         readline() returns at end of file is dispatched as well before
         the loop stops.
         Both files are managed by ``with`` so they are closed even if a
         handler raises.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 self.__get_style_name(line)
                 action = self.__state_dict.get(self.__state)
                 action(line)
                 if not line:
                     # End of file: the final '' was still dispatched
                     # above, matching the previous loop's behaviour.
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "group_styles.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#12
0
 def insert_info(self):
     """
     Insert the collected body styles just before the style table closes.

     Returns:
         nothing (changes the original file)
     Raises:
         self.__bug_handler: when the closing style-table tag is met
             while self.__list_of_styles is empty and
             self.__run_level > 3.
     Logic:
         Copy the file line by line. Before the
         'mi<tg<close_____<style-table' line, write the concatenated
         entries of self.__list_of_styles wrapped in 'styles-in-body'
         open and close tags, provided the list is not empty.
         Both files are managed by ``with`` so they are closed even when
         the bug handler exception is raised.
     """
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             for line in read_obj:
                 if line == 'mi<tg<close_____<style-table\n':
                     if len(self.__list_of_styles) > 0:
                         self.__write_obj.write(
                             'mi<tg<open______<styles-in-body\n')
                         the_string = ''.join(self.__list_of_styles)
                         self.__write_obj.write(the_string)
                         self.__write_obj.write(
                             'mi<tg<close_____<styles-in-body\n')
                     else:
                         # this shouldn't happen!
                         if self.__run_level > 3:
                             msg = 'Not enough data for each table\n'
                             raise self.__bug_handler(msg)
                         # Nothing is written in this case; an earlier
                         # version emitted an opening table tag here.
                 self.__write_obj.write(line)
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "body_styles.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#13
0
 def convert_styles(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line at a time with readline(). Store the first 16
         characters as the token info and pass the line to the handler
         that self.__state_dict holds for self.__state. The empty string
         that readline() returns at end of file is dispatched as well
         before the loop stops.
         Both files are managed by ``with`` so they are closed even if a
         handler raises.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # NOTE(review): the call below still runs and raises
                     # TypeError when no handler is registered.
                     sys.stderr.write(
                         'no matching state in module styles.py\n')
                     sys.stderr.write(self.__state + '\n')
                 action(line)
                 if not line:
                     # End of file: the final '' was still dispatched
                     # above, matching the previous loop's behaviour.
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "styles.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#14
0
 def convert_colors(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         For each line, increment self.__line, store the first 16
         characters as the token info, and pass the line to the handler
         that self.__state_dict holds for self.__state. Once the input
         is exhausted, the temporary output is renamed onto the original
         file and removed.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             for line in read_obj:
                 self.__line += 1
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     try:
                         # NOTE(review): the message names fonts.py although
                         # this method handles colors - confirm.
                         sys.stderr.write('no matching state in module fonts.py\n')
                         sys.stderr.write(self.__state + '\n')
                     except Exception:
                         # Reporting is best effort. Catching Exception
                         # rather than using a bare except lets
                         # KeyboardInterrupt and SystemExit propagate.
                         pass
                 # NOTE(review): this call raises TypeError when no handler
                 # is registered for the state.
                 action(line)
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "color.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#15
0
 def fix_info(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         For each line, store its first 16 characters as the token info
         and pass the line to the handler that self.__state_dict holds
         for self.__state. Once the input is exhausted, the temporary
         output is renamed onto the original file and removed.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             self.__token_info = line[:16]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # NOTE(review): the call below still runs and raises
                 # TypeError when no handler is registered.
                 sys.stderr.write(
                     'No matching state in module styles.py\n')
                 sys.stderr.write(self.__state + '\n')
             handler(line)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "info.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#16
0
 def delete_info(self):
     """Copy the file, keeping only the lines the state handlers accept.

     For each line, store its first 16 characters as the token info and
     update the bracket counts from open/close bracket tokens (such a
     token looks like ``ob<nu<open-brack<0001``). The handler that
     self.__state_dict holds for self.__state is then called with the
     line, and the line is written to the output only when the handler
     returns a true value. Returns self.__found_delete."""
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             self.__token_info = line[:16]
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = line[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = line[-5:-1]
             handler = self.__state_dict.get(self.__state)
             if not handler:
                 # NOTE(review): the call below still runs and fails when
                 # no handler is registered.
                 sys.stderr.write(
                     'No action in dictionary state is "%s" \n' %
                     self.__state)
             keep_line = handler(line)
             if keep_line:
                 self.__write_obj.write(line)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "delete_info.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__found_delete
    def process_tokens(self):
        """Validate the token stream and rewrite it in processed form.

        Each line, with newline characters removed, is one token. The
        first token must be ``\\{`` and the second must start with
        ``\\rtf``; a token containing a backslash followed by a space is
        rejected. A token starting with a backslash is converted by
        self.process_cw and written when the result is not None. Any
        other token is split with self.__utf_exp, and every non-empty
        field is written as a ``tx<ut`` line when it starts with ``&``
        and as a ``tx<nu`` line otherwise.

        Returns self.__return_code. Raises self.__exception_handler for
        the invalid cases above, for an empty file, and when
        self.__check_brackets reports a problem with the rewritten file.
        """
        line_count = 0
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as write_obj:
            for line_count, raw_line in enumerate(read_obj, 1):
                token = raw_line.replace("\n", "")
                if line_count == 1 and token != '\\{':
                    raise self.__exception_handler(
                        '\nInvalid RTF: document doesn\'t start with {\n')
                if line_count == 2 and token[0:4] != '\\rtf':
                    raise self.__exception_handler(
                        '\nInvalid RTF: document doesn\'t start with \\rtf \n')

                if '\\ ' in token:
                    msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                        % line_count
                    raise self.__exception_handler(msg)

                if token[:1] == "\\":
                    # Control word: written only if process_cw yields text.
                    converted = self.process_cw(token)
                    if converted is not None:
                        write_obj.write(converted)
                    continue

                for field in re.split(self.__utf_exp, token):
                    if not field:
                        continue
                    if field[0:1] == '&':
                        template = 'tx<ut<__________<%s\n'
                    else:
                        template = 'tx<nu<__________<%s\n'
                    write_obj.write(template % field)

        if not line_count:
            raise self.__exception_handler(
                '\nInvalid RTF: file appears to be empty.\n')

        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            # Optional copy of this stage's output (presumably for debugging).
            copier.copy_file(self.__write_to, "processed_tokens.data")
        copier.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

        if self.__check_brackets(self.__file):
            raise self.__exception_handler(
                '\nInvalid RTF: document does not have matching brackets.\n')
        return self.__return_code
示例#18
0
 def combine_borders(self):
     """
     Rewrite the file, routing lines by whether the state is 'border'.

     For each line, store its first five characters in
     self.__first_five. While self.__state is 'border' the line and the
     output object are passed to self.__border_func; otherwise whatever
     self.__default_func returns for the line is written to the output.
     The temporary output is then renamed onto the original file and
     removed.
     """
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as target:
         for line in source:
             self.__first_five = line[0:5]
             if self.__state != 'border':
                 target.write(self.__default_func(line))
                 continue
             self.__border_func(line, target)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "combine_borders.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#19
0
 def convert_to_tags(self):
     """
     Convert the file's tokens to tags, one line at a time.

     The output is opened first and self.__write_dec() is called before
     any input is read. For every input line the first 16 characters are
     stored as the token info and looked up in self.__state_dict; when an
     entry exists it is called with the line, otherwise the line is
     skipped.

     When self.__convert_utf or self.__bad_encoding is set, the output is
     renamed onto the original file and then copied line by line back
     into the temporary file. NOTE(review): any re-encoding would have to
     happen inside open_for_read/open_for_write - confirm.

     Finally the temporary output is renamed onto the original file and
     removed.
     """
     self.__initiate_values()
     with open_for_write(self.__write_to) as self.__write_obj:
         self.__write_dec()
         with open_for_read(self.__file) as source:
             for line in source:
                 self.__token_info = line[:16]
                 handler = self.__state_dict.get(self.__token_info)
                 if handler is None:
                     continue
                 handler(line)
     if self.__convert_utf or self.__bad_encoding:
         mover = copy.Copy(bug_handler=self.__bug_handler)
         mover.rename(self.__write_to, self.__file)
         with open_for_read(self.__file) as source, \
                 open_for_write(self.__write_to) as target:
             for line in source:
                 target.write(line)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "convert_to_tags.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#20
0
 def __convert_body(self):
     """
     Run every line of the file through the handler for the current
     body state.

     Sets self.__state to 'body'. For each line, stores its first 16
     characters as the token info and passes the line to the handler
     that self.__body_state_dict holds for self.__state. The temporary
     output is then renamed onto the original file and removed.
     """
     self.__state = 'body'
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             for line in read_obj:
                 self.__token_info = line[:16]
                 action = self.__body_state_dict.get(self.__state)
                 if action is None:
                     # write() takes exactly one string, so the state is
                     # formatted into the message; passing it as a second
                     # argument raised TypeError before anything was
                     # reported.
                     sys.stderr.write(
                         'error no state found in hex_2_utf8 %s\n'
                         % self.__state)
                 # NOTE(review): this call raises TypeError when no handler
                 # is registered for the state.
                 action(line)
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#21
0
 def separate_footnotes(self):
     """
     Move footnote content to the bottom of the file.

     First pass: every line is read with the main output and a temporary
     footnote holder file both open. The first 16 characters are stored
     as the token info and the bracket counts are updated from open/close
     bracket tokens. The line then goes to self.__in_footnote_func while
     self.__in_footnote is set, and to self.__default_sep otherwise.

     Second pass: the main output is reopened in append mode. A fixed run
     of closing markers/tags and a footnote-begin marker is written, then
     the holder file's lines, then a footnote-end marker.

     The holder file is deleted, and the temporary output is renamed onto
     the original file and removed.
     """
     self.__initiate_sep_values()
     self.__footnote_holder = better_mktemp()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj, \
             open_for_write(
                 self.__footnote_holder) as self.__write_to_foot_obj:
         for line in source:
             self.__token_info = line[:16]
             # Track the most recent opening and closing bracket counts.
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = line[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = line[-5:-1]
             if not self.__in_footnote:
                 self.__default_sep(line)
             else:
                 self.__in_footnote_func(line)
     closing_and_start = ('mi<mk<sect-close\n'
                          'mi<mk<body-close\n'
                          'mi<tg<close_____<section\n'
                          'mi<tg<close_____<body\n'
                          'mi<tg<close_____<doc\n'
                          'mi<mk<footnt-beg\n')
     with open_for_read(self.__footnote_holder) as held, \
             open_for_write(self.__write_to, append=True) as target:
         target.write(closing_and_start)
         for line in held:
             target.write(line)
         target.write('mi<mk<footnt-end\n')
     os.remove(self.__footnote_holder)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "footnote_separate.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#22
0
 def fix_endings(self):
     """
     Normalise line endings in the file to a single line feed.

     The file is read as bytes. Every carriage-return/line-feed pair and
     then every remaining carriage return is replaced by a line feed.
     When self.__replace_illegals is set, the data is additionally passed
     through clean_ascii_chars. The result is written to the temporary
     file, which is then renamed onto the original file and removed.
     """
     with open(self.__file, 'rb') as source:
         data = source.read()
     # Pairs must be handled first so a CRLF does not become two newlines.
     data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
     if self.__replace_illegals:
         # presumably strips illegal control characters - see
         # clean_ascii_chars for the exact set
         data = clean_ascii_chars(data)
     with open(self.__write_to, 'wb') as target:
         target.write(data)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "line_endings.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#23
0
 def fix_list_numbers(self):
     """
     Run every line of the file through the handler for the current state.

     Required:
         nothing
     Returns:
         nothing (the original file will be changed)
     Logic:
         Read one line at a time with readline(). Store the first 16
         characters as the token info and update the bracket counts from
         open/close bracket tokens. The line is then passed to the
         handler that self.__state_dict holds for self.__state. The
         empty string that readline() returns at end of file is
         dispatched as well before the loop stops.
         Both files are managed by ``with`` so they are closed even if a
         handler raises.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                 action = self.__state_dict.get(self.__state)
                 action(line)
                 if not line:
                     # End of file: the final '' was still dispatched
                     # above, matching the previous loop's behaviour.
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copy_obj.copy_file(self.__write_to, "list_numbers.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#24
0
 def form_tags(self):
     """
     Run every line of the file through the handler for the current state.

     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         For each line, the token info is 'text' when the line (minus its
         last character) is one of a fixed set of quote, dash and bullet
         tokens, and the line's first 16 characters otherwise. The line
         is passed to self.__set_list_func and then to the handler that
         self.__state_dict holds for self.__state. Once the input is
         exhausted, the temporary output is renamed onto the original
         file and removed.
     """
     text_tokens = (
         'tx<mc<__________<rdblquote',
         'tx<mc<__________<ldblquote',
         'tx<mc<__________<lquote',
         'tx<mc<__________<rquote',
         'tx<mc<__________<emdash',
         'tx<mc<__________<endash',
         'tx<mc<__________<bullet',
     )
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for line in source:
             if line[0:-1] in text_tokens:
                 self.__token_info = 'text'
             else:
                 self.__token_info = line[:16]
             self.__set_list_func(line)
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # NOTE(review): the call below still runs and raises
                 # TypeError when no handler is registered.
                 sys.stderr.write(
                     'No matching state in module inline.py\n')
                 sys.stderr.write(self.__state + '\n')
             handler(line)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         # Optional copy of this stage's output (presumably for debugging).
         copier.copy_file(self.__write_to, "inline.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#25
0
 def join_headers(self):
     """
     Finish header processing and replace the working file.

     Nothing happens unless self.__found_a_header is set. Otherwise a
     fresh temp path is stored in self.__write_to2, the state is reset to
     'body', and the work is delegated to __get_headers() followed by
     __join_from_temp(). Afterwards the open file objects are closed, an
     optional debug copy ("header_join.data") is made, the result is
     renamed over self.__file, and the temporary files are removed.

     NOTE(review): the previous docstring described the approach in terms
     of footnotes (strip them into a temporary file, then re-insert each
     one where its marker occurs in the main file); presumably the same
     scheme applies to headers here -- confirm against the helpers.
     """
     if self.__found_a_header:
         self.__write_to2 = better_mktemp()
         self.__state = 'body'
         self.__get_headers()
         self.__join_from_temp()
         # Both file objects must be closed before the rename below.
         self.__write_obj.close()
         self.__read_from_head_obj.close()
         copier = copy.Copy(bug_handler=self.__bug_handler)
         if self.__copy:
             copier.copy_file(self.__write_to, "header_join.data")
         copier.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)
         os.remove(self.__header_holder)
示例#26
0
 def fix_fields(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. Record the token info (the first 16
         characters) and, for open/close bracket tokens, the bracket
         count taken from the end of the line. Then hand the line to the
         handler registered in self.__state_dict for the current state.
         When the whole file has been processed, the output file
         replaces the original.
     """
     self.__initiate_values()
     # Context managers guarantee both files are closed even when a
     # handler raises part-way through the file.
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             # Iterating the file stops cleanly at EOF, so handlers are
             # never given a spurious empty "line" after the last one.
             for line in read_obj:
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     sys.stderr.write(
                         'No matching state in module fields_large.py\n')
                     sys.stderr.write(self.__state + '\n')
                 # NOTE(review): with no handler registered the call below
                 # still runs and raises TypeError after the diagnostic.
                 action(line)
     # Optionally keep a debug copy, then swap the result over the input.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "fields_large.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
示例#27
0
 def parse_rtf(self):
     """
     Parse the file by calling on other classes.

     A working temp file is obtained from __make_temp_file and then
     handed, as in_file/file, to a fixed sequence of stage objects. The
     order of the calls is: fix_endings, tokenize, process_tokens,
     delete_info, process_pict, combine_borders, separate_footnotes,
     separate_headers, fix_list_numbers, make_preamble_divisions,
     convert_hex_2_utf8 (preamble), convert_fonts, convert_colors,
     convert_styles, fix_info, fix_preamble, (add_brackets for old RTF),
     fix_fields (small, then large), make_sections, make_paragraphs,
     make_paragraph_def, body styles insert_info, make_table, table
     insert_info, optional make_lists / headings-to-sections /
     group_styles / group_borders, form_tags, convert_hex_2_utf8 (body),
     join_headers, join_footnotes, convert_to_tags, output.

     Requires:
         Nothing
     Returns:
         self.__exit_level. The converted document itself is emitted by
         the Output stage, which is given out_file and output_dir.
     Raises:
         InvalidRtfException -- re-raised with an extended message when
             process_tokens() fails (the temp file is removed first).
         RtfInvalidCodeException -- when the file looks like older RTF
             and self.__run_level is greater than 5.
     """
     self.__temp_file = self.__make_temp_file(self.__file)
     # if self.__debug_dir is true, then create a copy object,
     # set the directory to write to, remove files, and copy
     # the new temporary file to this directory
     if self.__debug_dir:
         copy_obj = copy.Copy(bug_handler=RtfInvalidCodeException, )
         copy_obj.set_dir(self.__debug_dir)
         copy_obj.remove_files()
         copy_obj.copy_file(self.__temp_file, "original_file")
     # Function to check if bracket are well handled
     # (only created in debug mode or when run_level is above 2)
     if self.__debug_dir or self.__run_level > 2:
         self.__check_brack_obj = check_brackets.CheckBrackets(
             file=self.__temp_file,
             bug_handler=RtfInvalidCodeException,
         )
     # convert Macintosh and Windows line endings to Unix line endings
     # why do this if you don't wb after?
     line_obj = line_endings.FixLineEndings(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
         replace_illegals=self.__replace_illegals,
     )
     return_value = line_obj.fix_endings()  # calibre return what?
     self.__return_code(return_value)
     # Tokenize stage
     tokenize_obj = tokenize.Tokenize(bug_handler=RtfInvalidCodeException,
                                      in_file=self.__temp_file,
                                      copy=self.__copy,
                                      run_level=self.__run_level)
     tokenize_obj.tokenize()
     # Token processing stage; this is the only stage whose failure is
     # caught and re-reported here.
     process_tokens_obj = process_tokens.ProcessTokens(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
         exception_handler=InvalidRtfException,
     )
     try:
         return_value = process_tokens_obj.process_tokens()
     except InvalidRtfException as msg:
         # Check to see if the file is correctly encoded
         encode_obj = default_encoding.DefaultEncoding(
             in_file=self.__temp_file,
             run_level=self.__run_level,
             bug_handler=RtfInvalidCodeException,
             check_raw=True,
             default_encoding=self.__default_encoding,
         )
         # The three values returned here are not used in this branch;
         # the code page is fetched separately via get_codepage() below.
         platform, code_page, default_font_num = encode_obj.find_default_encoding(
         )
         check_encoding_obj = check_encoding.CheckEncoding(
             bug_handler=RtfInvalidCodeException, )
         enc = encode_obj.get_codepage()
         # TODO: to check if cp is a good idea or if I should use a dict to convert
         enc = 'cp' + enc
         msg = '%s\nException in token processing' % str(msg)
         # A true result from check_encoding adds an encoding hint to
         # the error message.
         if check_encoding_obj.check_encoding(self.__file, enc):
             # NOTE(review): file_name is always bytes here, so under
             # Python 3 the %s below renders it as b'...' -- confirm
             # which Python version this targets.
             file_name = self.__file if isinstance(self.__file, bytes) \
                                 else self.__file.encode('utf-8')
             msg += '\nFile %s does not appear to be correctly encoded.\n' % file_name
         # Best-effort cleanup of the temp file before re-raising
         try:
             os.remove(self.__temp_file)
         except OSError:
             pass
         raise InvalidRtfException(msg)
     delete_info_obj = delete_info.DeleteInfo(
         in_file=self.__temp_file,
         copy=self.__copy,
         bug_handler=RtfInvalidCodeException,
         run_level=self.__run_level,
     )
     # found destination means {\*\destination
     # if found, the RTF should be newer RTF
     found_destination = delete_info_obj.delete_info()
     # __bracket_match is called after most stages with a label naming
     # the stage just run.
     # NOTE(review): presumably it uses the bracket checker created
     # above -- confirm in __bracket_match.
     self.__bracket_match('delete_data_info')
     # put picts in a separate file
     pict_obj = pict.Pict(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         orig_file=self.__file,
         out_file=self.__out_file,
         run_level=self.__run_level,
     )
     pict_obj.process_pict()
     self.__bracket_match('pict_data_info')
     # Border combining stage
     combine_obj = combine_borders.CombineBorders(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     combine_obj.combine_borders()
     self.__bracket_match('combine_borders_info')
     # Footnotes are separated here; footnote_obj is kept so that
     # join_footnotes() can be called on it near the end.
     footnote_obj = footnote.Footnote(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     footnote_obj.separate_footnotes()
     self.__bracket_match('separate_footnotes_info')
     # Headers are separated here; header_obj is kept so that
     # join_headers() can be called on it near the end.
     header_obj = header.Header(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     header_obj.separate_headers()
     self.__bracket_match('separate_headers_info')
     # List number stage
     list_numbers_obj = list_numbers.ListNumbers(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     list_numbers_obj.fix_list_numbers()
     self.__bracket_match('list_number_info')
     # Preamble division stage; its return value (list_of_lists) is
     # passed to MakeLists further down.
     preamble_div_obj = preamble_div.PreambleDiv(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     list_of_lists = preamble_div_obj.make_preamble_divisions()
     self.__bracket_match('make_preamble_divisions')
     # Determine platform, code page and default font number; these feed
     # the hex-to-utf8, fonts and preamble stages below. encode_obj is
     # also queried again for ConvertToTags at the end.
     encode_obj = default_encoding.DefaultEncoding(
         in_file=self.__temp_file,
         run_level=self.__run_level,
         bug_handler=RtfInvalidCodeException,
         default_encoding=self.__default_encoding,
     )
     platform, code_page, default_font_num = encode_obj.find_default_encoding(
     )
     # First hex-to-utf8 pass, restricted to the preamble. The same
     # object is reconfigured for the body later via update_values().
     hex2utf_obj = hex_2_utf8.Hex2Utf8(
         in_file=self.__temp_file,
         copy=self.__copy,
         area_to_convert='preamble',
         char_file=self.__char_data,
         default_char_map=code_page,
         run_level=self.__run_level,
         bug_handler=RtfInvalidCodeException,
         invalid_rtf_handler=InvalidRtfException,
     )
     hex2utf_obj.convert_hex_2_utf8()
     self.__bracket_match('hex_2_utf_preamble')
     # Fonts stage; the returned dict supplies 'default-font' below.
     fonts_obj = fonts.Fonts(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         default_font_num=default_font_num,
         run_level=self.__run_level,
     )
     special_font_dict = fonts_obj.convert_fonts()
     self.__bracket_match('fonts_info')
     # Colors stage
     color_obj = colors.Colors(
         in_file=self.__temp_file,
         copy=self.__copy,
         bug_handler=RtfInvalidCodeException,
         run_level=self.__run_level,
     )
     color_obj.convert_colors()
     self.__bracket_match('colors_info')
     # Styles stage
     style_obj = styles.Styles(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     style_obj.convert_styles()
     self.__bracket_match('styles_info')
     # Info stage (no bracket check follows this one)
     info_obj = info.Info(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     info_obj.fix_info()
     # NOTE(review): .get() is used here while the later lookup of the
     # same key uses [] -- if special_font_dict is a plain dict the two
     # differ when the key is missing; confirm which is intended.
     default_font = special_font_dict.get('default-font')
     preamble_rest_obj = preamble_rest.Preamble(
         file=self.__temp_file,
         copy=self.__copy,
         bug_handler=RtfInvalidCodeException,
         platform=platform,
         default_font=default_font,
         code_page=code_page)
     preamble_rest_obj.fix_preamble()
     self.__bracket_match('preamble_rest_info')
     old_rtf_obj = OldRtf(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         run_level=self.__run_level,
     )
     # RTF can actually have destination groups and old RTF.
     # BAH!
     old_rtf = old_rtf_obj.check_if_old_rtf()
     if old_rtf:
         # At the strictest run levels old RTF is treated as an error;
         # otherwise it is reported (run_level > 1) and brackets are added.
         if self.__run_level > 5:
             msg = 'Older RTF\n' \
             'self.__run_level is "%s"\n' % self.__run_level
             raise RtfInvalidCodeException(msg)
         if self.__run_level > 1:
             sys.stderr.write('File could be older RTF...\n')
         if found_destination:
             if self.__run_level > 1:
                 sys.stderr.write('File also has newer RTF.\n'
                                  'Will do the best to convert...\n')
         add_brackets_obj = add_brackets.AddBrackets(
             in_file=self.__temp_file,
             bug_handler=RtfInvalidCodeException,
             copy=self.__copy,
             run_level=self.__run_level,
         )
         add_brackets_obj.add_brackets()
     # Field stages: small fields first, then large fields
     fields_small_obj = fields_small.FieldsSmall(
         in_file=self.__temp_file,
         copy=self.__copy,
         bug_handler=RtfInvalidCodeException,
         run_level=self.__run_level,
     )
     fields_small_obj.fix_fields()
     self.__bracket_match('fix_small_fields_info')
     fields_large_obj = fields_large.FieldsLarge(
         in_file=self.__temp_file,
         copy=self.__copy,
         bug_handler=RtfInvalidCodeException,
         run_level=self.__run_level)
     fields_large_obj.fix_fields()
     self.__bracket_match('fix_large_fields_info')
     # Sections stage
     sections_obj = sections.Sections(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     sections_obj.make_sections()
     self.__bracket_match('sections_info')
     # Paragraphs stage
     paragraphs_obj = paragraphs.Paragraphs(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         write_empty_para=self.__empty_paragraphs,
         run_level=self.__run_level,
     )
     paragraphs_obj.make_paragraphs()
     self.__bracket_match('paragraphs_info')
     default_font = special_font_dict['default-font']
     # Paragraph definition stage; its result (list_of_styles) is handed
     # straight to BodyStyles.
     paragraph_def_obj = paragraph_def.ParagraphDef(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         default_font=default_font,
         run_level=self.__run_level,
     )
     list_of_styles = paragraph_def_obj.make_paragraph_def()
     body_styles_obj = body_styles.BodyStyles(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         list_of_styles=list_of_styles,
         run_level=self.__run_level,
     )
     body_styles_obj.insert_info()
     # Two bracket checks in a row: one labelled for body styles and one
     # for the paragraph definitions processed just before it.
     self.__bracket_match('body_styles_info')
     self.__bracket_match('paragraph_def_info')
     # Table stage; table_data is handed to TableInfo.
     table_obj = table.Table(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     table_data = table_obj.make_table()
     self.__bracket_match('table_info')
     table_info_obj = table_info.TableInfo(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         table_data=table_data,
         run_level=self.__run_level,
     )
     table_info_obj.insert_info()
     self.__bracket_match('table__data_info')
     # The next four stages are optional and controlled by instance flags.
     if self.__form_lists:
         make_list_obj = make_lists.MakeLists(
             in_file=self.__temp_file,
             bug_handler=RtfInvalidCodeException,
             copy=self.__copy,
             headings_to_sections=self.__headings_to_sections,
             run_level=self.__run_level,
             list_of_lists=list_of_lists,
         )
         make_list_obj.make_lists()
         self.__bracket_match('form_lists_info')
     if self.__headings_to_sections:
         headings_to_sections_obj = headings_to_sections.HeadingsToSections(
             in_file=self.__temp_file,
             bug_handler=RtfInvalidCodeException,
             copy=self.__copy,
             run_level=self.__run_level,
         )
         headings_to_sections_obj.make_sections()
         self.__bracket_match('headings_to_sections_info')
     if self.__group_styles:
         group_styles_obj = group_styles.GroupStyles(
             in_file=self.__temp_file,
             bug_handler=RtfInvalidCodeException,
             copy=self.__copy,
             wrap=1,
             run_level=self.__run_level,
         )
         group_styles_obj.group_styles()
         self.__bracket_match('group_styles_info')
     if self.__group_borders:
         group_borders_obj = group_borders.GroupBorders(
             in_file=self.__temp_file,
             bug_handler=RtfInvalidCodeException,
             copy=self.__copy,
             wrap=1,
             run_level=self.__run_level,
         )
         group_borders_obj.group_borders()
         self.__bracket_match('group_borders_info')
     # Inline stage
     inline_obj = inline.Inline(
         in_file=self.__temp_file,
         bug_handler=RtfInvalidCodeException,
         copy=self.__copy,
         run_level=self.__run_level,
     )
     inline_obj.form_tags()
     self.__bracket_match('inline_info')
     # Second hex-to-utf8 pass: reuse the earlier object, now pointed at
     # the body and given the symbol/caps conversion options.
     hex2utf_obj.update_values(
         file=self.__temp_file,
         area_to_convert='body',
         copy=self.__copy,
         char_file=self.__char_data,
         convert_caps=self.__convert_caps,
         convert_symbol=self.__convert_symbol,
         convert_wingdings=self.__convert_wingdings,
         convert_zapf=self.__convert_zapf,
         symbol=1,
         wingdings=1,
         dingbats=1,
     )
     hex2utf_obj.convert_hex_2_utf8()
     # Re-join the headers and footnotes that were separated earlier
     header_obj.join_headers()
     footnote_obj.join_footnotes()
     # Tag conversion stage, using the code page reported by encode_obj
     tags_obj = convert_to_tags.ConvertToTags(
         in_file=self.__temp_file,
         copy=self.__copy,
         dtd_path=self.__dtd_path,
         indent=self.__indent,
         run_level=self.__run_level,
         no_dtd=self.__no_dtd,
         encoding=encode_obj.get_codepage(),
         bug_handler=RtfInvalidCodeException,
     )
     tags_obj.convert_to_tags()
     # Output stage, then remove the working temp file
     output_obj = output.Output(
         file=self.__temp_file,
         orig_file=self.__file,
         output_dir=self.__out_dir,
         out_file=self.__out_file,
     )
     output_obj.output()
     os.remove(self.__temp_file)
     return self.__exit_level