Пример #1
0
    def tokenize(self):
        """Tokenize the input file (changes the original file).

        The whole file is read, split into tokens by ``__sub_reg_split``,
        every token is passed through ``__unicode_process``, and zero-length
        results are dropped. The remaining tokens are written one per line
        to the temporary output file, which is then handed to ``copy.Copy``
        and removed.
        """
        with open_for_read(self.__file) as source:
            raw_text = source.read()

        # Split first, then post-process each token. Tokens that end up
        # empty are discarded so they do not become blank output lines.
        split_tokens = self.__sub_reg_split(raw_text)
        tokens = [
            token
            for token in map(self.__unicode_process, split_tokens)
            if len(token) > 0
        ]

        with open_for_write(self.__write_to) as target:
            target.write('\n'.join(tokens))

        # When self.__copy is set, also store the output as "tokenize.data";
        # then rename the output onto the input path and delete the
        # temporary file.
        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copier.copy_file(self.__write_to, "tokenize.data")
        copier.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
Пример #2
0
 def fix_preamble(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Feed the file, one line at a time, to the handler that
         self.__state_dict holds for the current self.__state. The first
         16 characters of each line are stored in self.__token_info before
         the handler is called.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for row in source:
             self.__token_info = row[:16]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # Report the unknown state. The call below then fails
                 # with TypeError, because None is not callable.
                 complaint = (
                     'no matching state in module preamble_rest.py\n' +
                     self.__state + '\n')
                 sys.stderr.write(complaint)
             handler(row)
     # When self.__copy is set, also store the output as
     # "preamble_div.data"; then rename the output onto the input path
     # and delete the temporary file.
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "preamble_div.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #3
0
 def insert_info(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Copy the file line by line. Immediately before the line that
         closes the style table, write the strings collected in
         self.__list_of_styles, wrapped in a styles-in-body element.
     Raises:
         self.__bug_handler if nothing was collected and
         self.__run_level is above 3.
     """
     # Context managers close both files even when the bug handler is
     # raised part-way through the copy.
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             for line in read_obj:
                 if line == 'mi<tg<close_____<style-table\n':
                     if len(self.__list_of_styles) > 0:
                         self.__write_obj.write(
                             'mi<tg<open______<styles-in-body\n')
                         the_string = ''.join(self.__list_of_styles)
                         self.__write_obj.write(the_string)
                         self.__write_obj.write(
                             'mi<tg<close_____<styles-in-body\n')
                     elif self.__run_level > 3:
                         # this shouldn't happen!
                         msg = 'Not enough data for each table\n'
                         # Call form of raise: valid on Python 2 and 3.
                         raise self.__bug_handler(msg)
                 self.__write_obj.write(line)
     # When self.__copy is set, also store the output as
     # "body_styles.data"; then rename the output onto the input path
     # and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "body_styles.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #4
0
 def insert_info(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Copy the file line by line. Before each table-start marker,
         write an opening table tag whose attributes come from the first
         entry of self.__table_data, and drop that entry from the list.
         Before each table-end marker, write a closing table tag.
     Raises:
         self.__bug_handler if a table-start marker is found when
         self.__table_data is empty and self.__run_level is above 3.
     """
     # Context managers close both files even when the bug handler is
     # raised part-way through the copy.
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             for line in read_obj:
                 if line == 'mi<mk<tabl-start\n':
                     if len(self.__table_data) > 0:
                         table_dict = self.__table_data[0]
                         self.__write_obj.write('mi<tg<open-att__<table')
                         for key in table_dict:
                             self.__write_obj.write(
                                 '<%s>%s' % (key, table_dict[key]))
                         self.__write_obj.write('\n')
                         # Slice rather than pop, so the list object
                         # itself is not mutated.
                         self.__table_data = self.__table_data[1:]
                     else:
                         # this shouldn't happen!
                         if self.__run_level > 3:
                             msg = 'Not enough data for each table\n'
                             # Call form of raise: valid on Python 2
                             # and 3.
                             raise self.__bug_handler(msg)
                         # Below the strict run level, fall back to a
                         # table tag without attributes.
                         self.__write_obj.write('mi<tg<open______<table\n')
                 elif line == 'mi<mk<table-end_\n':
                     self.__write_obj.write('mi<tg<close_____<table\n')
                 self.__write_obj.write(line)
     # When self.__copy is set, also store the output as
     # "table_info.data"; then rename the output onto the input path
     # and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "table_info.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #5
0
 def fix_fields(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Walk the file line by line. The first 16 characters of a line
         are kept in self.__token_info; for open-bracket and
         close-bracket lines, line[-5:-1] is remembered in
         self.__ob_count / self.__cb_count. The line is then passed to
         the handler that self.__state_dict holds for self.__state.
     """
     self.__initiate_values()
     with open(self.__file, 'r') as source, \
             open(self.__write_to, 'w') as self.__write_obj:
         for row in source:
             self.__token_info = row[:16]
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = row[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = row[-5:-1]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # Report the unknown state. The call below then fails
                 # with TypeError, because None is not callable.
                 sys.stderr.write(
                     'No matching state in module fields_small.py\n')
                 sys.stderr.write(self.__state + '\n')
             handler(row)
     # When self.__copy is set, also store the output as
     # "fields_small.data"; then rename the output onto the input path
     # and delete the temporary file.
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "fields_small.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #6
0
 def group_styles(self):
     """
     Required:
         nothing
     Returns:
         nothing (original file will be changed)
     Logic:
         Read one line at a time. For every line, store its first 16
         characters in self.__token_info, call self.__get_style_name,
         and then call the handler that self.__state_dict holds for the
         current self.__state.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 self.__get_style_name(line)
                 action = self.__state_dict.get(self.__state)
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handlers once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as
     # "group_styles.data"; then rename the output onto the input path
     # and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "group_styles.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #7
0
 def add_brackets(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file when the bracket check on
         the output succeeds)
     Logic:
         Walk the file line by line, remembering line[-5:-1] of
         open-bracket and close-bracket lines, and pass each line to the
         handler that self.__state_dict holds for self.__state. The
         output only replaces the input if self.__check_brackets
         accepts it; otherwise a warning is written to stderr when
         self.__run_level is above 0.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for row in source:
             self.__token_info = row[:16]
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = row[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = row[-5:-1]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # Report the unknown state. The call below then fails
                 # with TypeError, because None is not callable.
                 sys.stderr.write(
                     'No matching state in module add_brackets.py\n'
                     '%s\n' % self.__state)
             handler(row)
     # Only accept the rewritten file if its brackets check out.
     brackets_ok = self.__check_brackets(self.__write_to)
     if brackets_ok:
         copier = copy.Copy(bug_handler=self.__bug_handler)
         if self.__copy:
             copier.copy_file(self.__write_to, "add_brackets.data")
         copier.rename(self.__write_to, self.__file)
     elif self.__run_level > 0:
         sys.stderr.write(
             'Sorry, but this files has a mix of old and new RTF.\n'
             'Some characteristics cannot be converted.\n')
     # The temporary file is removed in either case.
     os.remove(self.__write_to)
Пример #8
0
 def make_table(self):
     """
     Requires:
         nothing
     Returns:
         self.__table_data, as left by the state handlers.
     Logic:
         Read one line in at a time. Determine what action to take based
         on the state; the current state is the last item of
         self.__state.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state[-1])
                 if action is None:
                     # Report the unknown state. The call below then
                     # fails with TypeError (None is not callable).
                     sys.stderr.write(
                         'No matching state in module table.py\n')
                     sys.stderr.write(self.__state[-1] + '\n')
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handler once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as "table.data";
     # then rename the output onto the input path and delete the
     # temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "table.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__table_data
Пример #9
0
 def make_preamble_divisions(self):
     """
     Requires:
         nothing
     Returns:
         self.__all_lists, as left by the state handlers (the original
         file is changed as well).
     Logic:
         Read one line at a time. Open-bracket lines store line[-5:-1]
         in self.__ob_count and increment self.__ob_group; close-bracket
         lines store it in self.__cb_count and decrement
         self.__ob_group. Each line is then passed to the handler that
         self.__state_dict holds for self.__state.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                     self.__ob_group += 1
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                     self.__ob_group -= 1
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # Call form of print: same output on Python 2,
                     # and valid syntax on Python 3. The call below
                     # then fails with TypeError (None is not callable).
                     print(self.__state)
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handler once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as
     # "preamble_div.data"; then rename the output onto the input path
     # and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "preamble_div.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__all_lists
Пример #10
0
 def convert_styles(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. Determine what action to take based on
         the state. If the state is before the style table, look for the
         beginning of the style table.
         If the state is in the style table, create the style dictionary
         and print out the tags.
         If the state is after the style table, look for lines with style
         info, and substitute the number with the name of the style.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # Report the unknown state. The call below then
                     # fails with TypeError (None is not callable).
                     sys.stderr.write(
                         'no matching state in module styles.py\n')
                     sys.stderr.write(self.__state + '\n')
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handler once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as "styles.data";
     # then rename the output onto the input path and delete the
     # temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "styles.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #11
0
 def delete_info(self):
     """Filter the file line by line (changes the original file).

     Each line is passed to the handler that self.__state_dict holds for
     self.__state, and is written to the output only when that handler
     returns a true value. Returns self.__found_delete.
     """
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for row in source:
             # Lines look like "ob<nu<open-brack<0001": a 16-character
             # token followed by a 4-character count before the newline.
             self.__token_info = row[:16]
             if self.__token_info == 'ob<nu<open-brack':
                 self.__ob_count = row[-5:-1]
             elif self.__token_info == 'cb<nu<clos-brack':
                 self.__cb_count = row[-5:-1]
             handler = self.__state_dict.get(self.__state)
             if not handler:
                 # Report the unknown state. The call below then fails
                 # with TypeError when the handler is None.
                 sys.stderr.write(
                     'No action in dictionary state is "%s" \n'
                     % self.__state)
             keep = handler(row)
             if keep:
                 self.__write_obj.write(row)
     # When self.__copy is set, also store the output as
     # "delete_info.data"; then rename the output onto the input path
     # and delete the temporary file.
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "delete_info.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__found_delete
Пример #12
0
 def make_paragraph_def(self):
     """
     Requires:
         nothing
     Returns:
         self.__body_style_strings, as left by the state handlers (the
         original file is changed as well).
     Logic:
         Read one line in at a time. Determine what action to take based on
         the state.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # Report the unknown state. The call below then
                     # fails with TypeError (None is not callable).
                     sys.stderr.write(
                         'no no matching state in module sections.py\n')
                     sys.stderr.write(self.__state + '\n')
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handler once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as
     # "paragraphs_def.data"; then rename the output onto the input path
     # and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__body_style_strings
Пример #13
0
 def convert_fonts(self):
     """
     Required:
         nothing
     Returns:
         self.__special_font_dict, with its 'default-font' entry set
         from self.__font_table (or 'Not Defined' when the default font
         number has no usable entry).
     Logic:
         Walk the file line by line and pass each line to the handler
         that self.__state_dict holds for self.__state. The first 16
         characters of each line are stored in self.__token_info before
         the handler is called.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for row in source:
             self.__token_info = row[:16]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # Report the unknown state. The call below then fails
                 # with TypeError, because None is not callable.
                 complaint = (
                     'no matching state in module fonts.py\n' +
                     self.__state + '\n')
                 sys.stderr.write(complaint)
             handler(row)
     # A missing or empty name falls back to the placeholder.
     self.__special_font_dict['default-font'] = (
         self.__font_table.get(self.__default_font_num) or 'Not Defined')
     # When self.__copy is set, also store the output as "fonts.data";
     # then rename the output onto the input path and delete the
     # temporary file.
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "fonts.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     return self.__special_font_dict
Пример #14
0
 def make_paragraphs(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. Determine what action to take based on
         the state. If the state is before the body, look for the
         beginning of the body.
         When the body is found, change the state to 'not_paragraph'. The
         only other state is 'paragraph'.
     """
     self.__initiate_values()
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             for line in read_obj:
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # Best-effort diagnostic. Only ordinary errors are
                     # ignored, so KeyboardInterrupt and SystemExit are
                     # no longer swallowed here.
                     try:
                         sys.stderr.write(
                             'no matching state in module paragraphs.py\n')
                         sys.stderr.write(self.__state + '\n')
                     except Exception:
                         pass
                 # With no handler this call raises TypeError.
                 action(line)
     # When self.__copy is set, also store the output as
     # "paragraphs.data"; then rename the output onto the input path
     # and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "paragraphs.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #15
0
 def convert_colors(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. Determine what action to take based on
         the state. If the state is before the color table, look for the
         beginning of the color table.
         If the state is in the color table, create the color dictionary
         and print out the tags.
         If the state is after the color table, look for lines with color
         info, and substitute the number with the hex number.
         self.__line is incremented once per line read.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             for line in read_obj:
                 self.__line += 1
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # Best-effort diagnostic. Only ordinary errors are
                     # ignored, so KeyboardInterrupt and SystemExit are
                     # no longer swallowed here.
                     try:
                         sys.stderr.write(
                             'no matching state in module fonts.py\n')
                         sys.stderr.write(self.__state + '\n')
                     except Exception:
                         pass
                 # With no handler this call raises TypeError.
                 action(line)
     # When self.__copy is set, also store the output as "color.data";
     # then rename the output onto the input path and delete the
     # temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "color.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #16
0
 def make_sections(self):
     """
     Required:
         nothing
     Returns:
         nothing (original file will be changed)
     Logic:
         Read one line at a time, store its first 16 characters in
         self.__token_info, and pass the line to the handler that
         self.__state_dict holds for the current self.__state.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handler once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as
     # "sections_to_headings.data"; then rename the output onto the
     # input path and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "sections_to_headings.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #17
0
 def process_pict(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Walk the file line by line, remembering line[-5:-1] of
         open-bracket and close-bracket lines. Outside a picture the
         line goes to self.__default, inside one to
         self.__in_pict_func; the line is copied to the output only
         when that call returns a true value. If a picture was found,
         the picture file is finished with a closing brace and closed.
         The directory self.__dir_name is removed again when
         self.__pict_count is 0.
     """
     self.__make_dir()
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as write_obj:
             for line in read_obj:
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                 if not self.__in_pict:
                     to_print = self.__default(line, write_obj)
                 else:
                     to_print = self.__in_pict_func(line)
                 if to_print:
                     write_obj.write(line)
             if self.__already_found_pict:
                 self.__write_pic_obj.write("}\n")
                 self.__write_pic_obj.close()
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "pict.data")
         # Copying the picture file stays best-effort. Only ordinary
         # errors are ignored, so KeyboardInterrupt and SystemExit are
         # no longer swallowed.
         try:
             copy_obj.copy_file(self.__pict_file, "pict.rtf")
         except Exception:
             pass
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     if self.__pict_count == 0:
         # No pictures were counted: try to remove the directory, and
         # ignore the failure if it cannot be removed.
         try:
             os.rmdir(self.__dir_name)
         except OSError:
             pass
0
 def fix_info(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Walk the file line by line and pass each line to the handler
         that self.__state_dict holds for self.__state. The first 16
         characters of each line are stored in self.__token_info before
         the handler is called.
     """
     self.__initiate_values()
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as self.__write_obj:
         for row in source:
             self.__token_info = row[:16]
             handler = self.__state_dict.get(self.__state)
             if handler is None:
                 # Report the unknown state. The call below then fails
                 # with TypeError, because None is not callable.
                 sys.stderr.write('No matching state in module styles.py\n')
                 sys.stderr.write(self.__state + '\n')
             handler(row)
     # When self.__copy is set, also store the output as "info.data";
     # then rename the output onto the input path and delete the
     # temporary file.
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "info.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #19
0
 def make_sections(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. Determine what action to take based on
         the state. If the state is before the body, look for the
         beginning of the body.
         If the state is body, send the line to the body method.
     """
     self.__initiate_values()
     # Context managers close both files even when a handler raises.
     with open_for_read(self.__file) as read_obj:
         with open_for_write(self.__write_to) as self.__write_obj:
             while True:
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # Report the unknown state. The call below then
                     # fails with TypeError (None is not callable).
                     sys.stderr.write(
                         'no matching state in module sections.py\n')
                     sys.stderr.write(self.__state + '\n')
                 action(line)
                 if not line:
                     # The empty string read at end of file is passed
                     # to the handler once, as before; then stop.
                     break
     # When self.__copy is set, also store the output as
     # "sections.data"; then rename the output onto the input path and
     # delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "sections.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #20
0
    def process_tokens(self):
        """Validate and convert the token file (changes the original file).

        Every line is one token. The first token must be ``\\{`` and the
        second must start with ``\\rtf``; a token containing backslash-space
        is rejected. Tokens starting with a backslash must decode as
        us-ascii and are converted by ``process_cw``; any other token is
        split with ``self.__utf_exp`` and each non-empty piece is written
        as a ``tx<ut`` line (pieces starting with ``&``) or a ``tx<nu``
        line.

        Returns ``self.__return_code``. Raises ``self.__exception_handler``
        for invalid input, an empty file, or when ``__check_brackets``
        reports bad brackets in the result.
        """
        line_count = 0
        with open(self.__file, 'r') as source, \
                open(self.__write_to, 'wb') as sink:
            for line_count, raw_line in enumerate(source, 1):
                token = raw_line.replace("\n", "")
                # Header checks apply to the first two tokens only.
                if line_count == 1 and token != '\\{':
                    raise self.__exception_handler(
                        '\nInvalid RTF: document doesn\'t start with {\n')
                if line_count == 2 and token[0:4] != '\\rtf':
                    raise self.__exception_handler(
                        '\nInvalid RTF: document doesn\'t start with \\rtf \n')

                if '\\ ' in token:
                    raise self.__exception_handler(
                        '\nInvalid RTF: token "\\ " not valid.\nError at line %d'
                        % line_count)

                if token[:1] == "\\":
                    # Control word: it has to be plain ascii.
                    try:
                        token.decode('us-ascii')
                    except UnicodeError as err:
                        raise self.__exception_handler(
                            '\nInvalid RTF: Tokens not ascii encoded.\n%s\nError at line %d'
                            % (str(err), line_count))
                    converted = self.process_cw(token)
                    if converted is not None:
                        sink.write(converted)
                    continue

                # Text token: split it and tag every non-empty piece.
                for piece in re.split(self.__utf_exp, token):
                    if not piece:
                        continue
                    if piece[0:1] == '&':
                        template = 'tx<ut<__________<%s\n'
                    else:
                        template = 'tx<nu<__________<%s\n'
                    sink.write(template % piece)

        if not line_count:
            raise self.__exception_handler(
                '\nInvalid RTF: file appears to be empty.\n')

        # When self.__copy is set, also store the output as
        # "processed_tokens.data"; then rename the output onto the input
        # path and delete the temporary file.
        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copier.copy_file(self.__write_to, "processed_tokens.data")
        copier.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

        if self.__check_brackets(self.__file):
            raise self.__exception_handler(
                '\nInvalid RTF: document does not have matching brackets.\n')
        return self.__return_code
Пример #21
0
 def convert_to_tags(self):
     """
     Write the declaration via self.__write_dec, then read the input one
     line at a time. The first 16 characters of a line select a handler
     from self.__state_dict; lines without a handler are skipped.

     When self.__convert_utf or self.__bad_encoding is set, the output is
     passed through a second time and recoded with EncodedFile: to
     us-ascii if self.__bad_encoding is set, otherwise to utf-8.

     Returns nothing (changes the original file).
     """
     self.__initiate_values()
     with open(self.__write_to, 'w') as self.__write_obj:
         self.__write_dec()
         with open(self.__file, 'r') as source:
             for row in source:
                 self.__token_info = row[:16]
                 handler = self.__state_dict.get(self.__token_info)
                 if handler is None:
                     continue
                 handler(row)
     # convert all encodings to UTF8 or ASCII to avoid unsupported
     # encodings in lxml
     if self.__convert_utf or self.__bad_encoding:
         copy_obj = copy.Copy(bug_handler=self.__bug_handler)
         copy_obj.rename(self.__write_to, self.__file)
         target_encoding = "us-ascii" if self.__bad_encoding else "utf-8"
         with open(self.__file, 'r') as source, \
                 open(self.__write_to, 'w') as sink:
             recoder = EncodedFile(sink, self.__encoding,
                                   target_encoding, 'replace')
             for row in source:
                 recoder.write(row)
     # When self.__copy is set, also store the output as
     # "convert_to_tags.data"; then rename the output onto the input
     # path and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #22
0
 def replace_illegals(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Copy the file line by line, passing every line through
         clean_ascii_chars before it is written.
     """
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as sink:
         for row in source:
             cleaned = clean_ascii_chars(row)
             sink.write(cleaned)
     # When self.__copy is set, also store the output as
     # "replace_illegals.data"; then rename the output onto the input
     # path and delete the temporary file.
     copier = copy.Copy()
     if self.__copy:
         copier.copy_file(self.__write_to, "replace_illegals.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #23
0
 def combine_borders(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Walk the file line by line, keeping the first five characters
         of the line in self.__first_five. While self.__state is
         'border' the line is handed to self.__border_func together
         with the output file; otherwise whatever self.__default_func
         returns for the line is written to the output.
     """
     with open_for_read(self.__file) as source, \
             open_for_write(self.__write_to) as sink:
         for row in source:
             self.__first_five = row[0:5]
             if self.__state != 'border':
                 sink.write(self.__default_func(row))
             else:
                 self.__border_func(row, sink)
     # When self.__copy is set, also store the output as
     # "combine_borders.data"; then rename the output onto the input
     # path and delete the temporary file.
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "combine_borders.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #24
0
 def __convert_body(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Set self.__state to 'body', then walk the file line by line and
         pass each line to the handler that self.__body_state_dict holds
         for self.__state. The first 16 characters of each line are
         stored in self.__token_info before the handler is called.
     """
     self.__state = 'body'
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             for line in read_obj:
                 self.__token_info = line[:16]
                 action = self.__body_state_dict.get(self.__state)
                 if action is None:
                     # write() accepts a single string; passing the
                     # state as a second argument raised TypeError
                     # before anything was reported. The call below
                     # still fails with TypeError, but now after the
                     # diagnostic has been written.
                     sys.stderr.write(
                         'error no state found in hex_2_utf8 %s\n'
                         % (self.__state,))
                 action(line)
     # When self.__copy is set, also store the output as
     # "body_utf_convert.data"; then rename the output onto the input
     # path and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #25
0
 def separate_footnotes(self):
     """
     Separate all the footnotes in an RTF file and put them at the bottom,
     where they are easier to process.  Each time a footnote is found,
     print all of its contents to a temporary file. Close both the main and
     temporary file. Print the footnotes from the temporary file to the
     bottom of the main file.

     Returns nothing (changes the original file). The temporary footnote
     file is removed even when processing fails.
     """
     self.__initiate_sep_values()
     self.__footnote_holder = better_mktemp()
     try:
         with open(self.__file) as read_obj:
             with open(self.__write_to, 'w') as self.__write_obj:
                 with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
                     for line in read_obj:
                         self.__token_info = line[:16]
                         # keep track of opening and closing brackets
                         if self.__token_info == 'ob<nu<open-brack':
                             self.__ob_count = line[-5:-1]
                         if self.__token_info == 'cb<nu<clos-brack':
                             self.__cb_count = line[-5:-1]
                         # In the middle of footnote text
                         if self.__in_footnote:
                             self.__in_footnote_func(line)
                         # not in the middle of footnote text
                         else:
                             self.__default_sep(line)
         # Append the closing markers and the collected footnotes to the
         # main output.
         with open(self.__footnote_holder, 'r') as read_obj:
             with open(self.__write_to, 'a') as write_obj:
                 write_obj.write(
                     'mi<mk<sect-close\n'
                     'mi<mk<body-close\n'
                     'mi<tg<close_____<section\n'
                     'mi<tg<close_____<body\n'
                     'mi<tg<close_____<doc\n'
                     'mi<mk<footnt-beg\n')
                 for line in read_obj:
                     write_obj.write(line)
                 write_obj.write(
                     'mi<mk<footnt-end\n')
     finally:
         # Do not leave the temporary footnote file behind when one of
         # the steps above raises.
         if os.path.exists(self.__footnote_holder):
             os.remove(self.__footnote_holder)
     # When self.__copy is set, also store the output as
     # "footnote_separate.data"; then rename the output onto the input
     # path and delete the temporary file.
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "footnote_separate.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #26
0
 def fix_endings(self):
     """
     Convert the line endings of the working file to Unix style.

     The file is processed as raw bytes: every CRLF pair and every
     remaining lone CR becomes a single LF.  When
     self.__replace_illegals is set, the data is additionally passed
     through clean_ascii_chars.  The result is written to
     self.__write_to, optionally copied out as "line_endings.data", and
     then put in place of the original file.
     """
     with open(self.__file, 'rb') as source:
         raw = source.read()
     # Handle CRLF first so a Windows ending does not become two newlines.
     raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
     if self.__replace_illegals:
         # Original note: removes invalid ASCII control characters
         # (0 to 8 and 11-14 to 24-26-27).
         raw = clean_ascii_chars(raw)
     with open(self.__write_to, 'wb') as target:
         target.write(raw)
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "line_endings.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #27
0
 def form_tags(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. A handful of special-character
         tokens (quotes, dashes, bullet) are classified as 'text'; for
         every other line the token info is its first 16 characters.
         After self.__set_list_func has seen the line, the handler for
         the current state is looked up in self.__state_dict and called
         with the line.
         If no handler is registered for the current state, the problem
         is reported on stderr and the line is skipped; previously the
         missing handler was called anyway, which raised a TypeError.
     """
     self.__initiate_values()
     # Tokens (line without its last character) that count as plain text.
     text_tokens = (
         'tx<mc<__________<rdblquote',
         'tx<mc<__________<ldblquote',
         'tx<mc<__________<lquote',
         'tx<mc<__________<rquote',
         'tx<mc<__________<emdash',
         'tx<mc<__________<endash',
         'tx<mc<__________<bullet',
     )
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             for line in read_obj:
                 if line[0:-1] in text_tokens:
                     self.__token_info = 'text'
                 else:
                     self.__token_info = line[:16]
                 self.__set_list_func(line)
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     sys.stderr.write(
                         'No matching state in module inline.py\n')
                     sys.stderr.write(self.__state + '\n')
                     # Nothing can handle this line; do not call None.
                     continue
                 action(line)
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "inline.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #28
0
 def fix_list_numbers(self):
     """
     Required:
         nothing
     Returns:
         original file will be changed
     Logic:
         Read in one line a time from the file. Keep track of opening and
         closing brackets. Determine the method ('action') by passing the
         state to the self.__state_dict.
         Simply print out the line to a temp file until an open bracket
         is found. Check the next line. If it is list-text, then start
         adding to the self.__list_chunk until the closing bracket is
         found.
         Next, look for an open bracket or text. When either is found,
         print out self.__list_chunk and the line.
     Notes:
         Both files are handled by ``with`` so they are closed even when
         a state handler raises.
         At end of file the current handler is still called one final
         time with the empty string, exactly as the original loop did, so
         a handler that is holding back a line gets a chance to emit it.
     """
     self.__initiate_values()
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             while True:
                 # readline() returns '' once the file is exhausted
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                 action = self.__state_dict.get(self.__state)
                 action(line)
                 if not line:
                     # the empty EOF "line" has been dispatched; stop
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "list_numbers.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
Пример #29
0
 def join_headers(self):
     """
     Put the separated header text back into the working file.

     Returns immediately when no header was found earlier
     (self.__found_a_header is false).  Otherwise a second temporary
     file name is reserved, the state is reset to 'body', and the work
     is delegated to self.__get_headers and self.__join_from_temp.
     Afterwards the output and header file objects are closed, the
     joined output takes the place of the original file, and the output
     file and the header holder are deleted.

     NOTE(review): presumably the helpers first strip the headers from
     the bottom of the input and then re-insert each one at its marker
     in the body -- confirm against __get_headers / __join_from_temp.
     NOTE(review): self.__write_to2 is created here but not removed in
     this method; check whether a helper cleans it up.
     """
     if not self.__found_a_header:
         return
     self.__write_to2 = better_mktemp()
     self.__state = 'body'
     self.__get_headers()
     self.__join_from_temp()
     self.__write_obj.close()
     self.__read_from_head_obj.close()
     copier = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copier.copy_file(self.__write_to, "header_join.data")
     copier.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)
     os.remove(self.__header_holder)
Пример #30
0
 def fix_fields(self):
     """
     Requires:
         nothing
     Returns:
         nothing (changes the original file)
     Logic:
         Read one line in at a time. Determine what action to take based on
         the state. If the state is before the body, look for the
         beginning of the body.
         If the state is body, send the line to the body method.
     Notes:
         Both files are handled by ``with`` so they are closed even when
         a state handler raises.
         If no handler is registered for the current state, the problem
         is reported on stderr and the line is skipped; previously the
         missing handler was called anyway, which raised a TypeError.
         At end of file the current handler is still called one final
         time with the empty string, exactly as the original loop did.
     """
     self.__initiate_values()
     with open(self.__file, 'r') as read_obj:
         with open(self.__write_to, 'w') as self.__write_obj:
             while True:
                 # readline() returns '' once the file is exhausted
                 line = read_obj.readline()
                 self.__token_info = line[:16]
                 if self.__token_info == 'ob<nu<open-brack':
                     self.__ob_count = line[-5:-1]
                 if self.__token_info == 'cb<nu<clos-brack':
                     self.__cb_count = line[-5:-1]
                 action = self.__state_dict.get(self.__state)
                 if action is None:
                     # NOTE(review): the message text ("no no", module
                     # "styles.py") looks copy-pasted; left unchanged.
                     sys.stderr.write('no no matching state in module styles.py\n')
                     sys.stderr.write(self.__state + '\n')
                 else:
                     action(line)
                 if not line:
                     # the empty EOF "line" has been dispatched; stop
                     break
     copy_obj = copy.Copy(bug_handler=self.__bug_handler)
     if self.__copy:
         copy_obj.copy_file(self.__write_to, "fields_large.data")
     copy_obj.rename(self.__write_to, self.__file)
     os.remove(self.__write_to)