def fix_fields(self):
    """
    Run every line of the input file through the current state handler.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, store its first 16 characters as the token info,
        refresh the open/close bracket counters when the line is a
        bracket token, and pass the line to the handler registered in
        self.__state_dict for the current state.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            token_info = line[:16]
            self.__token_info = token_info
            # The bracket number is the four characters before the newline.
            if token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-5:-1]
            elif token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-5:-1]
            handler = self.__state_dict.get(self.__state)
            if handler is None:
                # Diagnostic only: calling the missing handler below
                # still raises TypeError.
                sys.stderr.write(
                    'No matching state in module fields_small.py\n')
                sys.stderr.write(self.__state + '\n')
            handler(line)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        # Keep a debug snapshot of this stage's output.
        copier.copy_file(self.__write_to, "fields_small.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def process_pict(self):
    """
    Filter the input file line by line, letting the picture handlers
    decide which lines stay in the main output.

    Requires:
        nothing
    Returns:
        nothing (the filtered output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, record the token info and the bracket counters.
        While self.__in_pict is false the line goes to self.__default,
        otherwise to self.__in_pict_func; the line is written to the
        main output only when the chosen handler returns a true value.
        After the loop the picture file is terminated with "}" and
        closed if a picture was found. When no picture was counted, an
        attempt is made to remove the picture directory.
    """
    self.__make_dir()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                if not self.__in_pict:
                    to_print = self.__default(line, write_obj)
                else:
                    to_print = self.__in_pict_func(line)
                if to_print:
                    write_obj.write(line)
            if self.__already_found_pict:
                self.__write_pic_obj.write("}\n")
                self.__write_pic_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "pict.data")
        try:
            copy_obj.copy_file(self.__pict_file, "pict.rtf")
        except Exception:
            # Best-effort debug copy. NOTE(review): presumably the
            # picture file is absent when no picture was found -- confirm.
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    if self.__pict_count == 0:
        try:
            os.rmdir(self.__dir_name)
        except OSError:
            # Directory missing or not empty: leave it alone.
            pass
def tokenize(self):
    """Split the input file into tokens and rewrite it one token per line.

    The whole file is read at once and handed to self.__sub_reg_split,
    every resulting token is passed through self.__unicode_process,
    tokens that come back empty are dropped, and the remaining tokens
    are written out joined by newlines.
    """
    with open_for_read(self.__file) as source:
        raw_text = source.read()
    # Substitutions and splitting are done by the helper.
    pieces = self.__sub_reg_split(raw_text)
    # Unicode fix-up, discarding tokens that end up empty.
    kept = [
        piece
        for piece in map(self.__unicode_process, pieces)
        if len(piece) > 0
    ]
    with open_for_write(self.__write_to) as sink:
        sink.write('\n'.join(kept))
    # Move and copy
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "tokenize.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def convert_fonts(self):
    """
    Dispatch every input line to the current state handler and report
    the special fonts.

    Required:
        nothing
    Returns:
        self.__special_font_dict, with the key 'default-font' set to the
        name found in self.__font_table for self.__default_font_num, or
        to 'Not Defined' when that lookup yields nothing.
    Logic:
        For each line, store its first 16 characters as the token info
        and pass the line to the handler registered in
        self.__state_dict for the current state.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            self.__token_info = line[:16]
            handler = self.__state_dict.get(self.__state)
            if handler is None:
                # Diagnostic only: the call below still raises TypeError.
                sys.stderr.write(
                    'no matching state in module fonts.py\n' + self.__state + '\n')
            handler(line)
    self.__special_font_dict['default-font'] = (
        self.__font_table.get(self.__default_font_num) or 'Not Defined'
    )
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "fonts.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__special_font_dict
def make_paragraph_def(self):
    """
    Dispatch every input line to the current state handler.

    Requires:
        nothing
    Returns:
        self.__body_style_strings (the original file is also replaced
        via copy.Copy.rename and the temporary file removed)
    Logic:
        Read one line at a time, store its first 16 characters as the
        token info and pass the line to the handler registered in
        self.__state_dict for the current state.

    Both files are opened through context managers so they are closed
    even when a handler raises.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Diagnostic only: the call below still raises TypeError.
                    sys.stderr.write('no no matching state in module sections.py\n')
                    sys.stderr.write(self.__state + '\n')
                action(line)
                # The empty string returned at EOF is dispatched once as
                # well, exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__body_style_strings
def fix_preamble(self):
    """
    Dispatch every input line to the current state handler.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, store its first 16 characters as the token info
        and pass the line to the handler registered in
        self.__state_dict for the current state.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            self.__token_info = line[:16]
            handler = self.__state_dict.get(self.__state)
            if handler is None:
                # Diagnostic only: the call below still raises TypeError.
                sys.stderr.write(
                    'no matching state in module preamble_rest.py\n' + self.__state + '\n')
            handler(line)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "preamble_div.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def add_brackets(self):
    """
    Dispatch every input line to the current state handler, then keep
    the result only if the bracket check on the output succeeds.

    For each line the token info and the open/close bracket counters
    are refreshed before the handler for the current state is called.
    When self.__check_brackets returns a true value for the output, the
    output is handed to copy.Copy.rename with the original file as
    target; otherwise the original file is left untouched and a warning
    is printed if the run level is above 0. The temporary file is
    removed in both cases.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            token_info = line[:16]
            self.__token_info = token_info
            if token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-5:-1]
            elif token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-5:-1]
            handler = self.__state_dict.get(self.__state)
            if handler is None:
                # Diagnostic only: the call below still raises TypeError.
                sys.stderr.write(
                    'No matching state in module add_brackets.py\n'
                    '%s\n' % self.__state)
            handler(line)
    # Check bad brackets
    brackets_ok = self.__check_brackets(self.__write_to)
    if not brackets_ok:
        if self.__run_level > 0:
            sys.stderr.write(
                'Sorry, but this files has a mix of old and new RTF.\n'
                'Some characteristics cannot be converted.\n')
    else:
        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copier.copy_file(self.__write_to, "add_brackets.data")
        copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def insert_info(self):
    """
    Replace table start/end markers with table tags.

    Every line is copied to the output. Before a 'mi<mk<tabl-start'
    line an opening table tag is written: with one '<key>value' pair
    per entry of the first dictionary in self.__table_data (which is
    then dropped from the list), or a plain opening tag when the list
    is exhausted. Before a 'mi<mk<table-end_' line a closing table tag
    is written.

    Raises:
        self.__bug_handler: when the table data runs out and the run
            level is above 3.

    Both files are opened through context managers so they are closed
    even when the bug handler is raised.
    """
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                if line == 'mi<mk<tabl-start\n':
                    if len(self.__table_data) > 0:
                        table_dict = self.__table_data[0]
                        self.__write_obj.write('mi<tg<open-att__<table')
                        for key, value in table_dict.items():
                            self.__write_obj.write('<%s>%s' % (key, value))
                        self.__write_obj.write('\n')
                        # Rebind rather than mutate: the list may be shared.
                        self.__table_data = self.__table_data[1:]
                    else:
                        # this shouldn't happen!
                        if self.__run_level > 3:
                            msg = 'Not enough data for each table\n'
                            raise self.__bug_handler(msg)
                        self.__write_obj.write('mi<tg<open______<table\n')
                elif line == 'mi<mk<table-end_\n':
                    self.__write_obj.write('mi<tg<close_____<table\n')
                self.__write_obj.write(line)
                # The empty string returned at EOF is written once as
                # well (a no-op), exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "table_info.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def make_preamble_divisions(self):
    """
    Dispatch every input line to the current state handler while
    tracking bracket depth.

    Returns:
        self.__all_lists (the original file is also replaced via
        copy.Copy.rename and the temporary file removed)
    Logic:
        For each line, store its first 16 characters as the token info.
        An open-bracket token updates self.__ob_count and increments
        self.__ob_group; a close-bracket token updates self.__cb_count
        and decrements self.__ob_group. The line is then passed to the
        handler registered in self.__state_dict for the current state.

    Both files are opened through context managers so they are closed
    even when a handler raises.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                    self.__ob_group += 1
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                    self.__ob_group -= 1
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Diagnostic only: the call below still raises TypeError.
                    print(self.__state)
                action(line)
                # The empty string returned at EOF is dispatched once as
                # well, exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "preamble_div.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__all_lists
def make_paragraphs(self):
    """
    Dispatch every input line to the current state handler.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, store its first 16 characters as the token info
        and pass the line to the handler registered in
        self.__state_dict for the current state.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    try:
                        sys.stderr.write('no matching state in module paragraphs.py\n')
                        sys.stderr.write(self.__state + '\n')
                    except Exception:
                        # The diagnostic is best-effort. Narrowed from a
                        # bare ``except`` so KeyboardInterrupt and
                        # SystemExit are no longer swallowed.
                        pass
                # Calling a missing handler still raises TypeError.
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "paragraphs.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def group_styles(self):
    """
    Dispatch every input line to the current state handler after
    letting self.__get_style_name inspect it.

    Required:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, store its first 16 characters as the token info,
        call self.__get_style_name with the line, then pass the line to
        the handler registered in self.__state_dict for the current
        state.

    Both files are opened through context managers so they are closed
    even when a handler raises.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                action(line)
                # The empty string returned at EOF is dispatched once as
                # well, exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "group_styles.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def insert_info(self):
    """
    Insert the collected body styles just before the style table closes.

    Every line is copied to the output. Immediately before the line
    'mi<tg<close_____<style-table', the strings in
    self.__list_of_styles are written, wrapped in opening and closing
    'styles-in-body' tags, provided the list is not empty.

    Raises:
        self.__bug_handler: when the list is empty and the run level is
            above 3.

    Both files are opened through context managers so they are closed
    even when the bug handler is raised.
    """
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                if line == 'mi<tg<close_____<style-table\n':
                    if len(self.__list_of_styles) > 0:
                        self.__write_obj.write('mi<tg<open______<styles-in-body\n')
                        the_string = ''.join(self.__list_of_styles)
                        self.__write_obj.write(the_string)
                        self.__write_obj.write('mi<tg<close_____<styles-in-body\n')
                    else:
                        # this shouldn't happen!
                        if self.__run_level > 3:
                            msg = 'Not enough data for each table\n'
                            raise self.__bug_handler(msg)
                self.__write_obj.write(line)
                # The empty string returned at EOF is written once as
                # well (a no-op), exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "body_styles.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def convert_styles(self):
    """
    Dispatch every input line to the current state handler.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        Read one line at a time, store its first 16 characters as the
        token info and pass the line to the handler registered in
        self.__state_dict for the current state.

    Both files are opened through context managers so they are closed
    even when a handler raises.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Diagnostic only: the call below still raises TypeError.
                    sys.stderr.write('no matching state in module styles.py\n')
                    sys.stderr.write(self.__state + '\n')
                action(line)
                # The empty string returned at EOF is dispatched once as
                # well, exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "styles.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def convert_colors(self):
    """
    Dispatch every input line to the current state handler.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, increment the line counter self.__line, store
        the first 16 characters as the token info and pass the line to
        the handler registered in self.__state_dict for the current
        state.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__line += 1
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    try:
                        sys.stderr.write('no matching state in module fonts.py\n')
                        sys.stderr.write(self.__state + '\n')
                    except Exception:
                        # The diagnostic is best-effort. Narrowed from a
                        # bare ``except`` so KeyboardInterrupt and
                        # SystemExit are no longer swallowed.
                        pass
                # Calling a missing handler still raises TypeError.
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "color.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def fix_info(self):
    """
    Dispatch every input line to the current state handler.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, store its first 16 characters as the token info
        and pass the line to the handler registered in
        self.__state_dict for the current state.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            self.__token_info = line[:16]
            handler = self.__state_dict.get(self.__state)
            if handler is None:
                # Diagnostic only: the call below still raises TypeError.
                sys.stderr.write(
                    'No matching state in module styles.py\n')
                sys.stderr.write(self.__state + '\n')
            handler(line)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "info.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def delete_info(self):
    """Copy the input file, keeping only the lines the state handlers accept.

    For each line the token info and the open/close bracket counters
    are refreshed, then the handler registered in self.__state_dict for
    the current state is called; the line is written to the output only
    when the handler returns a true value.

    Returns self.__found_delete.
    """
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            # Token lines look like: ob<nu<open-brack<0001
            token_info = line[:16]
            self.__token_info = token_info
            if token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-5:-1]
            elif token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-5:-1]
            handler = self.__state_dict.get(self.__state)
            if not handler:
                # Diagnostic only: the call below still fails.
                sys.stderr.write(
                    'No action in dictionary state is "%s" \n'
                    % self.__state)
            keep_line = handler(line)
            if keep_line:
                self.__write_obj.write(line)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "delete_info.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__found_delete
def process_tokens(self):
    """Validate the token stream and rewrite each token in processed form.

    Each input line (minus its newline) is one token. The first token
    must be '\\{' and the second must start with '\\rtf'; a token
    containing a backslash followed by a space is rejected. Tokens that
    start with a backslash go through self.process_cw and the result is
    written unless it is None. Any other token is split with
    self.__utf_exp and every non-empty field is written as a text
    line, tagged 'ut' when it starts with '&' and 'nu' otherwise.

    After the file has been replaced, the brackets are checked.

    Returns self.__return_code.
    Raises self.__exception_handler for invalid or empty input and for
    unmatched brackets.
    """
    line_count = 0
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as sink:
        for line_count, line in enumerate(source, 1):
            token = line.replace("\n", "")
            if line_count == 1 and token != '\\{':
                msg = '\nInvalid RTF: document doesn\'t start with {\n'
                raise self.__exception_handler(msg)
            if line_count == 2 and token[0:4] != '\\rtf':
                msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                raise self.__exception_handler(msg)

            if '\\ ' in token:
                msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                    % line_count
                raise self.__exception_handler(msg)

            if token[:1] == "\\":
                # control word
                converted = self.process_cw(token)
                if converted is not None:
                    sink.write(converted)
                continue

            # plain text, possibly mixed with entities
            for field in re.split(self.__utf_exp, token):
                if not field:
                    continue
                if field[0:1] == '&':
                    template = 'tx<ut<__________<%s\n'
                else:
                    template = 'tx<nu<__________<%s\n'
                sink.write(template % field)

    if not line_count:
        msg = '\nInvalid RTF: file appears to be empty.\n'
        raise self.__exception_handler(msg)

    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "processed_tokens.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

    if self.__check_brackets(self.__file):
        msg = '\nInvalid RTF: document does not have matching brackets.\n'
        raise self.__exception_handler(msg)
    return self.__return_code
def combine_borders(self):
    """
    Route every input line to the border handler or the default handler.

    For each line, its first five characters are stored in
    self.__first_five. While the state is 'border' the line is given to
    self.__border_func together with the output file; otherwise the
    return value of self.__default_func(line) is written to the output.
    The output is then handed to copy.Copy.rename with the original
    file as target and the temporary file is removed.
    """
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as sink:
        for line in source:
            self.__first_five = line[0:5]
            if self.__state != 'border':
                sink.write(self.__default_func(line))
            else:
                self.__border_func(line, sink)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "combine_borders.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def convert_to_tags(self):
    """
    Convert the token file to tags, one line at a time.

    The output starts with whatever self.__write_dec() emits. Then, for
    every input line, the first 16 characters are looked up in
    self.__state_dict; when an entry exists it is called with the line,
    otherwise the line is skipped.

    When self.__convert_utf or self.__bad_encoding is set, the output
    is first handed to copy.Copy.rename with the original file as
    target, and that file is then re-read and re-written line by line.
    NOTE(review): any re-encoding presumably happens inside
    open_for_read / open_for_write -- confirm.
    """
    self.__initiate_values()
    with open_for_write(self.__write_to) as self.__write_obj:
        self.__write_dec()
        with open_for_read(self.__file) as source:
            for line in source:
                self.__token_info = line[:16]
                handler = self.__state_dict.get(self.__token_info)
                if handler is None:
                    continue
                handler(line)
    # convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
    needs_rewrite = self.__convert_utf or self.__bad_encoding
    if needs_rewrite:
        copier = copy.Copy(bug_handler=self.__bug_handler)
        copier.rename(self.__write_to, self.__file)
        with open_for_read(self.__file) as source, \
                open_for_write(self.__write_to) as sink:
            for line in source:
                sink.write(line)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "convert_to_tags.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def __convert_body(self):
    """
    Dispatch every input line to the current body-state handler.

    The state is reset to 'body' first. For each line, its first 16
    characters are stored as the token info and the line is passed to
    the handler registered in self.__body_state_dict for the current
    state. The output is then handed to copy.Copy.rename with the
    original file as target and the temporary file is removed.
    """
    self.__state = 'body'
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__body_state_dict.get(self.__state)
                if action is None:
                    # write() takes exactly one argument; the state used
                    # to be passed as a second one, which raised
                    # TypeError before anything was printed.
                    sys.stderr.write(
                        'error no state found in hex_2_utf8 %s\n' % self.__state)
                # Calling a missing handler still raises TypeError.
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def separate_footnotes(self):
    """
    Move footnote content to the end of the file.

    A temporary holder file is created for footnotes. Each input line
    updates the token info and the bracket counters and is then handled
    by self.__in_footnote_func while self.__in_footnote is true, or by
    self.__default_sep otherwise. Afterwards the main output is
    reopened in append mode: closing markers and tags for section, body
    and doc are written, followed by the holder's content wrapped in
    'footnt-beg' / 'footnt-end' markers. The holder is then deleted.
    """
    self.__initiate_sep_values()
    self.__footnote_holder = better_mktemp()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj, \
            open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
        for line in source:
            token_info = line[:16]
            self.__token_info = token_info
            # keep track of opening and closing brackets
            if token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-5:-1]
            elif token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-5:-1]
            if not self.__in_footnote:
                # outside footnote text
                self.__default_sep(line)
            else:
                # inside footnote text
                self.__in_footnote_func(line)
    with open_for_read(self.__footnote_holder) as notes, \
            open_for_write(self.__write_to, append=True) as sink:
        sink.write(
            'mi<mk<sect-close\n'
            'mi<mk<body-close\n'
            'mi<tg<close_____<section\n'
            'mi<tg<close_____<body\n'
            'mi<tg<close_____<doc\n'
            'mi<mk<footnt-beg\n')
        for note_line in notes:
            sink.write(note_line)
        sink.write('mi<mk<footnt-end\n')
    os.remove(self.__footnote_holder)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "footnote_separate.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def fix_endings(self):
    """
    Normalise line endings to '\\n' and optionally strip illegal characters.

    The file is read as bytes; CRLF pairs and then lone CR bytes are
    replaced with LF. When self.__replace_illegals is set, the data is
    additionally passed through clean_ascii_chars. The result is
    written as bytes to the temporary file, which is handed to
    copy.Copy.rename with the original file as target and then removed.
    """
    with open(self.__file, 'rb') as source:
        data = source.read()
    # Windows endings first, so a CRLF does not become two newlines.
    data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    if self.__replace_illegals:
        data = clean_ascii_chars(data)
    with open(self.__write_to, 'wb') as sink:
        sink.write(data)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "line_endings.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def fix_list_numbers(self):
    """
    Dispatch every input line to the current state handler while
    tracking the bracket counters.

    Required:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        Read one line at a time, store its first 16 characters as the
        token info, refresh the open/close bracket counters when the
        line is a bracket token, and pass the line to the handler
        registered in self.__state_dict for the current state.

    Both files are opened through context managers so they are closed
    even when a handler raises.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                action = self.__state_dict.get(self.__state)
                action(line)
                # The empty string returned at EOF is dispatched once as
                # well, exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "list_numbers.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def form_tags(self):
    """
    Dispatch every input line to the current state handler, treating
    a fixed set of punctuation tokens as text.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        For each line, the token info is 'text' when the line (minus
        its last character) is one of the quote, dash or bullet tokens
        listed below, and the first 16 characters of the line
        otherwise. self.__set_list_func is called with the line before
        the handler for the current state is looked up and called.
    """
    text_like_tokens = (
        'tx<mc<__________<rdblquote',
        'tx<mc<__________<ldblquote',
        'tx<mc<__________<lquote',
        'tx<mc<__________<rquote',
        'tx<mc<__________<emdash',
        'tx<mc<__________<endash',
        'tx<mc<__________<bullet',
    )
    self.__initiate_values()
    with open_for_read(self.__file) as source, \
            open_for_write(self.__write_to) as self.__write_obj:
        for line in source:
            token = line[0:-1]
            if token in text_like_tokens:
                self.__token_info = 'text'
            else:
                self.__token_info = line[:16]
            self.__set_list_func(line)
            handler = self.__state_dict.get(self.__state)
            if handler is None:
                # Diagnostic only: the call below still raises TypeError.
                sys.stderr.write(
                    'No matching state in module inline.py\n')
                sys.stderr.write(self.__state + '\n')
            handler(line)
    copier = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copier.copy_file(self.__write_to, "inline.data")
    copier.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def join_headers(self):
    """
    Merge previously separated headers back into the main file.

    Nothing happens unless self.__found_a_header is set. Otherwise a
    second temporary file name is created, the state is reset to
    'body', and the work is delegated to self.__get_headers() followed
    by self.__join_from_temp(). The write object and the header read
    object are then closed, the output is handed to copy.Copy.rename
    with the original file as target, and both the temporary output
    and the header holder file are removed.
    """
    if self.__found_a_header:
        self.__write_to2 = better_mktemp()
        self.__state = 'body'
        self.__get_headers()
        self.__join_from_temp()
        self.__write_obj.close()
        self.__read_from_head_obj.close()
        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copier.copy_file(self.__write_to, "header_join.data")
        copier.rename(self.__write_to, self.__file)
        # Clean up both temporary files.
        os.remove(self.__write_to)
        os.remove(self.__header_holder)
def fix_fields(self):
    """
    Dispatch every input line to the current state handler while
    tracking the bracket counters.

    Requires:
        nothing
    Returns:
        nothing (the processed output is handed to copy.Copy.rename with
        the original file as target, then the temporary file is removed)
    Logic:
        Read one line at a time, store its first 16 characters as the
        token info, refresh the open/close bracket counters when the
        line is a bracket token, and pass the line to the handler
        registered in self.__state_dict for the current state.

    Both files are opened through context managers so they are closed
    even when a handler raises.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Diagnostic only: the call below still raises TypeError.
                    sys.stderr.write('no no matching state in module styles.py\n')
                    sys.stderr.write(self.__state + '\n')
                action(line)
                # The empty string returned at EOF is dispatched once as
                # well, exactly as the historical loop did.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "fields_large.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
def parse_rtf(self):
    """
    Parse the file by calling on other classes.

    A temporary working file is made from self.__file and handed, in
    order, to each processing stage below; every stage works on that same
    temporary file. The last stage, output.Output, is given the original
    file name, the output directory and the output file name. The
    temporary file is removed before this method returns or raises.

    Requires:
        Nothing
    Returns:
        self.__exit_level
    Raises:
        InvalidRtfException if token processing fails. The message is
        extended with a hint when the file does not appear to be
        correctly encoded.
        RtfInvalidCodeException if the file looks like older RTF and
        self.__run_level is greater than 5.
    """
    self.__temp_file = self.__make_temp_file(self.__file)
    try:
        # if self.__debug_dir is true, then create a copy object,
        # set the directory to write to, remove files, and copy
        # the new temporary file to this directory
        if self.__debug_dir:
            copy_obj = copy.Copy(
                bug_handler=RtfInvalidCodeException,
            )
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(self.__temp_file, "original_file")
        # Object used to check that brackets are well handled
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets(
                file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
            )
        # convert Macintosh and Windows line endings to Unix line endings
        # why do this if you don't wb after?
        line_obj = line_endings.FixLineEndings(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
            replace_illegals=self.__replace_illegals,
        )
        return_value = line_obj.fix_endings()  # calibre return what?
        self.__return_code(return_value)
        tokenize_obj = tokenize.Tokenize(
            bug_handler=RtfInvalidCodeException,
            in_file=self.__temp_file,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
            exception_handler=InvalidRtfException,
        )
        try:
            process_tokens_obj.process_tokens()
        except InvalidRtfException as err:
            # Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
                in_file=self.__temp_file,
                run_level=self.__run_level,
                bug_handler=RtfInvalidCodeException,
                check_raw=True,
                default_encoding=self.__default_encoding,
            )
            # The returned values are not needed in this branch.
            # NOTE(review): call kept because get_codepage() below may
            # depend on it having run -- confirm.
            encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler=RtfInvalidCodeException,
            )
            enc = encode_obj.get_codepage()
            # TODO: to check if cp is a good idea or if I should use a
            # dict to convert
            enc = 'cp' + enc
            msg = '%s\nException in token processing' % str(err)
            if check_encoding_obj.check_encoding(self.__file, enc):
                # Show the name as text: formatting a bytes object with
                # %s would put its repr (b'...') into the message.
                file_name = self.__file
                if isinstance(file_name, bytes):
                    file_name = file_name.decode('utf-8', 'replace')
                msg += '\nFile %s does not appear to be correctly encoded.\n' % file_name
            # the temporary file is removed by the outer handler
            raise InvalidRtfException(msg)
        delete_info_obj = delete_info.DeleteInfo(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        # found destination means {\*\destination
        # if found, the RTF should be newer RTF
        found_destination = delete_info_obj.delete_info()
        self.__bracket_match('delete_data_info')
        # put picts in a separate file
        pict_obj = pict.Pict(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            orig_file=self.__file,
            out_file=self.__out_file,
            run_level=self.__run_level,
        )
        pict_obj.process_pict()
        self.__bracket_match('pict_data_info')
        combine_obj = combine_borders.CombineBorders(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        combine_obj.combine_borders()
        self.__bracket_match('combine_borders_info')
        footnote_obj = footnote.Footnote(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        footnote_obj.separate_footnotes()
        self.__bracket_match('separate_footnotes_info')
        header_obj = header.Header(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        header_obj.separate_headers()
        self.__bracket_match('separate_headers_info')
        list_numbers_obj = list_numbers.ListNumbers(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        list_numbers_obj.fix_list_numbers()
        self.__bracket_match('list_number_info')
        preamble_div_obj = preamble_div.PreambleDiv(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        list_of_lists = preamble_div_obj.make_preamble_divisions()
        self.__bracket_match('make_preamble_divisions')
        encode_obj = default_encoding.DefaultEncoding(
            in_file=self.__temp_file,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            default_encoding=self.__default_encoding,
        )
        platform, code_page, default_font_num = \
            encode_obj.find_default_encoding()
        hex2utf_obj = hex_2_utf8.Hex2Utf8(
            in_file=self.__temp_file,
            copy=self.__copy,
            area_to_convert='preamble',
            char_file=self.__char_data,
            default_char_map=code_page,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            invalid_rtf_handler=InvalidRtfException,
        )
        hex2utf_obj.convert_hex_2_utf8()
        self.__bracket_match('hex_2_utf_preamble')
        fonts_obj = fonts.Fonts(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            default_font_num=default_font_num,
            run_level=self.__run_level,
        )
        special_font_dict = fonts_obj.convert_fonts()
        self.__bracket_match('fonts_info')
        color_obj = colors.Colors(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        color_obj.convert_colors()
        self.__bracket_match('colors_info')
        style_obj = styles.Styles(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        style_obj.convert_styles()
        self.__bracket_match('styles_info')
        info_obj = info.Info(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        info_obj.fix_info()
        default_font = special_font_dict.get('default-font')
        preamble_rest_obj = preamble_rest.Preamble(
            file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            platform=platform,
            default_font=default_font,
            code_page=code_page,
        )
        preamble_rest_obj.fix_preamble()
        self.__bracket_match('preamble_rest_info')
        old_rtf_obj = OldRtf(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        # RTF can actually have destination groups and old RTF.
        # BAH!
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
                msg = 'Older RTF\n' \
                    'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException(msg)
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
            if found_destination:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                run_level=self.__run_level,
            )
            add_brackets_obj.add_brackets()
        fields_small_obj = fields_small.FieldsSmall(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        fields_small_obj.fix_fields()
        self.__bracket_match('fix_small_fields_info')
        fields_large_obj = fields_large.FieldsLarge(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        fields_large_obj.fix_fields()
        self.__bracket_match('fix_large_fields_info')
        sections_obj = sections.Sections(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        sections_obj.make_sections()
        self.__bracket_match('sections_info')
        paragraphs_obj = paragraphs.Paragraphs(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            write_empty_para=self.__empty_paragraphs,
            run_level=self.__run_level,
        )
        paragraphs_obj.make_paragraphs()
        self.__bracket_match('paragraphs_info')
        # NOTE: unlike the .get() lookup used for the preamble above,
        # this raises KeyError when no default font was recorded.
        default_font = special_font_dict['default-font']
        paragraph_def_obj = paragraph_def.ParagraphDef(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            default_font=default_font,
            run_level=self.__run_level,
        )
        list_of_styles = paragraph_def_obj.make_paragraph_def()
        body_styles_obj = body_styles.BodyStyles(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            list_of_styles=list_of_styles,
            run_level=self.__run_level,
        )
        body_styles_obj.insert_info()
        self.__bracket_match('body_styles_info')
        self.__bracket_match('paragraph_def_info')
        table_obj = table.Table(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        table_data = table_obj.make_table()
        self.__bracket_match('table_info')
        table_info_obj = table_info.TableInfo(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            table_data=table_data,
            run_level=self.__run_level,
        )
        table_info_obj.insert_info()
        self.__bracket_match('table__data_info')
        # The next four stages are optional and run only when the
        # matching option is set on the instance.
        if self.__form_lists:
            make_list_obj = make_lists.MakeLists(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                headings_to_sections=self.__headings_to_sections,
                run_level=self.__run_level,
                list_of_lists=list_of_lists,
            )
            make_list_obj.make_lists()
            self.__bracket_match('form_lists_info')
        if self.__headings_to_sections:
            headings_to_sections_obj = \
                headings_to_sections.HeadingsToSections(
                    in_file=self.__temp_file,
                    bug_handler=RtfInvalidCodeException,
                    copy=self.__copy,
                    run_level=self.__run_level,
                )
            headings_to_sections_obj.make_sections()
            self.__bracket_match('headings_to_sections_info')
        if self.__group_styles:
            group_styles_obj = group_styles.GroupStyles(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                wrap=1,
                run_level=self.__run_level,
            )
            group_styles_obj.group_styles()
            self.__bracket_match('group_styles_info')
        if self.__group_borders:
            group_borders_obj = group_borders.GroupBorders(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                wrap=1,
                run_level=self.__run_level,
            )
            group_borders_obj.group_borders()
            self.__bracket_match('group_borders_info')
        inline_obj = inline.Inline(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        inline_obj.form_tags()
        self.__bracket_match('inline_info')
        # second hex conversion pass, this time over the body
        hex2utf_obj.update_values(
            file=self.__temp_file,
            area_to_convert='body',
            copy=self.__copy,
            char_file=self.__char_data,
            convert_caps=self.__convert_caps,
            convert_symbol=self.__convert_symbol,
            convert_wingdings=self.__convert_wingdings,
            convert_zapf=self.__convert_zapf,
            symbol=1,
            wingdings=1,
            dingbats=1,
        )
        hex2utf_obj.convert_hex_2_utf8()
        header_obj.join_headers()
        footnote_obj.join_footnotes()
        tags_obj = convert_to_tags.ConvertToTags(
            in_file=self.__temp_file,
            copy=self.__copy,
            dtd_path=self.__dtd_path,
            indent=self.__indent,
            run_level=self.__run_level,
            no_dtd=self.__no_dtd,
            encoding=encode_obj.get_codepage(),
            bug_handler=RtfInvalidCodeException,
        )
        tags_obj.convert_to_tags()
        output_obj = output.Output(
            file=self.__temp_file,
            orig_file=self.__file,
            output_dir=self.__out_dir,
            out_file=self.__out_file,
        )
        output_obj.output()
    except BaseException:
        # Any failure: do not leave the temporary working file behind.
        # Removal is best effort so that it cannot mask the real error.
        try:
            os.remove(self.__temp_file)
        except OSError:
            pass
        raise
    os.remove(self.__temp_file)
    return self.__exit_level