def __init__(self, fpath, lang, regex, any_ext=False):
    """
    Validate the input file, load its content and parse it if the main
    pattern matches.

    :param fpath: path of the file to read
    :param lang: language name; must be a key of LANGUAGE_EXTENSIONS
    :param regex: subscriptable collection of patterns; only its 'main'
        entry is used here
    :param any_ext: when true, the file-extension check is skipped
    :raises ValueError: lang is not a key of LANGUAGE_EXTENSIONS
    :raises FileExistsError: the extension is not listed for lang, or
        the file content is blank after stripping whitespace
    """
    # The language must be known before anything else is looked at.
    if lang not in LANGUAGE_EXTENSIONS:
        raise ValueError('Unrecognized input language "{:s}"'.format(lang))

    # The extension only matters when the caller did not opt out.
    extension_ok = any_ext or getext(fpath) in LANGUAGE_EXTENSIONS[lang]
    if not extension_ok:
        raise FileExistsError('File extension is not supported')

    # Load the file and refuse whitespace-only content.
    self.content = readfile(fpath)
    if self.content.strip() == '':
        raise FileExistsError('File is empty')

    # Instance state.
    self.lang = lang.lower()
    self.regex = regex
    self.nodes = OrderedDict()

    # Search the content with the main pattern; parse only on a hit.
    main_pattern = self.regex['main']
    self.founds = find_matches(self.content, main_pattern)
    if self.founds:
        self.parse()
def parse(fpath):
    """
    Pick a parser from the file extension and run it on the file.

    The extension is looked up in LANGUAGE_EXTENSIONS; the first language
    whose extension list contains it names the attribute of this module
    that is used as the parser.

    :param fpath: path of the file to parse
    :return: whatever the selected parser returns when called with fpath
    :raises FileNotFoundError: fpath is not an existing file
    :raises FileExistsError: no language lists the file's extension
    """
    if not isfile(fpath):
        raise FileNotFoundError('File is not existed')

    extension = getext(fpath)

    # Lazily walk the languages that accept this extension; only the
    # first one (in LANGUAGE_EXTENSIONS order) is ever resolved.
    candidates = (
        language
        for language, supported in LANGUAGE_EXTENSIONS.items()
        if extension in supported
    )
    for language in candidates:
        handler = getattr(sys.modules[__name__], language)
        break
    else:
        handler = None

    if handler is None:
        raise FileExistsError('File extension is not supported')

    return handler(fpath)
def __parse(self):
    """
    Read self.xml_file, append one joined string per 'page' element to
    self.pages, then remove the XML file.
    """
    assert isxml(self.xml_file)

    # ElementTree.parse consumes the whole stream, so the handle can be
    # released before the tree is walked.
    with open(self.xml_file, 'rb') as stream:
        document = ElementTree.parse(stream)

    for page in document.findall('page'):
        # One stripped fragment per 'text' child, produced by getext.
        fragments = [getext(node).strip() for node in page.findall('text')]
        page_text = TXTSEP.join(fragments).strip()
        self.pages.append(page_text)

    # Reached only when parsing succeeded.
    rmfile(self.xml_file)
def update_gui(self):
    """
    Rebuild the selection label from the current settings.

    The label always lists the selected file, directory and output
    directory and ends with the conversion mode. In between:

    * if a file is selected, its type is shown (the description from the
      language file when one exists, otherwise the raw extension);
    * otherwise, if a directory is selected, the directory is scanned and
      the file count, the size and the recognised extensions are shown.

    Side effects in the directory case: the status line is updated, the
    converter's EXT and RECURSIVE attributes are set, self.filecount and
    self.filesize are refreshed and set.previous_folder is recorded.
    """
    settings = self.set
    lines = [
        self.lng['label_file'] + str(settings.set_file),
        self.lng['label_dir'] + str(settings.set_dir),
        self.lng['label_dirout'] + str(settings.set_dirout),
    ]

    if settings.set_file != settings.NOP:
        ext = helpers.getext(settings.set_file)
        # Use the description from the language file when there is one;
        # a missing entry is expected and simply keeps the raw extension.
        try:
            ext = self.lng['ext_%s' % ext]
        except KeyError:
            pass
        lines.append(self.lng['label_type'] + ext)
    elif settings.set_dir != settings.NOP:
        # TODO: Add an option to skip the scan when set_dir equals
        # set.previous_folder. The check is disabled for the time being,
        # so the directory is rescanned on every refresh.

        # Inform the user that this might take time.
        self.master.update_status('label_pleasewait', 0)

        # Choose the extension list for the conversion mode. The set_*
        # value is the comma-separated string displayed below; the other
        # attribute is what the converter receives.
        # NOTE(review): presumably the parsed form of the same list --
        # confirm against the settings class.
        converter = self.master.tocyr
        if settings.set_convmode == 'tolat':
            extensions = settings.set_extensions
            converter.EXT = settings.extensions
        else:
            extensions = settings.set_extensions_tocyr
            converter.EXT = settings.extensions_tocyr

        # Scan the directory.
        converter.RECURSIVE = settings.set_recursive
        self.filecount, self.filesize = \
            converter.calculatedirsize(settings.set_dir)
        settings.previous_folder = settings.set_dir

        if self.filecount:
            self.master.update_status('label_ok', 0)
        else:
            self.master.update_status('label_nosupportedfiles', 0)

        # The number of files
        lines.append(self.lng['label_number'] % self.filecount)
        # The size of file(s): the template first receives the float
        # format, then the resulting string is filled with the size.
        size_template = self.lng['label_size'] % '%0.2f'
        lines.append(size_template % self.filesize)
        # The list of recognised extensions
        lines.append(self.lng['options_extensions'] +
                     extensions.replace(",", ", "))

    # Conversion mode
    lines.append(self.lng['label_conv%s' % settings.set_convmode])
    self.label_selection.configure(text='\n'.join(lines), justify='left')
def check_allowed_extensions(self):
    """
    Drop the selected file when its extension cannot be handled in
    'tocyr' conversion mode.

    Nothing happens unless a file is selected and the mode is 'tocyr'.
    In that case, if the file's extension is not in self.tocyr.EXTENSIONS,
    the selection is reset to NOP and a warning box is shown.
    """
    settings = self.set

    # No file selected: nothing to validate.
    if settings.set_file == settings.NOP:
        return
    # Only the 'tocyr' mode restricts extensions.
    if settings.set_convmode != 'tocyr':
        return
    # Supported extension: keep the selection.
    if helpers.getext(settings.set_file) in self.tocyr.EXTENSIONS:
        return

    settings.set_file = settings.NOP
    messagebox.showwarning('', self.lng['msg_extension_not_supported'])