class TXTDenoiser(Command):
    """Command that cleans the single TXT file of an unpacked document.

    The text file found under ``<self.unzipped>/txt`` is run through a
    ``Denoiser`` and its lines are written back next to it, split into
    ``.clean.txt``, ``.grbge.txt`` and (only when non-empty) ``.unclss.txt``.
    """

    def __init__(self, filename, logger, config):
        """Initialise the base command and build the denoiser from ``config``."""
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)
        self.logger.debug("Denoiser initialized")

    def execute(self):
        """Execute the command.

        Returns:
            0 on success, -1 when the txt directory does not contain exactly
            one ``.txt`` file, -2 when any exception interrupts the cleaning.

        ``self.finalize()`` is called exactly once before returning, whatever
        the outcome.
        """
        try:
            status = self._clean()
        except Exception as e:
            # Console output kept for parity with the historical behaviour.
            print(e)
            # ``e.message`` is deprecated, empty for multi-argument exceptions
            # (e.g. IOError) and missing on Python 3; fall back to the
            # exception itself so the log always carries the cause and the
            # handler cannot fail before finalize() runs.
            detail = getattr(e, "message", None) or e
            self.logger.error("Cleaner has stopped unexpectedly: %s" % (detail,))
            status = -2

        self.finalize()
        return status

    def _clean(self):
        """Denoise the text file and write the classified lines; return 0 or -1."""
        self.logger.debug("::: Text cleaning :::")
        # NOTE(review): the get_file() step is disabled in this version;
        # presumably the archive is already unpacked under self.unzipped --
        # confirm before re-enabling.
        # super(TXTDenoiser, self).get_file()

        txt_dir = join(self.unzipped, "txt")
        txt_files = [
            join(txt_dir, f)
            for f in listdir(txt_dir)
            if isfile(join(txt_dir, f)) and f.endswith(".txt")
        ]

        if len(txt_files) != 1:
            self.logger.error("Incorrect number of text files")
            return -1

        source_path = txt_files[0]
        text_data = self.denoiser.cleanse(source_path, False)

        # Writing classified lines
        base_filename = splitext(basename(source_path))[0]
        self._write_lines(
            join(txt_dir, base_filename + ".clean.txt"),
            text_data.get_clean_lines(),
        )
        self._write_lines(
            join(txt_dir, base_filename + ".grbge.txt"),
            text_data.get_garbage_lines(),
        )

        # The unclassified file is only created when there is something in it.
        unclassified_lines = text_data.get_unclassified_lines()
        if len(unclassified_lines) > 0:
            self._write_lines(
                join(txt_dir, base_filename + ".unclss.txt"),
                unclassified_lines,
            )

        return 0

    @staticmethod
    def _write_lines(path, lines):
        """Write every entry of ``lines`` to ``path`` as UTF-8, one per line."""
        with codecs.open(path, "wb", encoding="utf-8") as out:
            for line in lines:
                out.write(line + "\n")
class TXTDenoiser(Command):
    """Command that cleans the single TXT file of a fetched document.

    After ``get_file()`` has run, the text file found under
    ``<self.unzipped>/txt`` is passed through a ``Denoiser`` and its lines are
    written back next to it, split into ``.clean.txt``, ``.grbge.txt`` and
    (only when non-empty) ``.unclss.txt``.
    """

    def __init__(self, filename, logger, config):
        """Initialise the base command and build the denoiser from ``config``."""
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

    def execute(self):
        """Execute the command.

        Returns:
            0 on success, -1 when the txt directory does not contain exactly
            one ``.txt`` file, -2 when any exception interrupts the cleaning.

        ``self.finalize()`` is called exactly once before returning, whatever
        the outcome.
        """
        try:
            status = self._clean()
        except Exception as e:
            # Console output kept for parity with the historical behaviour.
            print(e)
            # ``e.message`` is deprecated, empty for multi-argument exceptions
            # (e.g. IOError) and missing on Python 3; fall back to the
            # exception itself so the log always carries the cause and the
            # handler cannot fail before finalize() runs.
            detail = getattr(e, "message", None) or e
            self.logger.error("Cleaner has stopped unexpectedly: %s" % (detail,))
            status = -2

        self.finalize()
        return status

    def _clean(self):
        """Fetch the file, denoise it and write the classified lines.

        Returns 0 on success or -1 when the number of text files is not one.
        """
        self.logger.debug("::: Text cleaning :::")
        super(TXTDenoiser, self).get_file()

        txt_dir = join(self.unzipped, "txt")
        txt_files = [
            join(txt_dir, f)
            for f in listdir(txt_dir)
            if isfile(join(txt_dir, f)) and f.endswith(".txt")
        ]

        if len(txt_files) != 1:
            self.logger.error("Incorrect number of text files")
            return -1

        source_path = txt_files[0]
        text_data = self.denoiser.cleanse(source_path, False)

        # Writing classified lines
        base_filename = splitext(basename(source_path))[0]
        self._write_lines(
            join(txt_dir, base_filename + ".clean.txt"),
            text_data.get_clean_lines(),
        )
        self._write_lines(
            join(txt_dir, base_filename + ".grbge.txt"),
            text_data.get_garbage_lines(),
        )

        # The unclassified file is only created when there is something in it.
        unclassified_lines = text_data.get_unclassified_lines()
        if len(unclassified_lines) > 0:
            self._write_lines(
                join(txt_dir, base_filename + ".unclss.txt"),
                unclassified_lines,
            )

        return 0

    @staticmethod
    def _write_lines(path, lines):
        """Write every entry of ``lines`` to ``path`` as UTF-8, one per line."""
        with codecs.open(path, "wb", encoding="utf-8") as out:
            for line in lines:
                out.write(line + "\n")