def save_file(self, text=None):
    """Write the downloaded response body into the to-process folder.

    Two paths are recorded on the instance:

    * ``original_path`` -- where the raw bytes are written, inside a
      sub-folder named after ``self.extension``.
    * ``txt_path`` -- the matching ``.txt`` path; it is only recorded
      here, nothing is written to it by this method.

    ``text`` is accepted but not used by this method.
    """
    self.log.info(f"Saving '{self.name}' from '{self.url}'")

    raw_path = f"{DOCS_TO_PROCESS_PATH}{self.extension}/{self.name}"
    self.original_path = raw_path
    self.txt_path = f"{DOCS_TO_PROCESS_PATH}{self.name}.txt"

    create_file(raw_path)
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(self.request.content)
def file_to_text(self):
    """Convert the saved original document to text with ``pdftotext``.

    Runs ``pdftotext -layout`` on ``self.original_path``, writing the
    output to ``/tmp/processor-provisory/<name>.txt``, and returns the
    contents of that output file.

    Returns:
        The text read back from the provisory output file.

    Raises:
        OSError: if the provisory output file cannot be opened.
    """
    # Imported locally so this method does not depend on the module's
    # import block already providing ``subprocess``.
    import subprocess

    provisory_file_name = f"/tmp/processor-provisory/{self.name}.txt"
    create_file(provisory_file_name)

    # Pass the arguments as a list instead of a shell string: document
    # names containing spaces or shell metacharacters are then handed to
    # pdftotext verbatim rather than being split or interpreted by a shell.
    command = ["pdftotext", "-layout", self.original_path, provisory_file_name]
    try:
        # check=False: a failing conversion is not an error here; whatever
        # ended up in the output file is returned, as with os.system before.
        subprocess.run(command, check=False)
    except OSError as error:
        # os.system never raised when the binary could not be started, so
        # keep the call non-raising and just record what happened.
        self.log.warning(f"Could not run pdftotext for '{self.name}': {error}")

    with open(provisory_file_name, 'r') as f:
        return f.read()
def __init__(self, path=None):
    """Set up file logging when a log file path is supplied.

    With a falsy ``path`` nothing is configured and ``logger_path`` is
    left unset. Otherwise the path is stored on the instance, the file is
    prepared through ``create_file`` and the ``logging`` module is
    configured to write INFO-level records to it.
    """
    if not path:
        return

    self.logger_path = path
    create_file(self.logger_path)
    logging.basicConfig(
        level=logging.INFO,
        format='[%(levelname)s]: %(asctime)s - %(message)s',
        filename=self.logger_path,
    )
def save_text(filename, text, probability):
    """Store ``text`` in the folder of its most probable language.

    ``probability`` maps language slugs to scores. The entry with the
    highest score is selected and, when that score is accepted by
    ``probability_criteria_check``, the text is written to
    ``<PROBABLE_DOCS_PER_LANGUAGE_PATH><slug>/<filename>``.

    Returns 1 when the file was written and 0 otherwise (no scores were
    given, or the best score was rejected).
    """
    scores = probability.items()
    if not scores:
        return 0

    language_slug, best_score = max(scores, key=lambda pair: pair[1])
    if not probability_criteria_check(best_score):
        return 0

    file_path = f"{PROBABLE_DOCS_PER_LANGUAGE_PATH}{language_slug}/{filename}"
    create_file(file_path)
    with open(file_path, 'w') as out:
        log.info(f"Saving text in '{language_slug}' probable docs folder as '{filename}'")
        out.write(text)
    return 1
def process_language_bootstrap_files(language):
    """Feed every bootstrap ``.txt`` file of a language into its assets.

    Looks for ``*.txt`` files directly inside
    ``<BOOTSTRAP_PATH><slug>/``. Each file's content is passed to
    ``add_to_language_assets`` and the file is then moved into a
    ``processed`` sub-folder next to it. When no files are found only a
    warning is logged.

    ``language`` is a mapping that provides the language's ``'slug'``.
    """
    slug = language['slug']
    base_path = f"{BOOTSTRAP_PATH}{slug}/"
    pending = glob.glob(os.path.join(base_path, '*.txt'))
    log.info(f"Boostraping '{slug}' language with {pending}")

    if not pending:
        log.warning(
            f"There is no files to add to '{slug}' language assets"
        )
        return

    for source_path in pending:
        folder_path, name = os.path.split(source_path)

        with open(source_path, 'r') as source:
            log.info(
                f"Adding '{name}' words to '{slug}' language assets"
            )
            add_to_language_assets(slug, source.read())

        # The content has been consumed; move the file out of the way so
        # the glob above does not pick it up again on a later run.
        new_path = f"{folder_path}/processed/{name}"
        create_file(new_path)
        os.rename(source_path, new_path)
def save_text_file(self, text):
    """Write ``text`` to ``self.txt_path``, prefixed with the source URL.

    The file starts with ``self.url`` followed by four newlines, then
    the given text.
    """
    self.log.info(f"Converting '{self.name}' to '{self.name}.txt'")
    create_file(self.txt_path)
    with open(self.txt_path, 'w') as txt_file:
        txt_file.write(f"{self.url}\n\n\n\n{text}")
def _create_language_assets(self, path):
    """Create a language's asset folder and its two asset files.

    ``path`` is used as a plain string prefix for the file names, so it
    is expected to already end with a path separator.
    """
    create_folder(path)

    dictionary_file = f"{path}dictionary.json"
    words_file = f"{path}words_list"

    # The second argument is presumably the initial file content (an
    # empty JSON object) -- confirm against create_file.
    create_file(dictionary_file, '{}')
    create_file(words_file)
def mark_file_as_processed(filename):
    """Move ``filename`` into the processed-documents folder.

    The file keeps its base name; the destination is prepared with
    ``create_file`` before the rename.
    """
    base_name = os.path.basename(filename)
    log.info(f"Marking '{base_name}' as processed")

    destination = f"{DOCS_PROCESSED_PATH}{base_name}"
    create_file(destination)
    os.rename(filename, destination)