def getParser(self, fileHash, folder=None, original=False):
    """Return a parser for the stored file named ``fileHash``.

    When the file is found only in a submit folder, a cleaned version is
    generated and stored if the matching cleaned folder does not hold one yet.

    :param fileHash: file name (the hash) to look up inside the folders
    :param folder: if given, look only in this folder; no other folder is
        searched and no cleaned version is generated
    :param original: if true, skip the cleaned folders and parse the
        submitted file
    :return: a ``YACParser`` for the file, or ``None`` if it was not found
    """
    if folder:
        file_path = os.path.join(folder, fileHash)
        if not os.path.exists(file_path):
            return None
        # Files located in one of the cleaned folders are opened without
        # encoding guessing; anything else goes through the default path.
        if folder in self.cleaned_folder.values():
            return YACParser(filename=file_path, skip_guess_encoding=True)
        return YACParser(filename=file_path)

    if not original:
        # Prefer an already cleaned copy from any of the cleaned folders.
        for cleaned_dir in self.cleaned_folder.values():
            cleaned_path = os.path.join(cleaned_dir, fileHash)
            if os.path.exists(cleaned_path):
                return YACParser(filename=cleaned_path, skip_guess_encoding=True)

    for key in self.submit_folder:
        submit_path = os.path.join(self.submit_folder[key], fileHash)
        if not os.path.exists(submit_path):
            continue
        table = YACParser(filename=submit_path)
        # A submit folder without a cleaned counterpart is tolerated: the
        # parser is still returned, only the cleaned copy is not written.
        if key in self.cleaned_folder:
            cleaned_dir = self.cleaned_folder[key]
            if not os.path.exists(os.path.join(cleaned_dir, fileHash)):
                # storeContent receives the target *folder* plus the md5 that
                # names the file, matching how submit() calls it, so the copy
                # ends up at <cleaned_dir>/<fileHash> where lookups expect it.
                storeContent(table.generate(), cleaned_dir, md5=fileHash)
        return table
    return None
def submit(self, file=None, url=None, content=None, toFolder=None):
    """Store submitted data and make sure a cleaned version exists.

    1) the data is stored in the submit folder registered under
       ``toFolder``; the storing helper returns the md5 used as file name
    2) if the cleaned folder has no file with that md5 yet, the submitted
       file is parsed and the generated output is stored there

    Exactly one source is used, checked in the order file, url, content.

    :param file: file to store (passed to ``storeFile``)
    :param url: URL to store (passed to ``storeURL`` together with
        ``self.max_file_size``)
    :param content: content to store (passed to ``storeContent``)
    :param toFolder: key that must be present in both ``self.submit_folder``
        and ``self.cleaned_folder``
    :return: the md5 of the original file, or ``None`` if ``toFolder`` is
        unknown or no source was given
    """
    if toFolder not in self.submit_folder or toFolder not in self.cleaned_folder:
        return None
    submit_dir = self.submit_folder[toFolder]
    cleaned_dir = self.cleaned_folder[toFolder]

    if file:
        digest = storeFile(file, submit_dir)
    elif url:
        digest = storeURL(url, submit_dir, max_file_size=self.max_file_size)
    elif content:
        digest = storeContent(content, submit_dir)
    else:
        return None

    source_path = os.path.join(submit_dir, digest)
    target_path = os.path.join(cleaned_dir, digest)

    # Only generate the cleaned version when none has been stored before.
    if not os.path.exists(target_path):
        parser = YACParser(filename=source_path)
        storeContent(parser.generate(), cleaned_dir, md5=digest)
    return digest