# Imports assumed by the snippets below; re.compile is aliased to match the
# re_compile calls used throughout. Project-level names such as Item, Batch,
# Grouping, Output, AccessionDirectory, generate_pattern_list, csv_headers,
# page_csv_headers, page_num_file_parts, and errors are expected to come
# from the surrounding codebase.
import logging
import shutil
from argparse import ArgumentParser
from collections import namedtuple
from os.path import basename, dirname, exists, join, relpath, splitext
from re import compile as re_compile
from sys import stderr


def walk_database_query_picking_files(self):
    """Yield an Item for each row in the database query, carrying over
    whatever remote metadata (checksum, size, mimetype) the row records."""
    for n in self.query:
        item = Item(join(self.root, n.accession, n.filepath), self.root)
        if getattr(n, 'checksum', None):
            item.remote_hash = n.checksum
        if getattr(n, 'size', None):
            item.remote_size = n.size
        if getattr(n, 'mimetype', None):
            item.remote_mimetype = n.mimetype
        yield item
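
# Consumption sketch (hypothetical) for the generator above: walk the yielded
# Items and flag any whose freshly computed sha256 disagrees with the checksum
# recorded in the database. The helper name is made up; find_sha256 mirrors
# the method used in copy_into_new_location below.
def _verify_against_remote(walker):
    mismatches = []
    for item in walker:
        remote = getattr(item, 'remote_hash', None)
        if remote and remote != item.find_sha256():
            mismatches.append(item)
    return mismatches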


def copy_into_new_location(self):
    """Copy this item's file to its destination, set ownership, and verify
    the copy by checksum. Returns a (status, message) namedtuple."""
    Result = namedtuple("Result", "status message")
    self.copy_source_directory_tree_to_destination()
    assert exists(dirname(self.destination))
    shutil.copyfile(self.filepath, self.destination)
    if not exists(self.destination):
        return Result("Bad", "destination could not be created")
    new = Item(self.destination)
    # The original called self.destination(self.group), which would invoke a
    # string; assuming a group-setting counterpart to destination_ownership
    # was intended.
    self.destination_ownership(self.owner)
    self.destination_group(self.group)
    if new.find_sha256() == self.get_sha256():
        return Result("Good", "")
    return Result("Bad", "source checksum and destination checksum mismatch")
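
# Sketch of a caller (hypothetical), assuming `item` is an Item whose
# filepath, destination, owner, and group attributes are already set.
def _copy_and_report(item):
    result = item.copy_into_new_location()
    if result.status == "Bad":
        stderr.write("copy failed: {msg}\n".format(msg=result.message))
    return result.status == "Good"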


def createMetaAccession(self):
    """Collapse all accessions into a single Batch, keeping only the newest
    copy of each canonical filepath."""
    metaAccessionDict = {}
    for accession in self.accessions:
        for item in accession.get_items():
            # Clobber entries from previous accessions, keep the newest
            metaAccessionDict[item.get_canonical_filepath()] = \
                item.get_root_path()
    metaAccession = Batch()
    for canonicalFilepath, rootPath in metaAccessionDict.items():
        metaAccession.add_item(Item(join(rootPath, canonicalFilepath)))
    return metaAccession
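
# Standalone illustration (hypothetical paths) of the "last accession wins"
# rule used by createMetaAccession above: when two accessions hold the same
# canonical filepath, the root recorded last replaces the earlier one.
def _demo_clobbering():
    entries = [("mvol/0001/0002/0003/0001.tif", "/repo/accession_2016"),
               ("mvol/0001/0002/0003/0001.tif", "/repo/accession_2017")]
    newest = {}
    for canonical, root in entries:
        newest[canonical] = root
    assert newest["mvol/0001/0002/0003/0001.tif"] == "/repo/accession_2017"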


def __init__(self, path, root):
    Item.__init__(self, path, root)


def main():
    try:
        parser = ArgumentParser(description="", epilog="")
        parser.add_argument("record_number", help="", action="store")
        parser.add_argument("file_list", help="", action="store")
        args = parser.parse_args()
        g = Grouping()
        try:
            with open(args.file_list, 'r') as flist:
                files_list = flist.readlines()
        except OSError:
            stderr.write("could not open file {flist}\n".format(
                flist=args.file_list))
            return 1
        repo_root = '/media/repo/repository/tr/'
        for n_file in files_list:
            n_file = n_file.strip()  # readlines() leaves a trailing newline
            fullpath_dir = dirname(n_file)
            relpath_dir = relpath(fullpath_dir, repo_root)
            accession = relpath_dir.split('/')[0]
            # Retained from the original; i is not used below.
            i = AccessionDirectory(join(repo_root, accession), repo_root)
            a = Item(n_file)
            setattr(a, 'accession', accession)
            the_file_path = a.get_file_path().strip()
            canonical_file_path = the_file_path.split(accession)[1].lstrip('/')
            pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
            matches = pattern.search(canonical_file_path)
            if matches:
                identifier = '-'.join([matches.group(1), matches.group(2),
                                       matches.group(3), matches.group(4)])
                files_required = generate_pattern_list(identifier)
                n = g.does_it_need_a_new_item(identifier)
                for label, file_pattern in files_required:
                    search = re_compile(file_pattern).search(a.get_file_path())
                    if not search:
                        continue
                    is_page = False
                    for ppart in page_num_file_parts:
                        page_search = re_compile(ppart).search(
                            a.get_file_path())
                        if page_search:
                            is_page = True
                            n.add_page(a, label, page_search.group(1),
                                       splitext(basename(
                                           a.get_file_path()))[0])
                    if not is_page:
                        n.add_representation(a, label)
                g.sort_items()
            else:
                logging.error("could not match file {filename}".format(
                    filename=the_file_path))
        mvol_csv = "mvol-%s.csv" % args.record_number
        main_csv = Output(mvol_csv)
        main_csv.add_record(csv_headers)
        for n in g.items:
            if 'Thum' in n.identifier or 'tif' in n.identifier \
                    or 'test' in n.identifier:
                continue
            id_parts = n.identifier.split('-')
            has_new_struct = 'N'
            has_old_struct = 'N'
            has_pdf = 'N'
            has_pages = 'Y' if getattr(n, 'pages', None) else 'N'
            struct_objids = []
            txt_objids = []
            struct_mdata_files = []  # left unbound in the original when no branch matched
            pdf = getattr(n, 'pdf', None)
            new_mdata = getattr(n, 'struct', None)
            old_mdata = getattr(n, 'txt', None)
            if pdf:
                has_pdf = 'Y'
            else:
                # The original referenced an undefined name `rep` here and
                # assumed the errors entry already existed.
                errors.setdefault(n.identifier, []).append("missing pdf")
            if new_mdata and isinstance(new_mdata, list):
                has_new_struct = 'Y'
                struct_mdata_files = [x.get_file_path().strip()
                                      for x in new_mdata]
            elif new_mdata and isinstance(new_mdata, Item):
                has_new_struct = 'Y'
                struct_mdata_files = [new_mdata.get_file_path().strip()]
            elif old_mdata and isinstance(old_mdata, list):
                has_old_struct = 'Y'
                struct_mdata_files = [x.get_file_path().strip()
                                      for x in old_mdata]
            elif old_mdata and isinstance(old_mdata, Item):
                has_old_struct = 'Y'
                struct_mdata_files = [old_mdata.get_file_path().strip()]
            for struct_mdata_file in struct_mdata_files:
                try:
                    with open(struct_mdata_file, 'r') as fp:
                        lines = fp.readlines()
                    # Keep only lines that split into exactly three
                    # tab-separated fields; the first line is a header.
                    lines = [x for x in lines if len(x.split('\t')) == 3]
                    objids = [x.replace('\t', '').strip('\n')
                              for x in lines[1:]]
                    if has_new_struct == 'Y':
                        struct_objids = objids
                    elif has_old_struct == 'Y':
                        txt_objids = objids
                except (OSError, UnicodeDecodeError):
                    stderr.write("{mfile} could not be read\n".format(
                        mfile=struct_mdata_file))
            record = [n.identifier, id_parts[0], id_parts[1], id_parts[2],
                      id_parts[3], ','.join(list(n.accessions)),
                      has_old_struct, has_new_struct, has_pdf, has_pages,
                      len(txt_objids), len(struct_objids), len(n.pages)]
            main_csv.add_record(record)
            issue_csv = Output("%s-pages.csv" % n.identifier)
            issue_csv.add_record(page_csv_headers)
            for page in sorted(n.pages):
                ocr_there = "Y" if getattr(page, 'pos', None) \
                    or getattr(page, 'alto', None) \
                    or getattr(page, 'xml', None) else "N"
                jpeg_there = "Y" if getattr(page, 'jpeg', None) else "N"
                tiff_there = "Y" if getattr(page, 'tiff', None) else "N"
                try:
                    x = page.objectpage.split('_')[1].lstrip('0')
                    object_identifier = x.zfill(8)
                except (IndexError, AttributeError):
                    object_identifier = page.objectpage
                page_record = [object_identifier, page.objectpage,
                               n.identifier, ocr_there, jpeg_there,
                               tiff_there]
                issue_csv.add_record(page_record)
        return 0
    except KeyboardInterrupt:
        return 131
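
# Minimal entry-point sketch, assuming this module is run directly; exit
# codes follow main(): 0 on success, 1 for an unreadable file list, and
# 131 on KeyboardInterrupt.
if __name__ == "__main__":
    import sys
    sys.exit(main())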