import csv
import logging
from os.path import basename, dirname, join, relpath, splitext
from re import compile as re_compile
from sys import stderr

# Grouping, AccessionDirectory, and AccessionItem are project-local classes;
# their import is not shown in this source (hypothetical stubs are sketched
# after the two main() variants below).


# Variant 1: report on the mvol-00002 file listing, writing a per-issue CSV,
# a per-issue pages CSV, and an errors report.
def main():
    try:
        record = '0002'
        files_list = open("/media/sf_shared_with_ubuntu_guest_os/" +
                          "mvol-00002-files-v2.txt", 'r').readlines()
        accession_set = []
        output = {}
        g = Grouping()
        rep_labels = set([])

        # Pass 1: classify every file in the listing as a page file or an
        # issue-level representation.
        for n in files_list:
            fullpath_dir = dirname(n)
            relpath_dir = relpath(fullpath_dir, '/media/repo/repository/ac/')
            accession = relpath_dir.split('/')[0]
            i = AccessionDirectory(join('/media/repo/repository/ac/', accession),
                                   '/media/repo/repository/ac/')
            a = AccessionItem(n, '/media/repo/repository/ac/')
            the_file_path = a.get_file_path().strip()
            a.set_canonical_filepath(a.find_canonical_filepath())
            pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
            matches = pattern.search(a.get_canonical_filepath())
            if matches:
                identifier = '-'.join([matches.group(1), matches.group(2),
                                       matches.group(3), matches.group(4)])
                limb_files_required = [
                    ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                    ('struct', r'{identifier}.struct.txt$'.format(identifier=identifier)),
                    ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                    ('tiff', r'TIFF/%s_\d{4,}.tif$' % identifier),
                    ('jpeg', r'JPEG/%s_\d{4,}.jpg$' % identifier),
                    ('xml', r'XML/%s_\d{4,}.xml$' % identifier),
                    ('alto', r'ALTO/%s_\d{4,}.\w{3,4}$' % identifier)
                ]
                prelimb_files_required = [
                    ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                    ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                    ('tiff', r'tif/\d{8,}.tif$'),
                    ('jpeg', r'jpg/\d{8,}.jpg$'),
                    # fixed: this pattern read r'xml/\d{{8,}.xml$', whose
                    # unbalanced brace meant it could never match
                    ('xml', r'xml/\d{8,}.xml$'),
                    ('pos', r'pos/\d{8,}.pos$')
                ]
                page_num_file_parts = [r'(\d{8})', r'_(\d{4})']
                files_required = []
                files_required.extend(limb_files_required)
                files_required.extend(prelimb_files_required)
                # n is rebound here from the raw listing line to the issue item
                n = g.does_it_need_a_new_item(identifier)
                for tup in files_required:
                    pattern = re_compile(tup[1])
                    label = tup[0]
                    search = pattern.search(a.get_canonical_filepath())
                    if search:
                        check = True
                        is_page = False
                        for ppart in page_num_file_parts:
                            page_pattern = re_compile(ppart)
                            page_search = page_pattern.search(
                                a.get_canonical_filepath())
                            if page_search:
                                is_page = True
                                n.add_page(a, label, page_search.group(1),
                                           splitext(basename(
                                               a.get_canonical_filepath()))[0])
                        if not is_page:
                            n.add_representation(a, label)
            else:
                logging.error("could not match file {filename}".format(
                    filename=the_file_path))
        g.sort_items()

        # Pass 2: write the per-issue report. Header order matches the
        # struct/txt/pdf loop below; the original listed the first two
        # labels in the opposite order and misspelled '.struct.txt'.
        rep_labels = ['is there a .struct.txt structural file',
                      'is there a .txt structural file',
                      'is there a pdf file']
        errors = {}
        errors_txt_file = "errors-%s.txt" % record
        error_file = open(errors_txt_file, "a")
        mvol_csv = "mvol-%s.csv" % record
        with open(mvol_csv, 'w') as csvfile:
            recwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                                   quoting=csv.QUOTE_ALL)
            header = ['issue identifier']
            header.extend(rep_labels)
            header.extend(['which accessions have files for this issue'])
            header.extend(['were page files found',
                           'number of pages in structural metadata',
                           'number of pages on disk'])
            recwriter.writerow(header)
            for n in g.items:
                repnum = ""
                rowstring = []
                if 'Thum' in n.identifier or 'tif' in n.identifier or \
                        'test' in n.identifier:
                    continue
                rowstring.append(n.identifier)
                for rep in ['struct', 'txt', 'pdf']:
                    l = getattr(n, rep, None)
                    if l:
                        num_value = "Y"
                    else:
                        num_value = "N"
                        errors.setdefault(n.identifier, []).append(
                            "missing {rep}".format(rep=rep))
                        # fixed: the original wrote the stale loop variable
                        # `identifier` here rather than this item's identifier
                        error_file.write("\"{id}\",\"missing {rep}\"\n".format(
                            id=n.identifier, rep=rep))
                    rowstring.append(str(num_value))
                # Structural metadata lives in the .struct.txt file when
                # present, otherwise in the plain .txt file.
                s1 = getattr(n, 'struct', None)
                s2 = getattr(n, 'txt', None)
                if s1:
                    struct_mdata_files = s1
                elif s2:
                    struct_mdata_files = s2
                else:
                    # fixed: struct_mdata_files was left undefined (or stale
                    # from the previous item) when neither attribute was set
                    struct_mdata_files = []
                if isinstance(struct_mdata_files, list):
                    struct_mdata_files = [x.get_file_path().strip()
                                          for x in struct_mdata_files]
                else:
                    # fixed: the original referenced the undefined name `x`
                    struct_mdata_files = [struct_mdata_files.get_file_path().strip()]
                objids = []
                for struct_mdata_file in struct_mdata_files:
                    try:
                        with open(struct_mdata_file, 'r') as fp:
                            lines = fp.readlines()
                        relevant_lines = lines[1:]
                        objids = [x.replace('\t', '').strip('\n')
                                  for x in relevant_lines]
                    except UnicodeDecodeError:
                        stderr.write("{mfile} couldn't be opened\n".format(
                            mfile=struct_mdata_file))
                        objids = []
                rowstring.append(','.join(list(n.accessions)))
                are_pages = 'Y' if len(n.pages) > 0 else 'N'
                rowstring.append(are_pages)
                rowstring.append(len(objids))
                rowstring.append(len(n.pages))
                if len(n.pages) == 0:
                    errors.setdefault(n.identifier, []).append("no pages")
                    error_file.write("\"{id}\",\"no pages\"\n".format(
                        id=n.identifier))
                recwriter.writerow(rowstring)

                # Per-issue page report.
                pages_csv = "%s-pages.csv" % n.identifier
                pages_errors_csv = "%s-pages-errors.csv" % n.identifier  # unused in this version
                with open(pages_csv, 'w') as pagescsvfile:
                    pageswriter = csv.writer(pagescsvfile, delimiter=',',
                                             quotechar='"',
                                             quoting=csv.QUOTE_ALL)
                    pageswriter.writerow(['object', 'issue identifier',
                                          'is there an ocr file',
                                          'is there a jpeg file',
                                          'is there a tiff file'])
                    sorted_pages = sorted(n.pages)
                    for p in sorted_pages:
                        error_row = []  # unused in this version
                        ocr_there = "Y" if getattr(p, 'pos', None) or \
                            getattr(p, 'alto', None) or \
                            getattr(p, 'xml', None) else "N"
                        jpeg_there = "Y" if getattr(p, 'jpeg', None) else "N"
                        tiff_there = "Y" if getattr(p, 'tiff', None) else "N"
                        pageswriter.writerow([p.objectpage, n.identifier,
                                              ocr_there, jpeg_there,
                                              tiff_there])
        error_file.close()  # fixed: the error log was never closed
        errors_csv_filename = "errors-%s" % record
        with open(errors_csv_filename, "w") as errorfile:
            newrecwriter = csv.writer(errorfile, delimiter=',', quotechar='"',
                                      quoting=csv.QUOTE_ALL)
            for key, value in errors.items():
                newrecwriter.writerow([key, ','.join(value)])
        return 0
    except KeyboardInterrupt:
        return 131
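# ---------------------------------------------------------------------------
# Illustration (not part of the original script): how the canonical-path
# regex used by both main() variants derives an issue identifier. The sample
# path in the doctest is hypothetical.
# ---------------------------------------------------------------------------
def _demo_identifier_regex():
    """
    >>> _demo_identifier_regex()
    'mvol-0002-0003-0004'
    """
    pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
    matches = pattern.search('mvol/0002/0003/0004/mvol-0002-0003-0004.pdf')
    return '-'.join([matches.group(1), matches.group(2),
                     matches.group(3), matches.group(4)])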
# Variant 2: report on the mvol-0001 file listing, tracking FITS companion
# files alongside each representation. Note: if both variants live in one
# module, this second definition of main() shadows the first.
def main():
    try:
        files_list = open("/media/sf_shared_with_ubuntu_guest_os/" +
                          "mvol-0001-files.txt", 'r').readlines()
        accession_set = []
        output = {}
        g = Grouping()
        rep_labels = set([])
        for n in files_list:
            fullpath_dir = dirname(n)
            relpath_dir = relpath(fullpath_dir, '/media/repo/repository/ac/')
            accession = relpath_dir.split('/')[0]
            i = AccessionDirectory(join('/media/repo/repository/ac/', accession),
                                   '/media/repo/repository/ac/')
            a = AccessionItem(n, '/media/repo/repository/ac/')
            the_file_path = a.get_file_path().strip()
            a.set_canonical_filepath(a.find_canonical_filepath())
            pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
            matches = pattern.search(a.get_canonical_filepath())
            if matches:
                identifier = '-'.join([matches.group(1), matches.group(2),
                                       matches.group(3), matches.group(4)])
                limb_files_required = [
                    ('dc', r'{identifier}.dc.xml$'.format(identifier=identifier)),
                    ('dc-fits', r'{identifier}.dc.xml.fits.xml$'.format(
                        identifier=identifier)),
                    ('mets-fits', r'{identifier}.mets.xml.fits.xml$'.format(
                        identifier=identifier)),
                    ('mets', r'{identifier}.mets.xml$'.format(identifier=identifier)),
                    ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                    ('txt-fits', r'{identifier}.txt.fits.xml$'.format(
                        identifier=identifier)),
                    ('struct', r'{identifier}.struct.txt$'.format(
                        identifier=identifier)),
                    ('struct-fits', r'{identifier}.struct.txt.fits.xml$'.format(
                        identifier=identifier)),
                    ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                    ('pdf-fits', r'{identifier}.pdf.fits.xml$'.format(
                        identifier=identifier)),
                    ('tiff', r'TIFF/%s_\d{4}.tif$' % identifier),
                    ('tiff-fits', r'TIFF/%s_\d{4}.tif.fits.xml$' % identifier),
                    ('jpeg', r'JPEG/%s_\d{4}.jpg$' % identifier),
                    ('jpeg-fits', r'JPEG/%s_\d{4}.jpg.fits.xml$' % identifier),
                    ('alto', r'ALTO/%s_\d{4}.\w{3,4}$' % identifier),
                    ('alto-fits', r'ALTO/%s_\d{4}.\w{3,4}.fits.xml$' % identifier)
                ]
                prelimb_files_required = [
                    ('dc', r'{identifier}.dc.xml$'.format(identifier=identifier)),
                    ('dc-fits', r'{identifier}.dc.xml.fits.xml$'.format(
                        identifier=identifier)),
                    ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                    ('pdf-fits', r'{identifier}.pdf.fits.xml$'.format(
                        identifier=identifier)),
                    ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                    ('txt-fits', r'{identifier}.txt.fits.xml$'.format(
                        identifier=identifier)),
                    ('tif', r'tif/\d{8}.tif$'),
                    # fixed: this pattern was missing its raw-string prefix
                    ('tif-fits', r'tif/\d{8}.tif.fits.xml$'),
                    ('jpeg', r'jpg/\d{8}.jpg$'),
                    ('jpg-fits', r'jpg/\d{8}.jpg.fits.xml$'),
                    ('pos', r'pos/\d{8}.pos$'),
                    ('pos-fits', r'pos/\d{8}.pos.fits.xml$')
                ]
                page_num_file_parts = [r'(\d{8})', r'_(\d{4})']
                files_required = []
                files_required.extend(limb_files_required)
                files_required.extend(prelimb_files_required)
                # n is rebound here from the raw listing line to the issue item
                n = g.does_it_need_a_new_item(identifier)
                for tup in files_required:
                    pattern = re_compile(tup[1])
                    label = tup[0]
                    search = pattern.search(a.get_canonical_filepath())
                    if search:
                        check = True
                        is_page = False
                        for ppart in page_num_file_parts:
                            page_pattern = re_compile(ppart)
                            page_search = page_pattern.search(
                                a.get_canonical_filepath())
                            if page_search:
                                is_page = True
                                n.add_page(a, label, page_search.group(1))
                        if not is_page:
                            rep_labels.add(label)
                            n.add_representation(a, label)
            else:
                logging.error("could not match file {filename}".format(
                    filename=the_file_path))
        g.sort_items()
        rep_labels = list(rep_labels)
        with open('mvol-0001.csv', 'w') as csvfile:
            recwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                                   quoting=csv.QUOTE_ALL)
            header = ['identifier']
            header.extend(rep_labels)
            header.extend(['accession'])
            header.extend(['numpages'])
            recwriter.writerow(header)
            for n in g.items:
                repnum = ""
                rowstring = []
                rowstring.append(n.identifier)
                # Note: one count column is written per representation this
                # item actually has, which may not line up with the
                # rep_labels header when an item lacks some labels.
                for rep in n.representations:
                    if getattr(n, rep):
                        repnum = len(getattr(n, rep))
                    else:
                        print(getattr(n, rep))
                        repnum = 0
                    rowstring.append(str(repnum))
                rowstring.append(','.join(list(n.accessions)))
                rowstring.append(str(len(n.pages)))
                recwriter.writerow(rowstring)
        # fixed: the original called recwriter.close(), but csv.writer
        # objects have no close(); the with-block closes the file
        return 0
    except KeyboardInterrupt:
        return 131
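# ---------------------------------------------------------------------------
# The classes below are NOT part of the original source: Grouping,
# AccessionDirectory, and AccessionItem are project-local, and their real
# definitions are not shown in this file. These are minimal, hypothetical
# stubs sketching only the interface the two main() variants call, so the
# script can be exercised in isolation; the real classes presumably do more
# (for example, they must populate each item's accessions set, which these
# stubs leave empty).
# ---------------------------------------------------------------------------
class AccessionDirectory(object):
    """Hypothetical stub: a directory under the accession root."""
    def __init__(self, path, root):
        self.path = path
        self.root = root


class AccessionItem(object):
    """Hypothetical stub: one file belonging to an accession."""
    def __init__(self, filepath, root):
        self.filepath = filepath.strip()
        self.root = root
        self.canonical_filepath = None

    def get_file_path(self):
        return self.filepath

    def find_canonical_filepath(self):
        # Assumption: the canonical path is the path relative to the
        # accession root with the leading accession segment removed; the
        # real rule is not shown in this source.
        parts = relpath(self.filepath, self.root).split('/')
        return '/'.join(parts[1:]) if len(parts) > 1 else parts[0]

    def set_canonical_filepath(self, path):
        self.canonical_filepath = path

    def get_canonical_filepath(self):
        return self.canonical_filepath


class Page(object):
    """Hypothetical stub: one page's files within an issue."""
    def __init__(self, objectpage):
        self.objectpage = objectpage

    def __lt__(self, other):
        # Ordering by objectpage makes sorted(n.pages) work in main().
        return self.objectpage < other.objectpage


class Item(object):
    """Hypothetical stub: one issue, accumulating representations and pages."""
    def __init__(self, identifier):
        self.identifier = identifier
        self.accessions = set()
        self.representations = []
        self._pages_by_number = {}

    @property
    def pages(self):
        return list(self._pages_by_number.values())

    def add_representation(self, accession_item, label):
        # Representations are stored as lists on attributes named by label,
        # which is what the getattr() calls in both main() variants expect.
        if getattr(self, label, None) is None:
            setattr(self, label, [])
            self.representations.append(label)
        getattr(self, label).append(accession_item)

    def add_page(self, accession_item, label, page_number, objectpage=None):
        page = self._pages_by_number.setdefault(
            page_number, Page(objectpage or page_number))
        setattr(page, label, accession_item)


class Grouping(object):
    """Hypothetical stub: collects Items by identifier."""
    def __init__(self):
        self.items = []
        self._by_identifier = {}

    def does_it_need_a_new_item(self, identifier):
        if identifier not in self._by_identifier:
            item = Item(identifier)
            self._by_identifier[identifier] = item
            self.items.append(item)
        return self._by_identifier[identifier]

    def sort_items(self):
        # Assumption: items are sorted by identifier.
        self.items.sort(key=lambda item: item.identifier)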
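# ---------------------------------------------------------------------------
# Assumed entry point (the original source shows none). Because both
# variants above are named main(), in a single module the later definition
# shadows the earlier one, so this guard runs the mvol-0001 variant only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    raise SystemExit(main())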