def main():
    """Audit the files listed for mvol record 0002 and write CSV reports.

    The function works in two phases:

    1. Ingest: read the file listing, wrap every listed path in an
       ``AccessionItem`` and, when its canonical path contains
       ``mvol/XXXX/XXXX/XXXX``, attach it to the ``Grouping`` item for that
       issue, either as a page file or as an issue-level representation.
       Paths that do not contain an issue identifier are logged as errors.
    2. Report: walk the grouped items and write

       * ``mvol-0002.csv``          -- one row per issue,
       * ``<identifier>-pages.csv`` -- one row per page of each issue,
       * ``errors-0002.txt``        -- one line per problem (appended to),
       * ``errors-0002``            -- one row per issue that has problems
         (rewritten on every run).

    Returns:
        int: 0 when the run completes, 131 when interrupted with Ctrl-C.
    """
    record = '0002'
    repo_root = '/media/repo/repository/ac/'
    try:
        # Read the listing inside a context manager so the handle is closed.
        with open("/media/sf_shared_with_ubuntu_guest_os/" +
                  "mvol-00002-files-v2.txt", 'r') as listing:
            files_list = listing.readlines()

        g = Grouping()
        # Loop-invariant patterns are compiled once, outside the loop.
        issue_pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
        page_patterns = [re_compile(r'(\d{8})'), re_compile(r'_(\d{4})')]

        for listed_path in files_list:
            accession = relpath(dirname(listed_path), repo_root).split('/')[0]
            # NOTE(review): the directory object is never used afterwards; it
            # is still constructed in case the constructor validates the
            # accession -- confirm and drop if it has no side effects.
            AccessionDirectory(join(repo_root, accession), repo_root)
            a = AccessionItem(listed_path, repo_root)
            the_file_path = a.get_file_path().strip()
            a.set_canonical_filepath(a.find_canonical_filepath())
            canonical = a.get_canonical_filepath()

            matches = issue_pattern.search(canonical)
            if not matches:
                logging.error("could not match file {filename}".format(
                    filename=the_file_path))
                continue
            identifier = '-'.join(matches.groups())

            limb_files_required = [
                ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                ('struct',
                 r'{identifier}.struct.txt$'.format(identifier=identifier)),
                ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                ('tiff', r'TIFF/%s_\d{4,}.tif$' % identifier),
                ('jpeg', r'JPEG/%s_\d{4,}.jpg$' % identifier),
                ('xml', r'XML/%s_\d{4,}.xml$' % identifier),
                ('alto', r'ALTO/%s_\d{4,}.\w{3,4}$' % identifier),
            ]
            prelimb_files_required = [
                ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                ('tiff', r'tif/\d{8,}.tif$'),
                ('jpeg', r'jpg/\d{8,}.jpg$'),
                # Was r'xml/\d{{8,}.xml$': the doubled brace made the regex
                # require literal "{" characters, so it never matched.
                ('xml', r'xml/\d{8,}.xml$'),
                ('pos', r'pos/\d{8,}.pos$'),
            ]

            item = g.does_it_need_a_new_item(identifier)
            # 'txt' and 'pdf' appear in both lists with identical patterns;
            # skip repeats so one file is not registered twice per label.
            seen = set()
            for label, regex in limb_files_required + prelimb_files_required:
                if (label, regex) in seen:
                    continue
                seen.add((label, regex))
                if not re_compile(regex).search(canonical):
                    continue
                is_page = False
                for page_pattern in page_patterns:
                    page_search = page_pattern.search(canonical)
                    if page_search:
                        is_page = True
                        item.add_page(a, label, page_search.group(1),
                                      splitext(basename(canonical))[0])
                if not is_page:
                    item.add_representation(a, label)

        g.sort_items()

        # (attribute on the issue item, CSV column title).  The values are
        # written in exactly this order so they line up with the header;
        # previously they were emitted as struct/txt/pdf under txt/struct/pdf
        # titles.
        rep_columns = [
            ('txt', 'is there a .txt structural file'),
            ('struct', 'is there a -struct.xt structural file'),
            ('pdf', 'is there a pdf file'),
        ]
        errors = {}
        errors_txt_file = "errors-%s.txt" % record
        mvol_csv = "mvol-%s.csv" % record
        with open(errors_txt_file, "a") as error_file, \
                open(mvol_csv, 'w') as csvfile:
            recwriter = csv.writer(csvfile, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_ALL)
            header = ['issue identifier']
            header.extend(title for _, title in rep_columns)
            header.append('which accessions have files for this issue')
            header.extend(['were page files found',
                           'number of pages in structural metadata',
                           'number of pages on disk'])
            recwriter.writerow(header)

            for item in g.items:
                # Identifiers containing these markers are left out of the
                # report.
                if any(tag in item.identifier
                       for tag in ('Thum', 'tif', 'test')):
                    continue
                rowstring = [item.identifier]

                for rep, _ in rep_columns:
                    if getattr(item, rep, None):
                        rowstring.append("Y")
                        continue
                    rowstring.append("N")
                    message = "missing {rep}".format(rep=rep)
                    errors.setdefault(item.identifier, []).append(message)
                    # Report against the issue being written, not the stale
                    # identifier left over from the ingest loop.
                    error_file.write("\"{id}\",\"missing {rep}\"\n".format(
                        id=item.identifier, rep=rep))

                # Prefer the .struct.txt file(s); fall back to the plain .txt.
                struct_source = (getattr(item, 'struct', None) or
                                 getattr(item, 'txt', None))
                if not struct_source:
                    # Neither file exists: nothing to read (previously this
                    # raised or silently reused the previous issue's files).
                    struct_paths = []
                elif isinstance(struct_source, list):
                    struct_paths = [x.get_file_path().strip()
                                    for x in struct_source]
                else:
                    struct_paths = [struct_source.get_file_path().strip()]

                # When several structural files exist the last readable one
                # determines the count, as before.
                objids = []
                for struct_mdata_file in struct_paths:
                    try:
                        with open(struct_mdata_file, 'r') as fp:
                            lines = fp.readlines()
                        # The first line is skipped; presumably a header row
                        # -- TODO confirm against the struct file format.
                        objids = [x.replace('\t', '').strip('\n')
                                  for x in lines[1:]]
                    except UnicodeDecodeError:
                        stderr.write("{mfile} couldn't be opened\n".format(
                            mfile=struct_mdata_file))
                        objids = []

                rowstring.append(','.join(list(item.accessions)))
                rowstring.append('Y' if len(item.pages) > 0 else 'N')
                rowstring.append(len(objids))
                rowstring.append(len(item.pages))
                if len(item.pages) == 0:
                    errors.setdefault(item.identifier, []).append("no pages")
                    error_file.write("\"{id}\",\"no pages\"\n".format(
                        id=item.identifier))
                recwriter.writerow(rowstring)

                pages_csv = "%s-pages.csv" % item.identifier
                with open(pages_csv, 'w') as pagescsvfile:
                    pageswriter = csv.writer(pagescsvfile, delimiter=',',
                                             quotechar='"',
                                             quoting=csv.QUOTE_ALL)
                    pageswriter.writerow(['object', 'issue identifier',
                                          'is there an ocr file',
                                          'is there a jpeg file',
                                          'is there a tiff file'])
                    for p in sorted(item.pages):
                        # Any one of pos / alto / xml counts as OCR.
                        has_ocr = (getattr(p, 'pos', None) or
                                   getattr(p, 'alto', None) or
                                   getattr(p, 'xml', None))
                        pageswriter.writerow([
                            p.objectpage,
                            item.identifier,
                            "Y" if has_ocr else "N",
                            "Y" if getattr(p, 'jpeg', None) else "N",
                            "Y" if getattr(p, 'tiff', None) else "N",
                        ])

        errors_csv_filename = "errors-%s" % record
        with open(errors_csv_filename, "w") as errorfile:
            newrecwriter = csv.writer(errorfile, delimiter=',',
                                      quotechar='"', quoting=csv.QUOTE_ALL)
            for key, value in errors.items():
                newrecwriter.writerow([key, ','.join(value)])

        return 0
    except KeyboardInterrupt:
        return 131
# Example #2
0
def main():
    """Count the representations and pages found for each mvol-0001 issue.

    Reads the file listing, wraps every listed path in an ``AccessionItem``
    and, when its canonical path contains ``mvol/XXXX/XXXX/XXXX``, attaches it
    to the ``Grouping`` item for that issue as a page file or an issue-level
    representation.  Paths without an issue identifier are logged as errors.
    Afterwards ``mvol-0001.csv`` is written with one row per issue: the
    identifier, the number of files found for every representation label seen
    in the run, the accessions holding files for the issue and the page count.

    Returns:
        int: 0 when the run completes, 131 when interrupted with Ctrl-C.
    """
    repo_root = '/media/repo/repository/ac/'
    try:
        # Read the listing inside a context manager so the handle is closed.
        with open("/media/sf_shared_with_ubuntu_guest_os/" +
                  "mvol-0001-files.txt", 'r') as listing:
            files_list = listing.readlines()

        g = Grouping()
        rep_labels = set()
        # Loop-invariant patterns are compiled once, outside the loop.
        issue_pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
        page_patterns = [re_compile(r'(\d{8})'), re_compile(r'_(\d{4})')]

        for listed_path in files_list:
            accession = relpath(dirname(listed_path), repo_root).split('/')[0]
            # NOTE(review): the directory object is never used afterwards; it
            # is still constructed in case the constructor validates the
            # accession -- confirm and drop if it has no side effects.
            AccessionDirectory(join(repo_root, accession), repo_root)
            a = AccessionItem(listed_path, repo_root)
            the_file_path = a.get_file_path().strip()
            a.set_canonical_filepath(a.find_canonical_filepath())
            canonical = a.get_canonical_filepath()

            matches = issue_pattern.search(canonical)
            if not matches:
                logging.error("could not match file {filename}".format(
                    filename=the_file_path))
                continue
            identifier = '-'.join(matches.groups())

            limb_files_required = [
                ('dc', r'{identifier}.dc.xml$'.format(identifier=identifier)),
                ('dc-fits', r'{identifier}.dc.xml.fits.xml$'.format(
                    identifier=identifier)),
                ('mets-fits', r'{identifier}.mets.xml.fits.xml$'.format(
                    identifier=identifier)),
                ('mets',
                 r'{identifier}.mets.xml$'.format(identifier=identifier)),
                ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                ('txt-fits', r'{identifier}.txt.fits.xml$'.format(
                    identifier=identifier)),
                ('struct', r'{identifier}.struct.txt$'.format(
                    identifier=identifier)),
                ('struct-fits', r'{identifier}.struct.txt.fits.xml$'.format(
                    identifier=identifier)),
                ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                ('pdf-fits', r'{identifier}.pdf.fits.xml$'.format(
                    identifier=identifier)),
                ('tiff', r'TIFF/%s_\d{4}.tif$' % identifier),
                ('tiff-fits', r'TIFF/%s_\d{4}.tif.fits.xml$' % identifier),
                ('jpeg', r'JPEG/%s_\d{4}.jpg$' % identifier),
                ('jpeg-fits', r'JPEG/%s_\d{4}.jpg.fits.xml$' % identifier),
                ('alto', r'ALTO/%s_\d{4}.\w{3,4}$' % identifier),
                ('alto-fits',
                 r'ALTO/%s_\d{4}.\w{3,4}.fits.xml$' % identifier),
            ]
            # NOTE(review): this list labels page images 'tif' / 'jpg-fits'
            # where the list above uses 'tiff' / 'jpeg-fits'; left as found.
            prelimb_files_required = [
                ('dc', r'{identifier}.dc.xml$'.format(identifier=identifier)),
                ('dc-fits', r'{identifier}.dc.xml.fits.xml$'.format(
                    identifier=identifier)),
                ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                ('pdf-fits', r'{identifier}.pdf.fits.xml$'.format(
                    identifier=identifier)),
                ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                ('txt-fits', r'{identifier}.txt.fits.xml$'.format(
                    identifier=identifier)),
                ('tif', r'tif/\d{8}.tif$'),
                # Now a raw string: '\d' in a normal literal is an invalid
                # escape sequence.  The pattern text itself is unchanged.
                ('tif-fits', r'tif/\d{8}.tif.fits.xml$'),
                ('jpeg', r'jpg/\d{8}.jpg$'),
                ('jpg-fits', r'jpg/\d{8}.jpg.fits.xml$'),
                ('pos', r'pos/\d{8}.pos$'),
                ('pos-fits', r'pos/\d{8}.pos.fits.xml$'),
            ]

            item = g.does_it_need_a_new_item(identifier)
            # Several (label, pattern) pairs occur in both lists; skip the
            # repeats so one file is not registered (and counted) twice.
            seen = set()
            for label, regex in limb_files_required + prelimb_files_required:
                if (label, regex) in seen:
                    continue
                seen.add((label, regex))
                if not re_compile(regex).search(canonical):
                    continue
                is_page = False
                for page_pattern in page_patterns:
                    page_search = page_pattern.search(canonical)
                    if page_search:
                        is_page = True
                        item.add_page(a, label, page_search.group(1))
                if not is_page:
                    rep_labels.add(label)
                    item.add_representation(a, label)

        g.sort_items()
        # Sorted so the column order is stable from run to run.
        rep_labels = sorted(rep_labels)
        with open('mvol-0001.csv', 'w') as csvfile:
            recwriter = csv.writer(csvfile,
                                   delimiter=',',
                                   quotechar='"',
                                   quoting=csv.QUOTE_ALL)
            header = ['identifier']
            header.extend(rep_labels)
            header.extend(['accession', 'numpages'])
            recwriter.writerow(header)
            for item in g.items:
                rowstring = [item.identifier]
                # Walk the header's labels, not the item's own list, so every
                # row has one value per column; a label the issue lacks is 0.
                for rep in rep_labels:
                    found = getattr(item, rep, None)
                    rowstring.append(str(len(found) if found else 0))
                rowstring.append(','.join(list(item.accessions)))
                rowstring.append(str(len(item.pages)))
                recwriter.writerow(rowstring)
            # csv.writer objects have no close(); the enclosing ``with``
            # closes the underlying file.
        return 0
    except KeyboardInterrupt:
        return 131
def main():
    """Write ``mvol-0001.csv`` with per-issue representation and page counts.

    Phase one reads the file listing and wraps each listed path in an
    ``AccessionItem``.  When the item's canonical path contains
    ``mvol/XXXX/XXXX/XXXX`` the file is attached to the ``Grouping`` item for
    that issue, as a page when the path also carries a page number and as an
    issue-level representation otherwise; other paths are logged as errors.
    Phase two writes one CSV row per issue: identifier, number of files per
    representation label seen in the run, accessions, and number of pages.

    Returns:
        int: 0 when the run completes, 131 when interrupted with Ctrl-C.
    """
    repo_root = '/media/repo/repository/ac/'
    try:
        # Read the listing inside a context manager so the handle is closed.
        with open("/media/sf_shared_with_ubuntu_guest_os/" +
                  "mvol-0001-files.txt", 'r') as listing:
            files_list = listing.readlines()

        g = Grouping()
        rep_labels = set()
        # Loop-invariant patterns are compiled once, outside the loop.
        issue_pattern = re_compile(r'(mvol)/(\w{4})/(\w{4})/(\w{4})')
        page_patterns = [re_compile(r'(\d{8})'), re_compile(r'_(\d{4})')]

        for listed_path in files_list:
            accession = relpath(dirname(listed_path), repo_root).split('/')[0]
            # NOTE(review): the directory object is never used afterwards; it
            # is still constructed in case the constructor validates the
            # accession -- confirm and drop if it has no side effects.
            AccessionDirectory(join(repo_root, accession), repo_root)
            a = AccessionItem(listed_path, repo_root)
            the_file_path = a.get_file_path().strip()
            a.set_canonical_filepath(a.find_canonical_filepath())
            canonical = a.get_canonical_filepath()

            matches = issue_pattern.search(canonical)
            if not matches:
                logging.error("could not match file {filename}".format(
                    filename=the_file_path))
                continue
            identifier = '-'.join(matches.groups())

            limb_files_required = [
                ('dc', r'{identifier}.dc.xml$'.format(identifier=identifier)),
                ('dc-fits',
                 r'{identifier}.dc.xml.fits.xml$'.format(identifier=identifier)),
                ('mets-fits',
                 r'{identifier}.mets.xml.fits.xml$'.format(identifier=identifier)),
                ('mets',
                 r'{identifier}.mets.xml$'.format(identifier=identifier)),
                ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                ('txt-fits',
                 r'{identifier}.txt.fits.xml$'.format(identifier=identifier)),
                ('struct',
                 r'{identifier}.struct.txt$'.format(identifier=identifier)),
                ('struct-fits',
                 r'{identifier}.struct.txt.fits.xml$'.format(identifier=identifier)),
                ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                ('pdf-fits',
                 r'{identifier}.pdf.fits.xml$'.format(identifier=identifier)),
                ('tiff', r'TIFF/%s_\d{4}.tif$' % identifier),
                ('tiff-fits', r'TIFF/%s_\d{4}.tif.fits.xml$' % identifier),
                ('jpeg', r'JPEG/%s_\d{4}.jpg$' % identifier),
                ('jpeg-fits', r'JPEG/%s_\d{4}.jpg.fits.xml$' % identifier),
                ('alto', r'ALTO/%s_\d{4}.\w{3,4}$' % identifier),
                ('alto-fits', r'ALTO/%s_\d{4}.\w{3,4}.fits.xml$' % identifier),
            ]
            # NOTE(review): this list labels page images 'tif' / 'jpg-fits'
            # where the list above uses 'tiff' / 'jpeg-fits'; left as found.
            prelimb_files_required = [
                ('dc', r'{identifier}.dc.xml$'.format(identifier=identifier)),
                ('dc-fits',
                 r'{identifier}.dc.xml.fits.xml$'.format(identifier=identifier)),
                ('pdf', r'{identifier}.pdf$'.format(identifier=identifier)),
                ('pdf-fits',
                 r'{identifier}.pdf.fits.xml$'.format(identifier=identifier)),
                ('txt', r'{identifier}.txt$'.format(identifier=identifier)),
                ('txt-fits',
                 r'{identifier}.txt.fits.xml$'.format(identifier=identifier)),
                ('tif', r'tif/\d{8}.tif$'),
                # Now a raw string: '\d' in a normal literal is an invalid
                # escape sequence.  The pattern text itself is unchanged.
                ('tif-fits', r'tif/\d{8}.tif.fits.xml$'),
                ('jpeg', r'jpg/\d{8}.jpg$'),
                ('jpg-fits', r'jpg/\d{8}.jpg.fits.xml$'),
                ('pos', r'pos/\d{8}.pos$'),
                ('pos-fits', r'pos/\d{8}.pos.fits.xml$'),
            ]

            item = g.does_it_need_a_new_item(identifier)
            # Several (label, pattern) pairs occur in both lists; skip the
            # repeats so one file is not registered (and counted) twice.
            seen = set()
            for label, regex in limb_files_required + prelimb_files_required:
                if (label, regex) in seen:
                    continue
                seen.add((label, regex))
                if not re_compile(regex).search(canonical):
                    continue
                is_page = False
                for page_pattern in page_patterns:
                    page_search = page_pattern.search(canonical)
                    if page_search:
                        is_page = True
                        item.add_page(a, label, page_search.group(1))
                if not is_page:
                    rep_labels.add(label)
                    item.add_representation(a, label)

        g.sort_items()
        # Sorted so the column order is stable from run to run.
        rep_labels = sorted(rep_labels)
        with open('mvol-0001.csv', 'w') as csvfile:
            recwriter = csv.writer(csvfile, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_ALL)
            header = ['identifier']
            header.extend(rep_labels)
            header.extend(['accession', 'numpages'])
            recwriter.writerow(header)
            for item in g.items:
                rowstring = [item.identifier]
                # Walk the header's labels, not the item's own list, so every
                # row has one value per column; a label the issue lacks is 0.
                for rep in rep_labels:
                    found = getattr(item, rep, None)
                    rowstring.append(str(len(found) if found else 0))
                rowstring.append(','.join(list(item.accessions)))
                rowstring.append(str(len(item.pages)))
                recwriter.writerow(rowstring)
            # csv.writer objects have no close(); the enclosing ``with``
            # closes the underlying file.
        return 0
    except KeyboardInterrupt:
        return 131