    def test_dup_images(self):
        """plotextractor - remove_dups images"""
        images_and_captions = [('img1', 'caption1', 'label1', 'FIXME1'),
                               ('img1', 'caption2', 'label1', 'FIXME1')]

        pared_images_and_captions = remove_dups(images_and_captions)
        self.assertTrue(pared_images_and_captions == [('img1', 'caption1 : caption2', 'label1', 'FIXME1')], \
                'didn\'t merge captions correctly')
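
This test, and the variants in the examples below, pin down the behaviour expected of remove_dups: entries are keyed on the image name (first tuple element), their captions (second element) are joined with ' : ', and any remaining fields are taken from the first occurrence. As a rough illustration only, here is a minimal sketch of that merging logic; remove_dups_sketch is a hypothetical stand-in written for this page, not the plotextractor implementation itself.

def remove_dups_sketch(images_and_captions):
    """Merge tuples that share an image name, joining distinct captions with ' : '."""
    merged = []          # keeps first-seen order
    index_by_image = {}  # image name -> position in merged
    for item in images_and_captions:
        image, caption = item[0], item[1]
        if image not in index_by_image:
            index_by_image[image] = len(merged)
            merged.append(item)
        else:
            pos = index_by_image[image]
            existing = merged[pos]
            # Append the caption only if it is not already part of the merged one.
            if caption not in existing[1]:
                merged[pos] = (image, existing[1] + ' : ' + caption) + existing[2:]
    return merged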
Example #2
    def test_dup_captions(self):
        """plotextractor - remove_dups captions"""
        images_and_captions = [('img1', 'caption1'), ('img1', 'caption1'),
                               ('img1', 'caption2')]

        pared_images_and_captions = remove_dups(images_and_captions)
        assert pared_images_and_captions == [('img1', 'caption1 : caption2')], \
                'didn\'t merge captions correctly'

    def test_no_dups(self):
        """plotextractor - remove_dups no dupes"""
        images_and_captions = [('img1', 'caption1', 'label1', 'FIXME1'),
                               ('img2', 'caption2', 'label1', 'FIXME1')]

        pared_images_and_captions = remove_dups(images_and_captions)
        self.assertTrue(pared_images_and_captions == images_and_captions, 'removed nondup')
Example #4
    def test_dup_captions(self):
        """plotextractor - remove_dups captions"""
        images_and_captions = [('img1', 'caption1', 'label1', 'FIXME1'),
                               ('img1', 'caption1', 'label1', 'FIXME1'),
                               ('img1', 'caption2', 'label1', 'FIXME1')]

        pared_images_and_captions = remove_dups(images_and_captions)
        self.assertTrue(pared_images_and_captions == [('img1', 'caption1 : caption2', 'label1', 'FIXME1')], \
                'didn\'t merge captions correctly')
Example #5
    def test_no_dups(self):
        """plotextractor - remove_dups no dupes"""
        images_and_captions = [('img1', 'caption1', 'label1', 'FIXME1'),
                               ('img2', 'caption2', 'label1', 'FIXME1')]

        pared_images_and_captions = remove_dups(images_and_captions)
        self.assertTrue(pared_images_and_captions == images_and_captions,
                        'removed nondup')
Example #6
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \
                   upload_plots=False, force=False, squash="", \
                   yes_i_know=False, refno_url="", \
                   clean=False):
    """
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  If you're uploading, this directory should be one
        of the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else
        the upload won't work.
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        PDF versions of the tarfiles.  This programme assumes that the PDFs
        are named the same as the tarballs but with a .pdf extension.
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process
    @param: force (boolean): force creation of a new XML file
    @param: squash: write MARCXML output into the specified 'squash' file
        instead of into individual files
    @param: yes_i_know: if True, no user interaction if upload_plots is True
    @param: refno_url: URL of the Invenio instance to query for the refno
    @param: clean: if True, everything except the original tarball, plots and
        context files will be removed

    @return: marc_name (string): path to the generated MARCXML file
    """
    sub_dir, refno = get_defaults(tarball, sdir, refno_url)
    if not squash:
        marc_name = os.path.join(sub_dir, '%s.xml' % (refno,))
        if force or not os.path.exists(marc_name):
            marc_fd = open(marc_name, 'w')
            marc_fd.write('<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
            marc_fd.close()
    else:
        marc_name = squash
    if xtract_text:
        extract_text(tarball)
    try:
        extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
    except Timeout:
        write_message('Timeout during tarball extraction on %s' % (tarball,))
        return
    if tex_files == [] or tex_files is None:
        write_message('%s is not a tarball' % (os.path.split(tarball)[-1],))
        run_shell_command('rm -r %s', (sub_dir,))
        return

    converted_image_list = convert_images(image_list)
    write_message('converted %d of %d images found for %s' % (len(converted_image_list), \
                                                              len(image_list), \
                                                              os.path.basename(tarball)))
    extracted_image_data = []

    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, sub_dir, \
                                                converted_image_list)
        if partly_extracted_image_data != []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(partly_extracted_image_data, \
                                                  tex_file, converted_image_list)
            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend(extract_context(tex_file, cleaned_image_data))
    extracted_image_data = remove_dups(extracted_image_data)
    if extracted_image_data == []:
        write_message('No plots detected in %s' % (refno,))
    else:
        if refno_url == "":
            refno = None
        create_contextfiles(extracted_image_data)
        marc_xml = create_MARC(extracted_image_data, tarball, refno)
        if not squash:
            marc_xml += "\n</collection>"
        if marc_name is not None:
            marc_fd = open(marc_name, 'a')
            marc_fd.write('%s\n' % (marc_xml,))
            marc_fd.close()
            if not squash:
                write_message('generated %s' % (marc_name,))
                if upload_plots:
                    upload_to_site(marc_name, yes_i_know)
    if clean:
        clean_up(extracted_files_list, image_list)
    write_message('work complete on %s' % (os.path.split(tarball)[-1],))
    return marc_name
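
A minimal calling sketch for process_single follows. The tarball path is a placeholder, the keyword values are illustrative, and the snippet assumes process_single is importable in the current environment; it is not taken from the module itself.

tarball_path = '/tmp/arXiv_sample.tar.gz'  # placeholder path, not a real tarball
marc_file = process_single(tarball_path,
                           xtract_text=True,    # also run pdftotext on the matching .pdf
                           upload_plots=False,  # inspect the MARCXML before uploading
                           force=True,
                           refno_url='',        # no Invenio instance to query for a refno
                           clean=True)
if marc_file:
    print('MARCXML written to %s' % (marc_file,))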
Example #7
    def test_no_dups(self):
        """plotextractor - remove_dups no dupes"""
        images_and_captions = [('img1', 'caption1'), ('img2', 'caption2')]

        pared_images_and_captions = remove_dups(images_and_captions)
        assert pared_images_and_captions == images_and_captions, 'removed nondup'

    def test_dup_images(self):
        """plotextractor - remove_dups images"""
        images_and_captions = [('img1', 'caption1', 'label1'),
                               ('img1', 'caption2', 'label1')]

        pared_images_and_captions = remove_dups(images_and_captions)
        self.assertTrue(pared_images_and_captions == [('img1', 'caption1 : caption2', 'label1')], \
                'didn\'t merge captions correctly')
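
These fragments assume they live inside a unittest.TestCase subclass. A minimal harness along the following lines would let one of them run stand-alone; the import path for remove_dups is an assumption and may not match the actual Invenio module layout.

import unittest

from invenio.plotextractor import remove_dups  # assumed import path


class RemoveDupsTest(unittest.TestCase):
    """Stand-alone wrapper around one representative remove_dups check."""

    def test_merges_duplicate_images(self):
        images_and_captions = [('img1', 'caption1', 'label1'),
                               ('img1', 'caption2', 'label1')]
        self.assertEqual(remove_dups(images_and_captions),
                         [('img1', 'caption1 : caption2', 'label1')])


if __name__ == '__main__':
    unittest.main()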