Exemplo n.º 1
0
 def _purge_batch(self, batch):
     """Delete a batch together with its issues, pages and derived data.

     Each page of each issue in ``batch`` is deleted individually and the
     page's word-coordinates file (located via ``models.coordinates_path``)
     is removed from disk when it is present. Issues and finally the batch
     itself are then deleted. When ``self.PROCESS_OCR`` is truthy, the
     documents matching the batch name are also deleted from Solr and the
     deletion is committed.

     :param batch: the batch to purge; it must provide ``name``,
         ``issues.all()`` and ``delete()``.
     :raises OSError: if a coordinates file exists but cannot be removed.
     """
     import errno

     batch_name = batch.name
     # just delete batch causes memory to bloat out
     # so we do it piece-meal
     for issue in batch.issues.all():
         for page in issue.pages.all():
             page.delete()
             # remove coordinates: attempt the removal directly instead of
             # checking for existence first, so the path is computed once
             # and a file vanishing between check and removal is harmless
             coords_path = models.coordinates_path(page._url_parts())
             try:
                 os.remove(coords_path)
             except OSError as exc:
                 # a missing file (or missing parent directory) is what
                 # the old existence check silently skipped; anything
                 # else is a real failure and must surface
                 if exc.errno not in (errno.ENOENT, errno.ENOTDIR):
                     raise
         issue.delete()
     batch.delete()
     if self.PROCESS_OCR:
         self.solr.delete_query('batch:"%s"' % batch_name)
         self.solr.commit()
Exemplo n.º 2
0
    def _process_coordinates(self, page, coords):
        """Write the word coordinates of ``page`` to its coordinates file.

        ``coords`` is serialised with ``json.dumps``, passed through
        ``gzip_compress`` and written to the path that
        ``models.coordinates_path`` returns for the page's URL parts.

        :param page: page object providing ``url`` and ``_url_parts()``.
        :param coords: JSON-serialisable coordinates data.
        """
        _logger.debug("writing out word coords for %s", page.url)

        # The payload is compressed binary data, so open the file in binary
        # mode; the context manager closes the handle even if the write fails.
        with open(models.coordinates_path(page._url_parts()), "wb") as f:
            f.write(gzip_compress(json.dumps(coords)))
Exemplo n.º 3
0
    def _process_coordinates(self, page, coords):
        """Persist the word coordinates for ``page`` as a compressed file.

        The coordinates are dumped to JSON, compressed with
        ``gzip_compress`` and stored at the location given by
        ``models.coordinates_path`` for the page's URL parts.

        :param page: page object providing ``url`` and ``_url_parts()``.
        :param coords: JSON-serialisable coordinates data.
        """
        _logger.debug("writing out word coords for %s", page.url)

        target_path = models.coordinates_path(page._url_parts())
        # Binary mode because the payload is compressed bytes; ``with``
        # guarantees the handle is released if the write raises.
        with open(target_path, "wb") as f:
            f.write(gzip_compress(json.dumps(coords)))
Exemplo n.º 4
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Return a page's word coordinates as a JSON response.

    The file located by ``models.coordinates_path`` is read through
    ``gzip.open`` and its contents are sent with the ``application/json``
    content type. The response is passed through ``add_cache_tag`` with an
    ``lccn=<lccn>`` tag before being returned.

    Raises ``Http404`` (after logging a warning) when reading the file
    raises ``IOError``. ``request`` and ``words`` are not used here.
    """
    file_path = models.coordinates_path({
        'lccn': lccn,
        'date': date,
        'edition': edition,
        'sequence': sequence,
    })

    try:
        with gzip.open(file_path, 'rb') as compressed:
            payload = compressed.read()
            response = HttpResponse(payload, content_type='application/json')
            return add_cache_tag(response, "lccn=%s" % lccn)
    except IOError:
        LOGGER.warning('Word coordinates file %s does not exist', file_path)
        raise Http404
Exemplo n.º 5
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Serve the word-coordinates JSON for the page identified by the URL.

    Builds the URL parts from ``lccn``, ``date``, ``edition`` and
    ``sequence``, reads the matching file via ``gzip.open`` and returns its
    contents as ``application/json``, tagged through ``add_cache_tag`` with
    ``lccn=<lccn>``. An ``IOError`` while reading is logged as a warning
    and turned into ``Http404``.
    """
    parts = dict(sequence=sequence, edition=edition, date=date, lccn=lccn)
    file_path = models.coordinates_path(parts)

    try:
        with gzip.open(file_path, 'rb') as gz_file:
            json_response = HttpResponse(
                gz_file.read(),
                content_type='application/json',
            )
            cache_tag = "lccn=%s" % lccn
            return add_cache_tag(json_response, cache_tag)
    except IOError:
        LOGGER.warning('Word coordinates file %s does not exist', file_path)
        raise Http404
Exemplo n.º 6
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Serve a page's stored coordinates file without decompressing it.

    The raw bytes of the file located by ``models.coordinates_path`` are
    sent as ``application/json`` with a ``Content-Encoding: gzip`` header
    and a matching ``Content-Length``. If the file cannot be opened, an
    empty ``HttpResponse`` is returned. ``request`` and ``words`` are not
    used here.
    """
    url_parts = dict(lccn=lccn, date=date, edition=edition, sequence=sequence)
    try:
        # binary mode: the bytes are forwarded untouched as gzip content
        f = open(models.coordinates_path(url_parts), 'rb')
    except IOError:
        return HttpResponse()
    # only the open() failure yields the empty response; the context
    # manager makes sure the handle is closed even if read() raises
    with f:
        data = f.read()

    # ``content_type`` replaces the ``mimetype`` keyword, which newer Django
    # releases no longer accept
    r = HttpResponse(content_type='application/json')
    r['Content-Encoding'] = 'gzip'
    r['Content-Length'] = len(data)
    r.write(data)
    return r
Exemplo n.º 7
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Return the stored coordinates file for a page as gzip-encoded JSON.

    The file found via ``models.coordinates_path`` is read verbatim and
    written to an ``application/json`` response whose ``Content-Encoding``
    is ``gzip`` and whose ``Content-Length`` is the number of bytes read.
    When opening the file raises ``IOError``, an empty ``HttpResponse`` is
    returned instead.
    """
    url_parts = dict(lccn=lccn, date=date, edition=edition, sequence=sequence)
    try:
        # read as bytes; the payload is passed through unchanged
        f = open(models.coordinates_path(url_parts), 'rb')
    except IOError:
        return HttpResponse()
    # close the handle even when read() fails, without widening the
    # IOError handling beyond the open() call
    with f:
        data = f.read()

    # use ``content_type``; the older ``mimetype`` keyword has been removed
    # from Django's HttpResponse
    r = HttpResponse(content_type='application/json')
    r['Content-Encoding'] = 'gzip'
    r['Content-Length'] = len(data)
    r.write(data)
    return r
Exemplo n.º 8
0
    def _process_coordinates(self, page, coords):
        """Write the compressed word coordinates of ``page`` to disk.

        The data is first written to a temporary file created in
        ``settings.TEMP_STORAGE`` and then moved to the path returned by
        ``models.coordinates_path``. If the move fails, a warning is logged
        and the move is retried once after five seconds; a second failure
        propagates to the caller.

        :param page: page object providing ``url`` and ``_url_parts()``.
        :param coords: JSON-serialisable coordinates data.
        """
        LOGGER.debug("writing out word coords for %s", page.url)

        # get a temp file in case the coordinates dir is a NFS or S3 mount
        # which have poor multiple write performance
        fd, path = tempfile.mkstemp(suffix=".coordinates", dir=settings.TEMP_STORAGE)
        # Write through the descriptor mkstemp already opened instead of
        # opening the path a second time; binary mode because the payload is
        # compressed bytes, and ``with`` closes the descriptor on any error.
        with os.fdopen(fd, "wb") as f:
            f.write(gzip_compress(json.dumps(coords)))
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            LOGGER.warning("Could not move coordinates to [%s]. Waiting 5 seconds and trying again in case of network mount", final_path)
            time.sleep(5)
            shutil.move(path, final_path)
Exemplo n.º 9
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Return the word coordinates of a page as JSON.

    The file given by ``models.coordinates_path`` for the supplied URL parts
    is read through ``gzip.open`` and its contents are returned with the
    ``application/json`` content type. If reading raises ``IOError``, a
    warning is logged and ``Http404`` is raised.
    """
    file_path = models.coordinates_path(
        dict(lccn=lccn, date=date, edition=edition, sequence=sequence)
    )

    try:
        with gzip.open(file_path, "rb") as gz_file:
            body = gz_file.read()
            return HttpResponse(body, content_type="application/json")
    except IOError:
        LOGGER.warning("Word coordinates file %s does not exist", file_path)
        raise Http404
Exemplo n.º 10
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Return a page's word coordinates as JSON with cleaned-up word keys.

    The gzip file located by ``models.coordinates_path`` is parsed as JSON.
    Every key of its ``coords`` mapping has leading and trailing
    non-alphanumeric characters and a trailing ``'s`` stripped; the other
    top-level entries are returned unchanged. If the file cannot be opened,
    an empty ``HttpResponse`` is returned.
    """
    url_parts = dict(lccn=lccn, date=date, edition=edition, sequence=sequence)
    try:
        file_data = gzip.open(models.coordinates_path(url_parts), 'rb')
    except IOError:
        return HttpResponse()

    # always release the file handle, even when the JSON cannot be parsed
    try:
        data = json.load(file_data)
    finally:
        file_data.close()

    non_lexemes = re.compile('''^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$|'s$''')
    return_coords = data.copy()
    # reset coords to {} and build afresh, getting rid of unwanted punctuations
    return_coords['coords'] = {}
    # tolerate a file without a "coords" entry instead of failing on None
    # NOTE(review): keys that become identical after stripping overwrite one
    # another (last one wins) -- confirm whether they should be merged
    for key, value in (data.get('coords') or {}).items():
        return_coords['coords'][non_lexemes.sub('', key)] = value

    # ``content_type`` replaces the ``mimetype`` keyword, which newer Django
    # releases no longer accept
    r = HttpResponse(content_type='application/json')
    r.write(json.dumps(return_coords))
    return r
Exemplo n.º 11
0
def coordinates(request, lccn, date, edition, sequence, words=None):
    """Return a page's word coordinates as JSON, normalising the word keys.

    The gzip file found through ``models.coordinates_path`` is loaded as
    JSON. A copy of it is returned in which each key of the ``coords``
    mapping has leading/trailing non-alphanumeric characters and a trailing
    ``'s`` removed. When the file cannot be opened, an empty
    ``HttpResponse`` is returned.
    """
    url_parts = dict(lccn=lccn, date=date, edition=edition, sequence=sequence)
    try:
        file_data = gzip.open(models.coordinates_path(url_parts), 'rb')
    except IOError:
        return HttpResponse()

    # close the gzip handle whether or not parsing succeeds
    try:
        data = json.load(file_data)
    finally:
        file_data.close()

    non_lexemes = re.compile('''^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$|'s$''')
    return_coords = data.copy()
    # reset coords to {} and build afresh, getting rid of unwanted punctuations
    cleaned = return_coords['coords'] = {}
    # a missing "coords" entry yields an empty mapping rather than a TypeError
    # NOTE(review): distinct keys that normalise to the same word overwrite
    # each other (last one wins) -- confirm this is intended
    for key, value in (data.get('coords') or {}).items():
        cleaned[non_lexemes.sub('', key)] = value

    r = HttpResponse(content_type='application/json')
    r.write(json.dumps(return_coords))
    return r
Exemplo n.º 12
0
    def _process_coordinates(self, page, coords):
        """Store the compressed word coordinates of ``page`` on disk.

        The JSON-serialised, ``gzip_compress``-ed data is written to a
        temporary file in ``settings.TEMP_STORAGE`` and then moved to the
        path returned by ``models.coordinates_path``. A failed move is logged
        and retried once after five seconds; if the retry also fails, its
        exception propagates.

        :param page: page object providing ``url`` and ``_url_parts()``.
        :param coords: JSON-serialisable coordinates data.
        """
        LOGGER.debug("writing out word coords for %s", page.url)

        # We'll use a temporary file in case the coordinates dir is configured
        # to a network filesystem which has poor update performance
        # characteristics
        fd, path = tempfile.mkstemp(suffix=".coordinates",
                                    dir=settings.TEMP_STORAGE)
        # Reuse the descriptor returned by mkstemp rather than opening the
        # path again; binary mode suits the compressed payload and the
        # context manager closes the descriptor even if the write fails.
        with os.fdopen(fd, "wb") as f:
            f.write(gzip_compress(json.dumps(coords)))
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            LOGGER.warning(
                'Could not move coordinates to "%s". Waiting 5 seconds before trying again…',
                final_path)
            time.sleep(5)
            shutil.move(path, final_path)