def get_info(pid, words=None, extra_previews=True):
    """Collect ALTO/OCR data and preview metadata for one MediaHaven item.

    Args:
        pid: Identifier handed to ``MediaHaven.get_alto`` and
            ``MediaHaven.get_preview``.
        words: Optional collection of words to search for in the ALTO
            document and to highlight on the preview image.
        extra_previews: When true (and at least one word was given), add
            base64-encoded JPEG renderings of the highlighted preview.

    Returns:
        A dict with the keys ``pid``, ``words`` (number of words given),
        ``alto``, ``alto_link``, ``ocr_text``, ``previewImageUrl``, ``meta``
        and ``props``; plus ``preview_full`` and ``preview`` when the extra
        previews are rendered.
    """
    # NOTE(review): the original was collapsed onto a single line, so the
    # nesting of the `with`/`if` bodies below is reconstructed -- confirm.

    def _jpeg_as_base64(image):
        # Serialise the image to an in-memory JPEG and return it as base64 text.
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=85)
        encoded = base64.b64encode(buffer.getvalue())
        return encoded.decode()

    media_haven = MediaHaven()
    alto_doc = media_haven.get_alto(pid)

    word_count = 0 if words is None else len(words)
    info = {
        'pid': pid,
        'words': word_count,
        'alto': alto_doc.search_words(words),
        'alto_link': alto_doc.url,
        'ocr_text': alto_doc.text,
    }

    with media_haven.get_preview(pid) as preview:
        info['previewImageUrl'] = preview.meta['previewImagePath']
        info['meta'] = preview.meta
        if word_count > 0 and extra_previews:
            uncropped = preview.highlight_words(words, crop=False)
            info['preview_full'] = _jpeg_as_base64(uncropped)
            highlighted = preview.highlight_words(words)
            info['preview'] = _jpeg_as_base64(highlighted)
        info['props'] = preview.meta['mdProperties']

    return info
class Importer:
    """Push MediaHaven items into Solr as ``id``/``text``/``language`` documents."""

    def __init__(self):
        """Create the Solr client (URL from the ``solr`` config section) and the MediaHaven client."""
        self._solr = Solr(Config(section='solr')['url'])
        self._mh = MediaHaven(buffer_size=100)

    def add(self, item):
        """Send a single document to Solr."""
        self._solr.add([item])

    def process(self, item):
        """Build a Solr document for ``item`` and add it to the index.

        Args:
            item: Either a pid string, which is looked up in MediaHaven by
                ``externalId``, or an already fetched item that exposes
                ``item['externalId']``.

        Raises:
            Exception: If ``item`` is ``None``.
            ValueError: If no (non-empty) pid can be determined for ``item``.
        """
        if item is None:
            raise Exception("Invalid item passed (None)")

        given_pid = isinstance(item, str)
        pid = item if given_pid else item['externalId']

        # Validate before querying MediaHaven so that an empty pid never
        # results in a lookup. The original raised a bare string here, which
        # is a TypeError in Python 3 and hid the actual message.
        if not pid:
            raise ValueError("No pid for item %s" % (item, ))

        if given_pid:
            item = self._mh.one('+(externalId:%s)' % pid)

        language = ''
        try:
            language = item['mdProperties']['language'][0].lower()
        except Exception as e:
            # Best effort: a missing or malformed language must not stop the
            # import, the document is indexed with an empty language instead.
            logger.warning('no language found for %s', pid)
            logger.exception(e)

        alto = self._mh.get_alto(item)
        if not alto:
            logger.debug("no alto for pid '%s' ", pid)
            text = ''
        else:
            text = Conversions.normalize(alto.text)

        self.add(dict(id=pid, text=text, language=language))