def solr_index(self):
    """Push this object's Solr document (``self.solr_doc``) to the index."""
    # one-shot, non-persistent connection: add the document, then commit
    connection = SolrConnection(settings.SOLR_URL, persistent=False)
    connection.add(**self.solr_doc)
    connection.commit()
def index_evidence(evidence):
    """Index an evidence record (and its related medicine names) in Solr.

    Collects the distinct medicine names linked to *evidence* through
    MedicineEvidenceSummary, then sends a single "evidence" document to the
    Solr server configured in ``settings.SOLR_URL``.

    Returns True on success, False if any Solr operation fails (errors are
    deliberately swallowed so indexing stays best-effort).
    """
    # distinct medicine names associated with this evidence record
    evidence_medicine_list = []
    evidence_medicine = MedicineEvidenceSummary.objects.filter(evidence=evidence.id)
    for evimed in evidence_medicine:
        if evimed.medicine.name not in evidence_medicine_list:
            evidence_medicine_list.append(evimed.medicine.name)

    # try to create a connection to a solr server and send the evidence doc
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id="evidence-%s-%s" % (evidence.language, evidence.id),
            type="evidence",
            title=evidence.title,
            description=evidence.description,
            context=evidence.context,
            question=evidence.question,
            link=evidence.link,
            file=evidence.file,
            language=evidence.language,
            evidence_medicine=evidence_medicine_list,
        )
        # commit return value was never used; drop the dead binding
        solr.commit()
    except Exception:
        # best-effort by design: signal failure to the caller instead of raising
        return False
    return True
def index_title(title, solr=None):
    """Index a single title's Solr document.

    A SolrConnection is created from ``settings.SOLR`` when *solr* is not
    supplied. Indexing failures are logged, never raised.
    """
    # `is None` (identity) instead of `== None` (equality) per PEP 8
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    # lazy %-args so formatting only happens if the record is emitted
    _log.info("indexing title: lccn=%s", title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:  # was Py2-only `except Exception, e` syntax
        _log.exception(e)
def index_title(title, solr=None):
    """Index a single title's Solr document.

    A SolrConnection is created from ``settings.SOLR`` when *solr* is not
    supplied. Indexing failures are logged, never raised.
    """
    # `is None` (identity) instead of `== None` (equality) per PEP 8
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    # lazy %-args so formatting only happens if the record is emitted
    _log.info("indexing title: lccn=%s", title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        _log.exception(e)
def index_title(title, solr=None):
    """Send the Solr document for *title* to the index; failures are logged."""
    # reuse the caller's connection when one was provided
    conn = solr if solr is not None else SolrConnection(settings.SOLR)
    LOGGER.info("indexing title: lccn=%s", title.lccn)
    try:
        conn.add(**title.solr_doc)
    except Exception as err:
        LOGGER.exception(err)
def index_title(title, solr=None):
    """Index one title document into Solr; errors are logged, not raised."""
    # fall back to a fresh connection only when none was passed in
    conn = SolrConnection(settings.SOLR) if solr is None else solr
    LOGGER.debug("indexing title: lccn=%s", title.lccn)
    try:
        conn.add(**title.solr_doc)
    except Exception:
        LOGGER.exception("Unable to index title %s", title)
def _refresh(field=None, data=None, path = None, isCron = None): from solr import SolrConnection from ID3 import * s = SolrConnection(SOLR_URL) if path and path != '*': #called by user pathsArr = path.split(',') else: #called from cron pathsArr = folderpaths matches = [] #handles modify, add #deletion will be handled in search when file in solr but not in path time.time() for path in pathsArr: for root, dirnames, filenames in os.walk(path): for extension in ['txt', 'log', 'py', 'pl', 'sql', 'mp3']: for filename in fnmatch.filter(filenames, '*.' + extension): fullName = os.path.join(root, filename) if os.path.getsize(fullName) > 8800000: continue #print fullName if not isCron or (time.time() - os.path.getmtime(fullName) < 24*60*60): try: #data = open(fullName, 'r').read().decode('raw_unicode_escape').replace('\n',' ').replace('\t',' ') if filename.endswith(('.txt', '.log', '.py', '.pl', '.sql')): data = open(fullName, 'r').read() data = filterTxt(data) else: audiofile = ID3(fullName) audiofilekeys = audiofile.keys() if 'TITLE' in audiofilekeys: data = audiofile['TITLE'] + " " if 'ARTIST' in audiofilekeys: data += audiofile['ARTIST'] + " " if 'ALBUM' in audiofilekeys: data += audiofile['ALBUM'] + " " if not data: data = '' data = data.strip() fullName = filterTxt(fullName) filename = filterTxt(filename) s.add(id = fullName, name = filename, txt = data) s.commit() except: pass #print data #print traceback.format_exc() #print fullName #sys.exit() gc.collect()
def index_missing_pages():
    """
    index all pages that are missing from solr in the database
    """
    solr = SolrConnection(settings.SOLR)
    pages = models.Page.objects.filter(indexed=False).all()
    number_of_pages = len(pages)
    # start=1 so progress reads "[1 of N]" .. "[N of N]" — the original
    # incremented after logging and reported "[0 of N]" for the first page
    for count, page in enumerate(pages, start=1):
        LOGGER.info("[%s of %s] indexing page: %s", count, number_of_pages, page.url)
        solr.add(**page.solr_doc)
        # mark as indexed so the page is not picked up again next run
        page.indexed = True
        page.save()
    # single commit after the batch of adds
    solr.commit()
def build_index(**kwargs):
    """
    gets product/sku information from cps DB and indexes them in solr
    existing solr index is wiped before indexing. Revisit if this strategy
    does not work
    """
    # index status log message granularity
    log_index_status_chunks = 25000
    solr = SolrConnection(settings.SOLR)
    # wipe-and-rebuild strategy: clear everything before re-adding
    clear_index(solr=solr)
    count = 0
    # order must match the column order returned by db.get_cps_data()
    fieldnames = (
        "name",
        "id",
        "description",
        "long_description",
        "age",
        "gender",
        "brand",
        "str_brand",
        "merchant",
        "str_merchant",
        "category",
        "str_category",
        "price",
        "sale_price",
        "buy_url",
        "image",
    )
    start = datetime.now()
    log.info("Reading product info from the database.....")
    products = db.get_cps_data()
    log.info("Building SOLR index.....")
    for product in products:
        try:
            product_record = dict(zip(fieldnames, product))
            solr.add(**product_record)
            count += 1
        except Exception as e:  # was Py2-only `except Exception, e` syntax
            # one bad row must not stop the whole rebuild
            log.exception(e)
            continue
        # periodic progress report; lazy %-args avoid eager formatting
        if count % log_index_status_chunks == 0:
            log.info("Indexed %d products in %s", count, datetime.now() - start)
def index_pages():
    """index all the pages that are modeled in the database """
    solr = SolrConnection(settings.SOLR)
    # drop every existing page document before re-indexing from scratch
    solr.delete_query('type:page')
    cursor = connection.cursor()
    cursor.execute("SELECT id FROM core_page")
    count = 0
    # iter() with a sentinel walks the cursor until fetchone() returns None
    for row in iter(cursor.fetchone, None):
        page = models.Page.objects.get(id=row[0])
        LOGGER.info("[%s] indexing page: %s", count, page.url)
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            # flush Django's debug query log periodically to cap memory use
            reset_queries()
    solr.commit()
def index_pages():
    """index all the pages that are modeled in the database """
    _log = logging.getLogger(__name__)
    solr = SolrConnection(settings.SOLR)
    cursor = connection.cursor()
    # only pages that actually have OCR text are indexed
    cursor.execute(
        "SELECT id FROM core_page WHERE ocr_filename IS NOT NULL AND ocr_filename <> ''"
    )
    count = 0
    while True:
        row = cursor.fetchone()
        # `is None` (identity) instead of `== None` (equality) per PEP 8
        if row is None:
            break
        page = models.Page.objects.get(id=row[0])
        # lazy %-args instead of eager `%` formatting in the log call
        _log.info("[%s] indexing page: %s", count, page.url)
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            # flush Django's debug query log periodically to cap memory use
            reset_queries()
    solr.commit()
"uuid": [u"78755d851f9a453b84a51b1c00c68553"], "depositor": "zool0982" # 'identifier': ['fri_day1'], # 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/fri_day1/df_manifest.rdf'], # 'mediator': ['admin'], # 'text': ['', 'http://vocab.ox.ac.uk/projectfunding#', '', 'seeking_approval', '', '', 'yes', '', ''], # 'depositor': ['zool0982'], # 'embargoedUntilDate': ['2083-06-21T14:08:45Z'], # 'alternative': ['fri_day1'], # 'subject': [''], # 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], # 'publisher': ['Bodleian Libraries, University of Oxford'], # 'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], # 'language': [''], # 'title': ['fri_day1'], # 'embargoStatus': ['True'], # 'description': [''], # 'format': [''], # 'modified': ['2013-06-21 14:08:45.525602'], # 'currentVersion': ['2'], # 'created': ['2013-06-21 14:08:45.253033'], # 'issued': [''], # 'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet'] } # solr_doc = {'identifier': ['fri_day1'], 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/fri_day1/df_manifest.rdf'], 'mediator': ['admin'], 'text': ['', '', '', 'http://vocab.ox.ac.uk/projectfunding#', '', 'yes', '', 'seeking_approval', ''], 'depositor': ['zool0982'], 'embargoedUntilDate': ['2083-06-21T14:08:45Z'], 'alternative': ['fri_day1'], 'subject': [''], 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], 'publisher': ['', 'Bodleian Libraries, University of Oxford'], 'license': ['CC0 1.0 Universal (CC0 1.0). 
See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], 'uuid': [u'4fb84512bfaf4927945ea3c241bf21c0'], 'language': [''], 'title': ['fri_day1'], 'embargoStatus': ['True'], 'description': [''], 'format': [''], 'modified': ['2013-06-21 14:08:45.525602'], 'id': ['fri_day1'], 'currentVersion': ['2'], 'created': ['2013-06-21 14:08:45.253033'], 'issued': [''], 'silo': ['DataFinder'], 'type': ['http://vocab.ox.ac.uk/dataset/schema#DataSet', '']} # solr_doc = {'identifier': ['mond_ay2'], 'aggregatedResource': ['http://datafinder-d2v.bodleian.ox.ac.uk/DataFinder/datasets/mond_ay2/df_manifest.rdf'], 'mediator': ['admin'], 'text': ['', 'http://vocab.ox.ac.uk/projectfunding#', '', 'seeking_approval', '', 'yes', '', '', ''], 'depositor': 'zool0982', 'alternative': ['mond_ay2'], 'embargoedUntilDate': ['2083-06-24T03:41:53Z'], 'subject': [''], 'rights': ['http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c'], 'publisher': ['Bodleian Libraries, University of Oxford'], 'license': ['CC0 1.0 Universal (CC0 1.0). See http://creativecommons.org/publicdomain/zero/1.0/legalcode'], 'uuid': [u'78755d851f9a453b84a51b1c00c68553'], 'language': [''], 'title': ['mond_ay2'], 'embargoStatus': ['True'], 'description': ['mond_ay2'], 'format': [''], 'modified': ['2013-06-24 03:41:53.988847'], 'id': ['mond_ay2'], 'currentVersion': ['2'], 'created': ['2013-06-24 03:41:53.618090'], 'issued': [''], 'silo': ['DataFinder'], 'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet']} # print repr(solr_doc) solr.add(_commit=True, **solr_doc) solr.commit()
# 'modified': ['2013-06-21 08:56:50.308522'], # 'created': ['2013-06-21 08:56:50.049645'], # 'currentVersion': ['2'], # 'issued': [''], # 'silo': ['DataFinder'], # 'type': ['', 'http://vocab.ox.ac.uk/dataset/schema#DataSet'] # } # Get the df dictionary items to be added into solr df_solr_doc = gather_document(silo , uuid , item_id, df_graph ) (resp,respdata) = datastore.doHTTP_GET(resource="/" + silo +"/datasets/" + item_id +"/manifest.rdf") text_file = open("sample_redis_main_manifest.rdf", "w") text_file.write(respdata) text_file.close() main_graph=rdflib.Graph() try: with open("sample_redis_main_manifest.rdf", 'r') as f: main_graph.parse(f, base="sample_redis_manifest.rdf") except IOError, e: pass # Get the main dictionary items to be added into solr main_solr_doc = gather_document(silo , uuid , item_id, main_graph ) # Add the two dictionaries together main_solr_doc.update(df_solr_doc) print main_solr_doc solr_doc = solr.add(_commit=True, **(main_solr_doc)) solr.commit()
# Get rdf graph from manifest for dataset graph = None tries = 0 while tries < 5: response = db.getFile(silo_name, itemid, "manifest.rdf") if db.good(response): manifest = response.results graph = ConjunctiveGraph() graph.parse(StringIO(manifest), "xml") break else: tries += 1 if state_info and graph: solr_doc = gather_document(silo_name, itemid, graph, state_info, debug=True) try: solr.add(_commit=False, **solr_doc) except Exception, e: logger.error("Error adding document to solr id:%s in silo:%s\n" % (itemid, silo_name)) try: logger.error("%s\n\n" % str(e)) except: pass rq.task_failed() continue else: logger.error("Error gathering state and manifest info for id:%s in silo:%s\n" % (itemid, silo_name)) rq.task_failed() continue else: solr_doc = {"id": silo_name, "silo": silo_name, "type": "Silo", "uuid": silo_name} # Get state infor for silo
main_graph.parse(f, base="temp_main_manifest.rdf") f.close() except IOError, e: logger.info( str(e)) logger.info("IOERROR") pass main_solr_doc = gather_document(silo_name, uuid, itemid, main_graph) # Add the two dictionaries together main_solr_doc.update(df_solr_doc) logger.info('Solr_document = ') logger.info(main_solr_doc) try: solr.add(_commit=False, **main_solr_doc) except Exception, e : logger.error("Error adding document to solr id:%s in silo:%s\n" % (itemid, silo_name)) try: logger.error("%s\n\n" %str(e)) except: pass rq.task_failed() continue # else: # silo_metadata = g.describe_silo(silo_name) # solr_doc = {'id':silo_name, 'silo':silo_name, 'type':'Silo', 'uuid':uuid4().hex} # solr_doc['title'] = '' # if 'title' in silo_metadata: # solr_doc['title'] = silo_metadata['title'] # solr_doc['description'] = ''
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        # PROCESS_OCR / PROCESS_COORDINATES gate the expensive OCR-extraction
        # and word-coordinate steps during load and purge
        self.PROCESS_OCR = process_ocr
        # shared Solr connection reused for page indexing and batch purges
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """
        TODO: Who can we toss the requirement at to make this available in a
        canonical location?

        Probe the batch's storage URL for the batch manifest under each of
        its known filename aliases; return the first alias that resolves.
        Raises BatchLoaderException when none of the aliases can be fetched.
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in [
            "batch_1.xml",
            "BATCH_1.xml",
            "batchfile_1.xml",
            "batch_2.xml",
            "BATCH_2.xml",
            "batch.xml",
        ]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urlparse.urljoin(batch.storage_url, alias)
            try:
                # existence probe only; the response body is not used
                urllib2.urlopen(url)
                validated_batch_file = alias
                break
            except (urllib2.HTTPError, urllib2.URLError):
                continue
        else:
            # for/else: no alias resolved
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?"
                % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        # record which manifest alias exists so later steps can fetch it
        # if not os.path.exists(batch.path):
        #     raise BatchLoaderException("batch does not exist at %s" % batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        With strict=False an already-loaded batch is returned as-is instead
        of raising. On any load failure the partially loaded batch is purged
        and BatchLoaderException is raised.
        """
        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            # local filesystem path: expose it under BATCH_STORAGE via symlink
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            # bare batch name: treat BATCH_STORAGE as a (possibly remote) base URL
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass
        LOGGER.info("loading batch: %s", batch_name)
        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()
        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            # make sure all reels from the manifest exist in the database
            for e in doc.xpath("ndnp:reel", namespaces=ns):
                reel_number = e.attrib["reelNumber"].strip()
                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath("ndnp:issue", namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    # a bad issue is skipped; the rest of the batch still loads
                    LOGGER.exception("Unable to load issue from %s", mets_url)
                    continue

                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s",
                                issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()

            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            # roll back: remove whatever partially loaded before re-raising
            try:
                self.purge_batch(batch_name)
            except Exception:
                LOGGER.exception(
                    "Unable to purge batch %s after loading failed", batch_name)
            raise BatchLoaderException(msg)
        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()
        # loading a batch changes the site-wide newspaper summary
        cache.delete("newspaper_info")
        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        """Fetch the Batch row by name, optionally creating it first."""
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        """Create and save a new Batch row, resolving its awardee from the
        org code embedded in the batch name. Raises BatchLoaderException if
        the batch already exists or the awardee is unknown."""
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            # names look like [batch_]<org_code>_<name>_<version>
            parts = batch_name.split("_", 3)
            if len(parts) == 4:
                parts = parts[1:]
            awardee_org_code, name_part, version = parts
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            LOGGER.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        """Parse one issue METS file, create the Issue (and its notes and
        pages) in the database, and return (issue, pages)."""
        LOGGER.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib["DMDID"]
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(
            mods.xpath(
                'string(.//mods:detail[@type="edition"]/mods:number[1])',
                namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath("string(.//mods:dateIssued)", namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, "%Y-%m-%d")

        # attach the Issue to the appropriate Title
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        # NOTE(review): `title` is unbound at this point — this almost
        # certainly should be `Title.DoesNotExist`; as written a missing
        # title raises NameError in the handler. TODO confirm and fix.
        except title.DoesNotExist:
            # title not in the db yet: pull its MARC record and load it
            url = "https://chroniclingamerica.loc.gov/lccn/%s/marc.xml" % lccn
            LOGGER.info("attempting to load MARC record from %s", url)
            management.call_command("load_titles", url)
            title = Title.objects.get(lccn=lccn)

        issue.title = title
        issue.batch = self.current_batch
        issue.save()
        LOGGER.debug("saved issue: %s", issue.url)

        notes = []
        for mods_note in mods.xpath(".//mods:note", namespaces=ns):
            type = mods_note.xpath("string(./@type)")
            label = mods_note.xpath("string(./@displayLabel)")
            text = mods_note.xpath("string(.)")
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes = notes
        issue.save()

        # attach pages: lots of logging because it's expensive
        pages = []
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                pages.append(self._load_page(doc, page_div, issue))
            except BatchLoaderException:
                # one broken page is logged and skipped, not fatal
                LOGGER.exception(
                    "Failed to load page. doc: %s, page div: %s, issue: %s",
                    doc, page_div, issue)

        return issue, pages

    def _load_page(self, doc, div, issue):
        """Build and save one Page from its METS page div: sequence, reel,
        section label, notes, and the tiff/jp2/pdf/ocr filenames; runs OCR
        processing when enabled. Returns the saved Page."""
        dmdid = div.attrib["DMDID"]
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath("string(.//mods:extent/mods:start)",
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'"
                % seq_string)
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            if reel_number:
                # reel was not in the batch manifest; create it implicitly
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                LOGGER.warning("unable to find reel number in page metadata")

        LOGGER.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        LOGGER.info("Saving page. issue date: %s, page sequence: %s",
                    issue.date_issued, page.sequence)

        # TODO - consider the possibility of executing the file name
        # assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath(".//mods:note", namespaces=ns):
            type = mods_note.xpath("string(./@type)")
            label = mods_note.xpath("string(./@displayLabel)")
            text = mods_note.xpath("string(.)").strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes = notes

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.
        for fptr in div.xpath("./mets:fptr", namespaces=ns):
            file_id = fptr.attrib["FILEID"]
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib["USE"]

            # get the filename relative to the storage location
            file_name = file_el.xpath("string(./mets:FLocat/@xlink:href)",
                                      namespaces=ns)
            file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == "master":
                page.tiff_filename = file_name
            elif file_type == "service":
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib["ADMID"].split(" "):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    LOGGER.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...",
                        page.issue,
                        page,
                    )
                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s"
                        % (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s"
                        % (page.issue, page))
            elif file_type == "derivative":
                page.pdf_filename = file_name
            elif file_type == "ocr":
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                page = self.process_ocr(page)
        else:
            LOGGER.info("No ocr filename for issue: %s page: %s",
                        page.issue, page)

        LOGGER.debug("saving page: %s", page.url)
        page.save()
        return page

    def process_ocr(self, page):
        """Extract OCR text (and optionally word coordinates) for *page*,
        create its OCR record and per-language texts, and stash the text on
        the page for Solr indexing. Returns the updated page (unsaved to
        Solr; the caller indexes it)."""
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)
        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        lang_text_solr = {}
        # NOTE: iteritems() means this code path is Python 2 only
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warning(
                    "Language %s does not exist in the database. Defaulting to English.",
                    lang)
                # default to english as per requirement
                language = models.Language.objects.get(code="eng")
            ocr.language_texts.create(language=language)
            lang_text_solr[language.code] = text

        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page

    def _process_coordinates(self, page, coords):
        """Write the page's word-coordinate data (gzipped JSON) to its final
        location, staging through a temp file."""
        LOGGER.debug("writing out word coords for %s", page.url)

        # We'll use a temporary file in case the coordinates dir is configured
        # to a network filesystem which has poor update performance
        # characteristics
        # NOTE(review): mkstemp's `text` parameter expects a bool; passing
        # "w" is merely truthy — TODO confirm intent.
        fd, path = tempfile.mkstemp(text="w",
                                    suffix=".coordinates",
                                    dir=settings.TEMP_STORAGE)
        f = open(path, "w")
        f.write(gzip_compress(json.dumps(coords)))
        f.close()
        os.close(fd)
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            # transient network-fs failures: retry once after a short pause
            LOGGER.warning(
                'Could not move coordinates to "%s". Waiting 5 seconds before trying again…',
                final_path)
            time.sleep(5)
            shutil.move(path, final_path)

    def process_coordinates(self, batch_path):
        """Re-extract and write word coordinates for every page of an
        already-loaded batch (no database or Solr changes)."""
        LOGGER.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        LOGGER.warning(
                            "Batch [%s] page [%s] has no OCR; skipping coordinates processing",
                            batch_name,
                            page,
                        )
                    else:
                        url = urlparse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                        LOGGER.debug("Extracting OCR from url %s", url)
                        lang_text, coords = ocr_extractor(url)
                        self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            LOGGER.exception(msg)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the
        db
        """
        rel_path = path.replace(self.current_batch.storage_url, "")
        return rel_path

    @transaction.atomic
    def purge_batch(self, batch_name):
        """Delete a loaded batch — its issues, pages, coordinate files and
        Solr documents — recording start/finish LoadBatchEvents. A missing
        batch is logged and ignored; other failures raise
        BatchLoaderException."""
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name)
        except Batch.DoesNotExist:
            LOGGER.info("Batch %s does not exist", batch_name)
            return
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()
        try:
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        """Delete the batch's rows and coordinate files page-by-page, then
        remove its documents from Solr (when OCR processing is enabled)."""
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.prefetch_related("pages__issue",
                                                   "pages__issue__title"):
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
def solr_index(med):
    """Synchronise a single medicine with the Solr index.

    Inactive medicines are deleted from the index. Active ones are
    (re)added with their name translations, pharmaceutical forms,
    section/category tree, associated lists/countries, observation flags
    and an evidence marker. Returns True on success, False when any Solr
    operation fails (errors are deliberately swallowed).
    """
    lists = []
    countries = []
    # NOTE(review): `sections` and `subsections` are never used below
    sections = []
    subsections = []
    pharma_form_list = []
    pharma_form_type_list = []
    category_list = []
    observation_list = []

    # if medicine status is not active delete from solr index
    if not med.active:
        try:
            solr = SolrConnection(settings.SOLR_URL)
            solr.delete(id=str(med.id))
            response = solr.commit()
        except Exception as ex:
            return False
        return True

    # index medicine on solr index

    # build the language-tagged name string from all translations
    medicine_translations = MedicineLocal.objects.filter(medicine=med.id)
    medicine_list = ['en^%s' % med.name.strip()]
    for translation in medicine_translations:
        medicine_list.append('%s^%s' % (translation.language,
                                        translation.name.strip()))
    medicine_list = "|".join(medicine_list)  # ex.: en^codeine|pt-br^codeína|es^codeína

    # retrieve actives pharmaceutical forms of currente medicine
    pharm_forms = med.pharmaceuticalform_set.filter(active=True)
    for form in pharm_forms:
        # ex. ^enTablet|es^Tableta|pt-br^Comprimido
        pharma_form_type_translations = "|".join(
            form.pharmaceutical_form_type.get_translations()
        )
        pharma_form_type_list.append(pharma_form_type_translations)

        # ex. ^enTablet|es^Tableta|pt-br^Comprimido|comp^15 mg/ml
        pharma_form_list.append('%s|comp^%s' % (pharma_form_type_translations,
                                                form.composition))

        # create category_list (section and subsection where current
        # pharmaceutical form is used on lists)
        section_pharm_form_list = SectionPharmForm.objects.filter(
            pharmaceutical_form=form)

        for section_pharm_form in section_pharm_form_list:
            # add observations of current section_pharm_form
            if section_pharm_form.only_for_children:
                observation_list.append('only_for_children')
            if section_pharm_form.specialist_care_for_children:
                observation_list.append('specialist_care_for_children')
            if section_pharm_form.restriction_age:
                observation_list.append('restriction_age')
            if section_pharm_form.best_evidence:
                observation_list.append('best_evidence')
            if section_pharm_form.observation:
                observation_list.append('observation')

            section = Section.objects.get(pk=section_pharm_form.section.id)
            section_translations = "|".join(section.get_translations())
            # walk up the section tree so every ancestor category is indexed
            section_tree = section.get_ancestors()
            if section_tree:
                for sec in section_tree:
                    category_translations = "|".join(sec.get_translations())
                    if category_translations not in category_list:
                        category_list.append(category_translations)
            if section_translations not in category_list:
                category_list.append(section_translations)

            # lists of type 'c' are countries; anything else is a plain list
            list_associated = "|".join(section.list.get_translations())
            if section.list.type == 'c':
                if list_associated not in countries:
                    countries.append(list_associated)
            else:
                if list_associated not in lists:
                    lists.append(list_associated)

    # check if current medicine have Evidence summaries
    has_evidence = None
    evidence_total = MedicineEvidenceSummary.objects.filter(
        medicine=med.id).count()
    if evidence_total > 0:
        has_evidence = "true"

    # try to create a connection to a solr server and send medicine
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id=str(med.id),
            type="medicine",
            name=medicine_list,
            pharmaceutical_form=pharma_form_list,
            pharmaceutical_form_type=pharma_form_type_list,
            list=lists,
            country=countries,
            category=category_list,
            observation=observation_list,
            has_evidence=has_evidence,
        )
        response = solr.commit()
    except Exception as ex:
        return False
    return True
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """Return the name of the batch file found at the batch storage url.

        TODO: Who can we toss the requirement at to make this available in
        a canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in ["batch_1.xml", "BATCH_1.xml", "batchfile_1.xml",
                      "batch_2.xml", "BATCH_2.xml", "batch.xml"]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urlparse.urljoin(batch.storage_url, alias)
            try:
                urllib2.urlopen(url)
                validated_batch_file = alias
                break
            except (urllib2.HTTPError, urllib2.URLError):
                continue
        else:
            # the for loop fell through without a break: no alias was found
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        # if not os.path.exists(batch.path):
        #     raise BatchLoaderException("batch does not exist at %s" % batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        When strict is False, a batch that is already loaded is returned
        as-is instead of raising an error.
        """
        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist:
                pass

        LOGGER.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = e.attrib['reelNumber'].strip()
                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    LOGGER.exception(e)
                    continue

                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s", issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()

            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            # best-effort cleanup of the partially-loaded batch; the original
            # load error is what we want to surface, so the purge failure is
            # only logged
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                LOGGER.error("purge batch failed for failed load batch: %s", pbe)
                LOGGER.exception(pbe)
            raise BatchLoaderException(msg)

        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()

        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        """Fetch the Batch row for batch_name, creating it when create=True."""
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        """Create and save a new Batch row, resolving its awardee from the
        org code embedded in the batch name (e.g. batch_curiv_ahwahnee_ver01).
        """
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            parts = batch_name.split("_", 3)
            # drop the leading "batch" token when present
            # NOTE: was `len(parts) is 4` -- identity comparison on an int is
            # implementation-defined; use equality
            if len(parts) == 4:
                parts = parts[1:]
            awardee_org_code, name_part, version = parts
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            LOGGER.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        """Parse an issue METS file, save the Issue and its notes, and load
        every page it references. Returns (issue, pages).
        """
        LOGGER.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:number[1])',
            namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d')

        # attach the Issue to the appropriate Title; if the title is not in
        # the database yet, try to load its marc record first
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except Exception:
            url = 'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn
            LOGGER.info("attempting to load marc record from %s", url)
            management.call_command('load_titles', url)
            title = Title.objects.get(lccn=lccn)

        issue.title = title
        issue.batch = self.current_batch
        issue.save()
        LOGGER.debug("saved issue: %s", issue.url)

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)')
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes = notes
        issue.save()

        # attach pages: lots of logging because it's expensive
        pages = []
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                pages.append(self._load_page(doc, page_div, issue))
            except BatchLoaderException as e:
                LOGGER.error("Failed to load page. doc: %s, page div: %s, issue: %s",
                             doc, page_div, issue)
                LOGGER.exception(e)

        return issue, pages

    def _load_page(self, doc, div, issue):
        """Build, save and return a Page for one np:page div, attaching its
        file names (tiff/jp2/pdf/ocr) and optionally extracting its ocr.
        """
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath(
            'string(.//mods:extent/mods:start)', namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException("could not determine sequence number for page from '%s'" % seq_string)
        page.number = mods.xpath(
            'string(.//mods:detail[@type="page number"])',
            namespaces=ns
        ).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns
        ).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            if reel_number:
                # reel was not declared in batch.xml; record it as implicit
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                LOGGER.warning("unable to find reel number in page metadata")

        LOGGER.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        LOGGER.info("Saving page. issue date: %s, page sequence: %s",
                    issue.date_issued, page.sequence)
        # TODO - consider the possibility of executing the file name
        # assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)').strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes = notes

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.
        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']
            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)
            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    LOGGER.info("Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...",
                                page.issue, page)
                if not page.jp2_width:
                    raise BatchLoaderException("No jp2 width for issue: %s page: %s" % (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException("No jp2 length for issue: %s page: %s" % (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                page = self.process_ocr(page)
        else:
            LOGGER.info("No ocr filename for issue: %s page: %s",
                        page.issue, page)

        LOGGER.debug("saving page: %s", page.url)
        page.save()
        return page

    def process_ocr(self, page):
        """Extract ocr text and word coordinates for page, creating its OCR
        row and per-language texts. Returns the updated page.
        """
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        lang_text_solr = {}
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warning("Language %s does not exist in the database. Defaulting to English.", lang)
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language)
            lang_text_solr[language.code] = text

        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page

    def _process_coordinates(self, page, coords):
        """Write the gzipped word-coordinate json for page to its final
        location, going through a temp file first.
        """
        LOGGER.debug("writing out word coords for %s", page.url)
        # get a temp file in case the coordinates dir is a NFS or S3 mount
        # which have poor multiple write performance
        fd, path = tempfile.mkstemp(text="w", suffix=".coordinates",
                                    dir=settings.TEMP_STORAGE)
        f = open(path, "w")
        f.write(gzip_compress(json.dumps(coords)))
        f.close()
        os.close(fd)
        final_path = models.coordinates_path(page._url_parts())
        try:
            shutil.move(path, final_path)
        except Exception:
            LOGGER.warning("Could not move coordinates to [%s]. Waiting 5 seconds and trying again in case of network mount", final_path)
            time.sleep(5)
            shutil.move(path, final_path)

    def process_coordinates(self, batch_path):
        """Re-extract and write word coordinates for every page of an
        already-loaded batch.
        """
        LOGGER.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE,
                                            batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        LOGGER.warning("Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page."
                                       % (batch_name, page))
                    else:
                        url = urlparse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                        LOGGER.debug("Extracting OCR from url %s", url)
                        lang_text, coords = ocr_extractor(url)
                        self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            LOGGER.exception(msg)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, '')
        return rel_path

    @transaction.atomic
    def purge_batch(self, batch_name):
        """Delete a batch and its issues/pages from the database and solr,
        recording the purge as a LoadBatchEvent.
        """
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        """Delete the batch piece-meal (page by page, issue by issue)."""
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.all():
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        if self.PROCESS_OCR:
            # only open a solr connection when we will actually index
            self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates

    def _find_batch_file(self, batch):
        """Return the name of the batch file found at the batch storage url.

        TODO: Who can we toss the requirement at to make this available in
        a canonical location?
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in [
            "batch_1.xml", "BATCH_1.xml", "batchfile_1.xml",
            "batch_2.xml", "BATCH_2.xml", "batch.xml"
        ]:
            # TODO: might we want 'batch.xml' first? Leaving last for now to
            # minimize impact.
            url = urllib.parse.urljoin(batch.storage_url, alias)
            try:
                urllib.request.urlopen(url)
                validated_batch_file = alias
                break
            except (urllib.error.HTTPError, urllib.error.URLError):
                continue
        else:
            # the for loop fell through without a break: no alias was found
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)
        return validated_batch_file

    def _sanity_check_batch(self, batch):
        # if not os.path.exists(batch.path):
        #    raise BatchLoaderException("batch does not exist at %s" % batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        If the batch is already in the database it is returned unchanged.
        """
        self.pages_processed = 0

        # Trailing slash breaks comparison to link_name below, so strip off
        batch_path = batch_path.rstrip("/")

        _logger.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path)

        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)

            # Create symlink if paths don't match, symlink not already there,
            # and batch_path wasn't input with a BATCH_STORAGE symlink path
            if (batch_path != link_name and not os.path.islink(link_name) and
                not (os.path.islink(settings.BATCH_STORAGE) and
                     batch_path.startswith(os.path.realpath(settings.BATCH_STORAGE)))):
                _logger.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = Batch.objects.get(name=batch_name)
            _logger.info("Batch already loaded: %s", batch_name)
            return batch
        except Batch.DoesNotExist:
            pass

        _logger.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = e.attrib['reelNumber'].strip()
                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urllib.parse.urljoin(batch.storage_url, e.text)
                try:
                    issue = self._load_issue(mets_url)
                except ValueError as e:
                    _logger.exception(e)
                    continue
                # keep django's debug query log from growing unboundedly
                reset_queries()

            # commit new changes to the solr index, if we are indexing
            if self.PROCESS_OCR:
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            _logger.info(msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            # best-effort cleanup of the partially-loaded batch; the original
            # load error is what we want to surface
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                _logger.error("purge batch failed for failed load batch: %s", pbe)
                _logger.exception(pbe)
            raise BatchLoaderException(msg)

        # updates the min and max years of all titles
        set_fulltext_range()
        return batch

    def _get_batch(self, batch_name, batch_source=None, create=False):
        """Fetch the Batch row for batch_name, creating it when create=True."""
        if create:
            batch = self._create_batch(batch_name, batch_source)
        else:
            batch = Batch.objects.get(name=batch_name)
        return batch

    def _create_batch(self, batch_name, batch_source):
        """Create and save a new Batch row, resolving its awardee from the
        org code embedded in the batch name (e.g. batch_curiv_ahwahnee_ver01).
        """
        if Batch.objects.filter(name=batch_name).count() != 0:
            raise BatchLoaderException("batch %s already loaded" % batch_name)
        batch = Batch()
        batch.name = batch_name
        batch.source = batch_source
        try:
            _, org_code, name_part, version = batch_name.split("_", 3)
        except ValueError:
            # a malformed batch name used to escape as a bare ValueError;
            # surface it as a loader error with a clear message instead
            raise BatchLoaderException(
                "unexpected batch name format: %s" % batch_name)
        awardee_org_code = org_code
        try:
            batch.awardee = Awardee.objects.get(org_code=awardee_org_code)
        except Awardee.DoesNotExist:
            msg = "no awardee for org code: %s" % awardee_org_code
            _logger.error(msg)
            raise BatchLoaderException(msg)
        batch.save()
        return batch

    def _load_issue(self, mets_file):
        """Parse an issue METS file, save the Issue and its notes, and load
        every page it references. Returns the saved issue.
        """
        _logger.debug("parsing issue mets file: %s", mets_file)
        doc = etree.parse(mets_file)

        # get the mods for the issue
        div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0]
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)

        # set up a new Issue
        issue = Issue()
        issue.volume = mods.xpath(
            'string(.//mods:detail[@type="volume"]/mods:number[1])',
            namespaces=ns).strip()
        issue.number = mods.xpath(
            'string(.//mods:detail[@type="issue"]/mods:number[1])',
            namespaces=ns).strip()
        issue.edition = int(
            mods.xpath(
                'string(.//mods:detail[@type="edition"]/mods:number[1])',
                namespaces=ns))
        issue.edition_label = mods.xpath(
            'string(.//mods:detail[@type="edition"]/mods:caption[1])',
            namespaces=ns).strip()

        # parse issue date
        date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns)
        issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d')

        # attach the Issue to the appropriate Title; if the title is not in
        # the database yet, try to load its marc record first
        lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])',
                          namespaces=ns).strip()
        try:
            title = Title.objects.get(lccn=lccn)
        except Exception:
            url = settings.MARC_RETRIEVAL_URLFORMAT % lccn
            _logger.info("attempting to load marc record from %s", url)
            management.call_command('load_titles', url)
            title = Title.objects.get(lccn=lccn)

        issue.title = title
        issue.batch = self.current_batch
        issue.save()
        _logger.debug("saved issue: %s", issue.url)

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)')
            note = models.IssueNote(type=type, label=label, text=text)
            notes.append(note)
        issue.notes.set(notes, bulk=False)
        issue.save()

        # attach pages: lots of logging because it's expensive
        for page_div in div.xpath('.//mets:div[@TYPE="np:page"]',
                                  namespaces=ns):
            try:
                self._load_page(doc, page_div, issue)
                self.pages_processed += 1
            except BatchLoaderException as e:
                _logger.exception(e)

        return issue

    def _load_page(self, doc, div, issue):
        """Build, save and return a Page for one np:page div, attaching its
        file names (tiff/jp2/pdf/ocr) and optionally extracting its ocr.
        """
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath('string(.//mods:extent/mods:start)',
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'" % seq_string)
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            reel = models.Reel.objects.get(number=reel_number,
                                           batch=self.current_batch)
            page.reel = reel
        except models.Reel.DoesNotExist:
            if reel_number:
                # reel was not declared in batch.xml; record it as implicit
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                _logger.warning("unable to find reel number in page metadata")

        _logger.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        _logger.info("Saving page. issue date: %s, page sequence: %s",
                     issue.date_issued, page.sequence)

        # TODO - consider the possibility of executing the file name
        # assignments (below) before this page.save().
        page.save()

        notes = []
        for mods_note in mods.xpath('.//mods:note', namespaces=ns):
            type = mods_note.xpath('string(./@type)')
            label = mods_note.xpath('string(./@displayLabel)')
            text = mods_note.xpath('string(.)').strip()
            note = models.PageNote(type=type, label=label, text=text)
            notes.append(note)
        page.notes.set(notes, bulk=False)

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.
        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']
            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)
            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    _logger.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder..." % (page.issue, page))
                    # fall back to opening the jp2 itself for its dimensions
                    im = Image.open(page.jp2_abs_filename)
                    page.jp2_width, page.jp2_length = im.size

                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s" % (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s" % (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incurr overhead of extracting ocr text, word coordinates
            # and indexing unless the batch loader has been set up to do it
            if self.PROCESS_OCR:
                self.process_ocr(page)
        else:
            _logger.info("No ocr filename for issue: %s page: %s" %
                         (page.issue, page))

        _logger.debug("saving page: %s" % page.url)
        page.save()
        return page

    def process_ocr(self, page, index=True):
        """Extract ocr text and word coordinates for page, creating its OCR
        row and per-language texts; optionally index the page in solr.
        """
        _logger.debug("extracting ocr text and word coords for %s" % page.url)

        url = urllib.parse.urljoin(self.current_batch.storage_url,
                                   page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language, text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s" % page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()

    def _process_coordinates(self, page, coords):
        """Write the gzipped word-coordinate json for page to its final
        location.
        """
        _logger.debug("writing out word coords for %s" % page.url)

        f = open(models.coordinates_path(page._url_parts()), "wb")
        f.write(gzip_compress(json.dumps(coords).encode('utf-8')))
        f.close()

    def process_coordinates(self, batch_path):
        """Re-extract and write word coordinates for every page of an
        already-loaded batch.
        """
        _logger.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    # a page with no OCR used to crash the whole run; skip it
                    # with a warning instead (matching the loader behavior)
                    if not page.ocr_filename:
                        _logger.warning(
                            "Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page.",
                            batch_name, page)
                        continue
                    url = urllib.parse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)
                    lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            raise BatchLoaderException(msg)

    def storage_relative_path(self, path):
        """returns a relative path for a given file path within a batch, so
        that storage can be re-homed without having to rewrite paths in the db
        """
        rel_path = path.replace(self.current_batch.storage_url, '')
        return rel_path

    def purge_batch(self, batch_name):
        """Delete a batch and its issues/pages from the database and solr,
        recording the purge as a LoadBatchEvent.
        """
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up symlinks if exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                _logger.info("Removing symlink %s", link_name)
                os.remove(link_name)
            # updates the min and max years of all titles
            set_fulltext_range()
        except Exception as e:
            msg = "purge failed: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)

    def _purge_batch(self, batch):
        """Delete the batch piece-meal (page by page, issue by issue)."""
        batch_name = batch.name
        # just delete batch causes memory to bloat out
        # so we do it piece-meal
        for issue in batch.issues.all():
            for page in issue.pages.all():
                page.delete()
                # remove coordinates
                if os.path.exists(models.coordinates_path(page._url_parts())):
                    os.remove(models.coordinates_path(page._url_parts()))
                reset_queries()
            issue.delete()
        batch.delete()
        if self.PROCESS_OCR:
            self.solr.delete_query('batch:"%s"' % batch_name)
            self.solr.commit()
def solr_index(med):
    """Index (or, for inactive medicines, un-index) a medicine in solr.

    Builds the pipe-delimited translation strings and category/observation
    lists expected by the solr schema, then sends the document.

    Returns True on success, False when talking to solr failed (the solr
    interaction is deliberately best-effort).
    """
    lists = []
    countries = []
    pharma_form_list = []
    pharma_form_type_list = []
    category_list = []
    observation_list = []

    # if medicine status is not active delete from solr index
    if not med.active:
        try:
            solr = SolrConnection(settings.SOLR_URL)
            solr.delete(id=str(med.id))
            solr.commit()
        except Exception:
            return False
        return True

    # index medicine on solr index
    medicine_translations = MedicineLocal.objects.filter(medicine=med.id)
    medicine_list = ['en^%s' % med.name.strip()]
    for translation in medicine_translations:
        medicine_list.append('%s^%s' % (translation.language,
                                        translation.name.strip()))
    # ex.: en^codeine|pt-br^codeína|es^codeína
    medicine_list = "|".join(medicine_list)

    # retrieve active pharmaceutical forms of the current medicine
    pharm_forms = med.pharmaceuticalform_set.filter(active=True)
    for form in pharm_forms:
        # ex. en^Tablet|es^Tableta|pt-br^Comprimido
        pharma_form_type_translations = "|".join(
            form.pharmaceutical_form_type.get_translations())
        pharma_form_type_list.append(pharma_form_type_translations)

        # ex. en^Tablet|es^Tableta|pt-br^Comprimido|comp^15 mg/ml
        pharma_form_list.append(
            '%s|comp^%s' % (pharma_form_type_translations, form.composition))

        # create category_list (section and subsection where current
        # pharmaceutical form is used on lists)
        section_pharm_form_list = SectionPharmForm.objects.filter(
            pharmaceutical_form=form)

        for section_pharm_form in section_pharm_form_list:
            # add observations of current section_pharm_form
            if section_pharm_form.only_for_children:
                observation_list.append('only_for_children')
            if section_pharm_form.specialist_care_for_children:
                observation_list.append('specialist_care_for_children')
            if section_pharm_form.restriction_age:
                observation_list.append('restriction_age')
            if section_pharm_form.best_evidence:
                observation_list.append('best_evidence')
            if section_pharm_form.observation:
                observation_list.append('observation')

            section = Section.objects.get(pk=section_pharm_form.section.id)
            section_translations = "|".join(section.get_translations())

            # ancestors first, then the section itself, deduplicated
            section_tree = section.get_ancestors()
            if section_tree:
                for sec in section_tree:
                    category_translations = "|".join(sec.get_translations())
                    if category_translations not in category_list:
                        category_list.append(category_translations)

            if section_translations not in category_list:
                category_list.append(section_translations)

            # a list of type 'c' is a country list; anything else is a
            # generic list
            list_associated = "|".join(section.list.get_translations())
            if section.list.type == 'c':
                if list_associated not in countries:
                    countries.append(list_associated)
            else:
                if list_associated not in lists:
                    lists.append(list_associated)

    # check if current medicine has Evidence summaries
    has_evidence = None
    evidence_total = MedicineEvidenceSummary.objects.filter(
        medicine=med.id).count()
    if evidence_total > 0:
        has_evidence = "true"

    # try to create a connection to a solr server and send medicine
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id=str(med.id),
            type="medicine",
            name=medicine_list,
            pharmaceutical_form=pharma_form_list,
            pharmaceutical_form_type=pharma_form_type_list,
            list=lists,
            country=countries,
            category=category_list,
            observation=observation_list,
            has_evidence=has_evidence,
        )
        solr.commit()
    except Exception:
        return False

    return True