# Analyzer for free-text fields: a StemmingAnalyzer piped through a
# CharsetFilter built from accent_map.
my_ana = StemmingAnalyzer() | CharsetFilter(accent_map)

# Analyzer for identifier fields.
# This tokenizer allows for searching on full IDs as well as on the components
# between ':' chars. However, in search there is a problem with searching for
# ':' chars - escaping does not work, hence a hack is used elsewhere in the
# script to replace ':' with '?'.
# To debug the tokenization, append `| LoggingFilter()` to the chain below and
# run the script with -d.
my_id_ana = (
    RegexTokenizer(expression=re.compile('[^ ]+'))
    | LowercaseFilter()
    | TeeFilter(
        # First branch: the token as-is, so the full ID stays searchable.
        PassFilter(),
        # Second branch: the token split on ':' with the listed generic
        # components dropped by the stop filter.
        IntraWordFilter(delims=u':', splitnums=False)
        | StopFilter(
            stoplist=frozenset(
                ['bbmri-eric', 'id', 'contactid', 'networkid', 'collection']
            )
        )
    )
)

# One schema is shared by collection and biobank documents; `type` is stored
# only and holds u"COLLECTION" or u"BIOBANK".
schema = Schema(
    id=TEXT(stored=True, analyzer=my_id_ana),
    type=STORED,
    name=TEXT(stored=True, analyzer=my_ana),
    acronym=ID,
    description=TEXT(analyzer=my_ana),
    address=TEXT(analyzer=my_ana),
    phone=TEXT,
    email=TEXT,
    juridical_person=TEXT(analyzer=my_ana),
    bioresource_reference=TEXT,
    head_name=TEXT(analyzer=my_ana),
    contact_id=TEXT(analyzer=my_id_ana),
)

ix = create_in(indexdir, schema)
writer = ix.writer()


def getFullName(entity):
    """Return the head's full name for *entity* as one space-separated string.

    The name is assembled, in order, from the 'head_title_before_name',
    'head_firstname', 'head_lastname' and 'head_title_after_name' entries of
    *entity*; entries that are missing or falsy are left out.
    """
    name_parts = [
        entity.get('head_title_before_name'),
        entity.get('head_firstname'),
        entity.get('head_lastname'),
        entity.get('head_title_after_name'),
    ]
    return " ".join(part for part in name_parts if part)


# Index every collection.
for collection in dir.getCollections():
    log.debug("Analyzing collection " + collection['id'])
    biobankId = dir.getCollectionBiobankId(collection['id'])
    biobank = dir.getBiobankById(biobankId)
    # Prefer the collection's own contact; fall back to the contact of its
    # biobank; None when neither has one. The generator is lazy, so the
    # biobank is only inspected when the collection has no contact.
    contactId = next(
        (
            owner['contact']['id']
            for owner in (collection, biobank)
            if 'contact' in owner
        ),
        None,
    )
    writer.add_document(
        id=collection['id'],
        type=u"COLLECTION",
        name=collection.get('name'),
        description=collection.get('description'),
        acronym=collection.get('acronym'),
        bioresource_reference=collection.get('bioresource_reference'),
        head_name=getFullName(collection),
        contact_id=contactId,
    )

# Index every biobank.
for biobank in dir.getBiobanks():
    log.debug("Analyzing biobank " + biobank['id'])
    contactId = biobank['contact']['id'] if 'contact' in biobank else None
    writer.add_document(
        id=biobank['id'],
        type=u"BIOBANK",
        name=biobank.get('name'),
        description=biobank.get('description'),
        acronym=biobank.get('acronym'),
        juridical_person=biobank.get('juridical_person'),
        bioresource_reference=biobank.get('bioresource_reference'),
        head_name=getFullName(biobank),
        contact_id=contactId,
    )
# NOTE(review): no writer.commit() is visible in this excerpt - confirm that
# the writer is committed by the code that follows.
# Main code dir = Directory(purgeCaches=args.purgeCaches, debug=args.debug, pp=pp) log.info('Total biobanks: ' + str(dir.getBiobanksCount())) log.info('Total collections: ' + str(dir.getCollectionsCount())) if args.negotiator: contactsToCollections = {} collectionsToContacts = {} contactsToEmails = {} for collection in dir.getCollections(): log.debug("Analyzing collection " + collection['id']) collectionId = collection['id'] biobankId = dir.getCollectionBiobankId(collection['id']) biobank = dir.getBiobankById(biobankId) if 'contact' in collection: contactId = collection['contact']['id'] contactEmail = collection['contact']['email'] if contactId not in contactsToEmails: contactsToEmails[contactId] = contactEmail else: if (contactsToEmails[contactId] != contactEmail): log.error( "Contact mismatch for %s: previously provided <%s>, now provided <%s>" % (contactId, contactsToEmails[contactId], contactEmail)) log.debug(" collection %s maps to %s <%s>" % (collectionId, contactId, contactEmail)) if contactId in contactsToCollections:
# Main code
dir = Directory(purgeCaches=args.purgeCaches, debug=args.debug, pp=pp)

log.info('Total biobanks: ' + str(dir.getBiobanksCount()))
log.info('Total collections: ' + str(dir.getCollectionsCount()))

# Per-country aggregates, each mapping the value returned by
# dir.getBiobankNN() to a set of IDs.
countryBiobanks = {}
countryBiobanksWithCollections = {}
countryCollections = {}

# First pass: walk the collections and record, under the NN of the owning
# biobank, both the biobank ID and the collection ID.
for collection in dir.getCollections():
    collectionId = collection['id']
    log.debug("Analyzing collection " + collectionId)
    biobankId = dir.getCollectionBiobankId(collectionId)
    biobank = dir.getBiobankById(biobankId)
    NN = dir.getBiobankNN(biobankId)
    # setdefault creates the per-country set the first time an NN is seen.
    countryBiobanks.setdefault(NN, set()).add(biobankId)
    countryBiobanksWithCollections.setdefault(NN, set()).add(biobankId)
    countryCollections.setdefault(NN, set()).add(collectionId)

# Second pass: walk all biobanks.
for biobank in dir.getBiobanks():
    biobankId = biobank['id']
    NN = dir.getBiobankNN(biobankId)
    # NOTE(review): NN is not used within the visible part of this loop -
    # presumably it is consumed by code that follows this excerpt; verify.