my_ana = StemmingAnalyzer() | CharsetFilter(accent_map)
	# this tokenizer allows for searching on full IDs as well as on components between : chars
	# however, in search there is a problem with searching for : chars - escaping does not work, hence introduced the hack below to replace : with ?
	# uncommenting LoggingFilter() and running the script with -d allows for debugging the tokenization
	my_id_ana = RegexTokenizer(expression=re.compile('[^ ]+')) | LowercaseFilter() | TeeFilter(PassFilter(), IntraWordFilter(delims=u':',splitnums=False) | StopFilter(stoplist=frozenset(['bbmri-eric', 'id', 'contactid', 'networkid', 'collection']))) # | LoggingFilter()
	schema = Schema(id=TEXT(stored=True,analyzer=my_id_ana), type=STORED, name=TEXT(stored=True,analyzer=my_ana), acronym=ID, description=TEXT(analyzer=my_ana), address=TEXT(analyzer=my_ana), phone=TEXT, email=TEXT, juridical_person=TEXT(analyzer=my_ana), bioresource_reference=TEXT, head_name=TEXT(analyzer=my_ana),contact_id=TEXT(analyzer=my_id_ana))
	ix = create_in(indexdir, schema)
	writer = ix.writer()

	def getFullName(entity):
		"""Return the head's full name assembled from the name fields of entity.

		The fields are read with entity.get() in the order title-before-name,
		first name, last name, title-after-name. Missing or falsy values are
		skipped and the remaining ones are joined with single spaces.
		"""
		nameKeys = (
			'head_title_before_name',
			'head_firstname',
			'head_lastname',
			'head_title_after_name',
		)
		nameParts = [entity.get(key) for key in nameKeys]
		return " ".join(part for part in nameParts if part)

	# Add one COLLECTION document per collection. The contact is taken from the
	# collection itself when it has one, otherwise from its biobank, otherwise None.
	for collection in dir.getCollections():
		log.debug("Analyzing collection " + collection['id'])
		biobankId = dir.getCollectionBiobankId(collection['id'])
		biobank = dir.getBiobankById(biobankId)
		if 'contact' in collection:
			contactId = collection['contact']['id']
		elif 'contact' in biobank:
			contactId = biobank['contact']['id']
		else:
			contactId = None
		writer.add_document(
			id=collection['id'],
			type=u"COLLECTION",
			name=collection.get('name'),
			description=collection.get('description'),
			acronym=collection.get('acronym'),
			bioresource_reference=collection.get('bioresource_reference'),
			head_name=getFullName(collection),
			contact_id=contactId
		)

	# Add one BIOBANK document per biobank; contact_id is None when the biobank
	# record has no 'contact' entry.
	for biobank in dir.getBiobanks():
		log.debug("Analyzing biobank " + biobank['id'])
		contactId = biobank['contact']['id'] if 'contact' in biobank else None
		writer.add_document(
			id=biobank['id'],
			type=u"BIOBANK",
			name=biobank.get('name'),
			description=biobank.get('description'),
			acronym=biobank.get('acronym'),
			juridical_person=biobank.get('juridical_person'),
			bioresource_reference=biobank.get('bioresource_reference'),
			head_name=getFullName(biobank),
			contact_id=contactId
		)
예제 #2
0
# Main code

# Instantiate Directory, configured from `args` (presumably the parsed command-line
# options - TODO confirm) and `pp`.
# NOTE(review): the name `dir` shadows Python's builtin dir() for the rest of the module.
dir = Directory(purgeCaches=args.purgeCaches, debug=args.debug, pp=pp)

# Log the total number of biobanks and collections reported by the directory.
log.info('Total biobanks: ' + str(dir.getBiobanksCount()))
log.info('Total collections: ' + str(dir.getCollectionsCount()))

if args.negotiator:
    contactsToCollections = {}
    collectionsToContacts = {}
    contactsToEmails = {}

    for collection in dir.getCollections():
        log.debug("Analyzing collection " + collection['id'])
        collectionId = collection['id']
        biobankId = dir.getCollectionBiobankId(collection['id'])
        biobank = dir.getBiobankById(biobankId)
        if 'contact' in collection:
            contactId = collection['contact']['id']
            contactEmail = collection['contact']['email']
            if contactId not in contactsToEmails:
                contactsToEmails[contactId] = contactEmail
            else:
                if (contactsToEmails[contactId] != contactEmail):
                    log.error(
                        "Contact mismatch for %s: previously provided <%s>, now provided <%s>"
                        %
                        (contactId, contactsToEmails[contactId], contactEmail))
            log.debug("   collection %s maps to %s <%s>" %
                      (collectionId, contactId, contactEmail))
            if contactId in contactsToCollections:
# Main code

# Instantiate Directory, configured from `args` (presumably the parsed command-line
# options - TODO confirm) and `pp`.
# NOTE(review): the name `dir` shadows Python's builtin dir() for the rest of the module.
dir = Directory(purgeCaches=args.purgeCaches, debug=args.debug, pp=pp)

# Log the total number of biobanks and collections reported by the directory.
log.info('Total biobanks: ' + str(dir.getBiobanksCount()))
log.info('Total collections: ' + str(dir.getCollectionsCount()))

# Per-key aggregates, keyed by the value returned from dir.getBiobankNN()
# (presumably a country/national-node code, going by the variable names).
# Each value is a set of IDs.
countryBiobanks = {}
countryBiobanksWithCollections = {}
countryCollections = {}

# Walk all collections and record, under the NN of the owning biobank,
# the biobank ID (in both biobank maps) and the collection ID.
for collection in dir.getCollections():
	collectionId = collection['id']
	log.debug("Analyzing collection " + collectionId)
	biobankId = dir.getCollectionBiobankId(collectionId)
	# The biobank record is looked up but not read inside this loop.
	biobank = dir.getBiobankById(biobankId)
	NN = dir.getBiobankNN(biobankId)
	# setdefault creates the empty set the first time an NN is seen.
	countryBiobanks.setdefault(NN, set()).add(biobankId)
	countryBiobanksWithCollections.setdefault(NN, set()).add(biobankId)
	countryCollections.setdefault(NN, set()).add(collectionId)
	
for biobank in dir.getBiobanks():
	biobankId = biobank['id']
	NN = dir.getBiobankNN(biobankId)