def get_codes(self):
    """Return the authority codes of this item, collecting them on first use.

    A truthy ``self.codes`` is returned as is.  Otherwise ``self.codes`` is
    reset to a copy of ``CODES`` and filled from the
    'controllo_di_autorità' template found among the templates that
    ``wtp.data_from_templates`` returns for ``self.name`` (``lang='it'``).

    Any exception raised while fetching the templates is logged with
    ``logger.error`` and the page is treated as having no templates.

    Returns:
        ``self.codes``, with the 'viaf', 'sbn' and 'lccn' entries replaced by
        the UTF-8 encoded template values where the template provides them.
    """
    # Codes collected by a previous call are reused untouched.
    if self.codes:
        return self.codes

    self.codes = CODES.copy()

    page_templates = []
    try:
        if self.name:
            page_templates = wtp.data_from_templates(self.name, lang='it')
    except Exception as e:
        logger.error(e)

    # Keep every authority-control template; only the first one is read.
    authority = []
    for template in page_templates:
        if normalize_string(template['name']) == 'controllo_di_autorità':
            authority.append(template)

    if authority:
        fields = authority[0]['data']
    else:
        fields = {}

    # (template parameter, key in self.codes), processed in this order.
    wanted = (('VIAF', 'viaf'), ('SBN', 'sbn'), ('LCCN', 'lccn'))
    for template_key, code_key in wanted:
        if fields.get(template_key) is not None:
            self.codes[code_key] = fields[template_key].encode('utf-8')

    return self.codes
# Report how much has been collected so far and how much is left to fetch.
logger.debug('no. of keys already collected: {no}'.format(
    no=len(set(wikipedia.keys()))))
logger.debug('no. of pages in it.wiki with authority control, still to get: {no}'.format(
    no=len(wikipages_to_get)))

# `count` keeps its initial value when there are no pages to process.
count = 0
for count, page in enumerate(wikipages_to_get, 1):
    logger.debug(count)

    # Reset per page; both stay None when the template has no such value.
    viaf_code = None
    sbn_code = None

    templates = []
    try:
        templates = wtp.data_from_templates(page, lang='it')
    except Exception as e:
        # Best effort: a page whose templates cannot be fetched is handled
        # as a page without templates, but the failure is logged instead of
        # being silently dropped.  Catching Exception (not a bare except)
        # lets KeyboardInterrupt/SystemExit stop the loop.
        logger.error('could not get templates for page %s: %s', page, e)

    # Only the first authority-control template of the page is read.
    ac_template = [t for t in templates
                   if normalize_string(t['name']) == 'controllo_di_autorità']
    ac_data = ac_template[0]['data'] if ac_template else {}
    logger.debug('page: %s, ac_data: %s', page, ac_data)

    if ac_data.get('VIAF') is not None:
        logger.debug('VIAF from template')
        viaf_code = ac_data['VIAF']

    if ac_data.get('SBN') is not None:
        logger.debug('SBN from template')
        sbn_code = ac_data['SBN']
def analyze_templates(self):
    """Build place candidates from the templates of ``self.page``.

    For every template whose treated name is a key of
    ``TEMPLATES_TO_ANALYZE_IT``, the non-empty values of the attributes
    listed there are cleaned up, looked up with ``NR.query`` and, when one of
    the resulting types is in ``ALLOWEDTYPES``, wrapped in a
    ``PlaceCandidate``.

    A ``ValueError`` raised by ``wtp.data_from_templates`` is treated as
    "the page has no templates".

    Returns:
        The list of ``PlaceCandidate`` objects found (possibly empty); every
        candidate gets the same score, ``1 / number of candidates``.
    """
    logger.debug(self.page)
    finalplaces = []

    try:
        templates = wtp.data_from_templates(self.page, self.lang)
    except ValueError:
        templates = []
    logger.debug(templates)

    for t in templates:
        name = self._treat(t['name'])
        if name not in TEMPLATES_TO_ANALYZE_IT:
            continue

        attributes = TEMPLATES_TO_ANALYZE_IT[name]
        # items() instead of iteritems(): works on both Python 2 and 3.
        tdata = {self._treat(k): v.lower() for k, v in t['data'].items()}
        logger.debug(tdata)

        locations = [tdata[attr] for attr in attributes
                     if attr in tdata and tdata[attr] != '']
        logger.debug(locations)

        for place in locations:
            logger.debug(place)
            place = place.replace('italia', '')
            place = place.strip().strip(',')

            # Keep only what follows the first PIXELS match.
            # NOTE(review): presumably an image-size prefix - confirm.
            found = PIXELS.search(place)
            if found:
                logger.debug(found.start())
                logger.debug(found.end())
                place = place[found.end():].strip()

            # NOTE(review): CAP presumably matches a postal code and
            # _find_index picks the non-address chunk - confirm.
            if CAP.search(place):
                chunks = [s.strip() for s in CAP.split(place)
                          if s.strip() != '']
                place = chunks[self._find_index(chunks)]

            # Cut out the first PARENTHESIS match, then the first CURLY one.
            found = PARENTHESIS.search(place)
            if found:
                place = place[:found.start()] + place[found.end():]

            found = CURLY.search(place)
            if found:
                place = place[:found.start()] + place[found.end():]

            if ',' in place:
                chunks = [s.strip() for s in place.split(',')
                          if s.strip() != '']
                not_address = self._find_index(chunks)
                logger.debug(not_address)
                place = chunks[not_address]

            place = place.split(' - ')[0]
            place = place.strip(',').strip()
            place = place.replace('[', '').replace(']', '')
            logger.debug(place)

            reconres = NR.query(query=place)
            if not reconres and DI.search(place):
                # Retry with the last piece left by splitting on DI.
                place = DI.split(place)[-1]
                reconres = NR.query(query=place)

            # Start from scratch for every place: otherwise a place with no
            # result would be judged (and typed) with the types and fathers
            # left over from the previous place.
            types = []
            fathers = []
            if reconres:
                types = self._get_types(reconres)
                fathers = self._get_fathers(reconres)

            logger.debug('place: name={name}, types={types}'.format(
                name=place.encode('utf-8'), types=types))

            if set(lau for lau, id_ in types).intersection(ALLOWEDTYPES):
                candidate = PlaceCandidate(
                    name=place.encode('utf-8').title(), fathers=fathers)
                place_type = candidate.set_type_from_candidates(types)
                candidate.set_id_from_candidates(types, place_type)
                finalplaces.append(candidate)

    # Every candidate is equally likely; skipped when nothing was found so
    # that there is no division by zero.
    if finalplaces:
        score = 1.0 / float(len(finalplaces))
        for cand in finalplaces:
            cand.score = score

    return finalplaces