def get_page_for(name, expect_party):
    """Fetch the German Wikipedia article for the politician ``name``.

    The article URL is derived from ``name`` (spaces become underscores)
    unless ``url_override`` maps that derived URL to a replacement. When
    ``get_disambiguated_url`` returns a URL for the fetched page, that URL
    is fetched instead; ``expect_party`` is passed through to that helper.

    Returns:
        ``(url, soup)`` for the resolved page, or ``None`` when the article
        does not exist and ``name`` is listed in ``known_missing``.

    Raises:
        AssertionError: if the article is missing without being listed in
            ``known_missing``, if the disambiguated page is missing, or if
            the disambiguated page still yields a further disambiguation
            URL.
    """
    # Minuses and umlauts can stay in the URL as they are.
    url = 'https://de.wikipedia.org/wiki/' + name.replace(' ', '_')
    if url in url_override:
        print('[WARN] Using override for ' + name)
        url = url_override[url]

    soup = as_soup(nice.get(url))
    if is_not_found(soup):
        if name in known_missing:
            print('[WARN] Not found (and whitelisted): ' + name)
            return None
        print('[ERR!] Unexpectedly not found: ' + name)
        raise AssertionError(name)

    disambig_url = get_disambiguated_url(soup, expect_party)
    if disambig_url is None:
        return url, soup

    url = disambig_url
    path = nice.get(url)
    soup = as_soup(path)
    if is_not_found(soup):
        # This really, really should not happen.
        # Let's hope that female politicians don't have 'Politikerin' as
        # disambiguation.
        print('[ERR!] Confused about: ' + name)
        raise AssertionError(path)

    # A second level of disambiguation wouldn't even make sense; hopefully
    # there is only one politician per name. Other parts of this toolchain
    # fail horribly in that case anyway.
    # Raised explicitly (like the checks above) so that it survives `-O`,
    # and reports the actual name rather than the literal string 'name'.
    if get_disambiguated_url(soup, expect_party) is not None:
        raise AssertionError(name)
    return url, soup
def get_page_for(bird):
    """Fetch the German Wikipedia page for ``bird``.

    ``bird`` is a mapping; ``bird['de_name']`` selects the article and
    ``bird['latin_name']`` is handed to ``get_disambiguated_url``. When that
    helper returns a URL, that URL is fetched instead of the first page.

    Returns:
        ``(url, soup)`` for the page that was finally used.

    Raises:
        AssertionError: if either fetched page is reported as not found, or
            if the disambiguated page still contains the
            ``Vorlage_Begriffsklaerung`` table.
    """
    overrides = {
        # 'Kleiber' exists both as a Familie and as an Art page.
        # The Familie page has the better photo.
        'Kleiber': 'https://de.wikipedia.org/wiki/Kleiber_%28Familie%29',
    }
    german_name = bird['de_name']
    page_url = overrides.get(german_name)
    if page_url is None:
        # Minuses and umlauts can stay in the URL as they are.
        page_url = ('https://de.wikipedia.org/wiki/'
                    + german_name.replace(' ', '_'))

    page_soup = wikify_each.as_soup(nice.get(page_url))
    if wikify_each.is_not_found(page_soup):
        print('[ERR!] Unexpectedly not found: ' + german_name)
        raise AssertionError(german_name)

    redirect_url = get_disambiguated_url(page_soup, bird['latin_name'])
    if redirect_url is None:
        return page_url, page_soup

    local_path = nice.get(redirect_url)
    redirect_soup = wikify_each.as_soup(local_path)
    if wikify_each.is_not_found(redirect_soup):
        # A disambiguation target that does not exist should never happen.
        print('[ERR!] Confused about: ' + german_name)
        raise AssertionError(local_path)

    # Landing on yet another disambiguation page wouldn't make sense.
    still_ambiguous = redirect_soup.find('table', id='Vorlage_Begriffsklaerung')
    assert still_ambiguous is None
    return redirect_url, redirect_soup
def get_img_desc(img_desc_url):
    """Fetch an image description page and extract its metadata.

    The page at ``img_desc_url`` is fetched through ``nice.get`` and parsed
    with ``as_soup``.

    Returns:
        A dict with the keys ``'copyright'``, ``'license'`` and ``'url'``,
        filled from ``parse_copyright``, ``parse_license`` and
        ``parse_img_url`` respectively.
    """
    desc_soup = as_soup(nice.get(img_desc_url))
    description = {}
    description['copyright'] = parse_copyright(desc_soup)
    description['license'] = parse_license(desc_soup)
    description['url'] = parse_img_url(desc_soup)
    return description
def checkout(pid, fields):
    """Produce the ready-to-use image (and thumbnail) for politician ``pid``.

    ``fields['url']`` is fetched through ``nice.get``. If
    ``fields.get('is_compressed')`` is truthy, the download is unpacked with
    ``unzip -p`` into ``<prefix>_raw.jpg``; otherwise ``<prefix>_raw.jpg``
    becomes a symlink to the download. ``convert`` then writes
    ``<prefix>.jpg`` and, unless ``CHOICE_MODE`` is set, ``<prefix>_t.jpg``.

    Returns:
        A dict with ``'pathToImage'``, ``'pathToThumb'`` and ``'license'``,
        plus ``'copyright'`` when ``fields`` contains that key.

    Raises:
        subprocess.CalledProcessError: if ``unzip`` exits non-zero.
    """
    prefix = os.path.join(DIR_PREFIX, pid)
    downloaded = nice.get(fields['url'])

    # Provide '_raw' for intermediate processing.
    raw_copy = prefix + '_raw.jpg'
    if fields.get('is_compressed'):
        with open(raw_copy, 'wb') as sink:
            subprocess.run(
                ['unzip', '-p', downloaded],
                stdout=sink,
                stderr=subprocess.PIPE,
                check=True,
            )
        source = raw_copy
    else:
        # Need '../' to get out of 'preview/'.
        os.symlink('../' + downloaded, raw_copy)
        source = downloaded

    # Something about digitally rotated images (Michael Grosse-Brömer, 154)
    # doesn't work as it should, so rotate that one by hand.
    rotation = ['-rotate', '-90'] if '154' in pid else []

    # Provide ready-to-use image.
    convert(
        source,
        '-resize', '330x330^',
        '-gravity', 'north',
        '-extent', '330x330',
        *rotation,
        '-strip',
        prefix + '.jpg',
    )

    if not CHOICE_MODE:
        # Provide thumbnail.
        # TODO: Use '-strip'. Not done right now in order to avoid blowing
        # up 'heavy' even more.
        convert(
            source,
            '-thumbnail', '75x75^',
            '-gravity', 'north',
            '-extent', '75x75',
            *rotation,
            prefix + '_t.jpg',
        )
        # Retract '_raw'.
        # NOTE(review): the cleanup is assumed to belong to the
        # non-CHOICE_MODE branch (so '_raw' stays around in choice mode);
        # confirm against the original layout.
        os.remove(raw_copy)

    result = dict(
        pathToImage=pid + '.jpg',
        pathToThumb=pid + '_t.jpg',
        license=fields['license'],
    )
    if 'copyright' in fields:
        result['copyright'] = fields['copyright']
    return result
def crawl_pages_inplace(data):
    """Fetch every entry's page and record the result on the entry.

    For each mapping in ``data``, ``nice.get(entry['page'])`` is stored
    under the key ``'page_file'``. The entries are modified in place and
    nothing is returned.

    Raises:
        AssertionError: if an entry already has a ``'page_file'`` key.
    """
    for record in data:
        already_crawled = 'page_file' in record
        assert not already_crawled
        fetched = nice.get(record['page'])
        record['page_file'] = fetched
def construct_root(party, url):
    """Build the root record for ``party``.

    Returns:
        A dict with ``'party'`` and ``'link'`` set to the arguments and
        ``'filename'`` set to the result of ``nice.get(url)``.
    """
    root = dict(party=party, link=url)
    root['filename'] = nice.get(url)
    return root
def checkout(bid, fields, has_drawing: bool):
    """Produce the pubweb and HoT images for the bird ``bid``.

    ``fields['url']`` is fetched through ``nice.get`` and converted twice
    with ``checkout_hot_poli.convert``: once at ``RESOLUTION_PUBWEB`` into
    the ``checkout_hot_poli.DIR_PREFIX`` directory and once at
    ``RESOLUTION_HOT`` into ``HOT_DIR_PREFIX``. Unless ``has_drawing`` is
    true, the HoT image is additionally copied to ``<bid>-drawing.jpg``.

    Returns:
        A dict with ``'filename'`` and ``'license'``, plus ``'copyright'``
        when ``fields`` contains that key.
    """
    # Crop anchor per bird; anything not listed here is cropped around the
    # center.
    gravity_by_bird = {
        'buntspecht': 'north',
        'girlitz': 'north',
        'grauschnaepper': 'north',
        'mehlschwalbe': 'northeast',
        'amsel': 'west',
        'haussperling': 'west',
        'rauchschwalbe': 'west',
        'rotkehlchen': 'east',
        'weisskopfseeadler': 'west',
        'hausrotschwanz': 'east',
        'star': 'east',
        'stieglitz': 'west',
        'tuerkentaube': 'east',
    }
    anchor = gravity_by_bird.get(bid, 'center')

    pubweb_base = os.path.join(checkout_hot_poli.DIR_PREFIX, bid)
    hot_base = os.path.join(HOT_DIR_PREFIX, bid)
    source = nice.get(fields['url'])

    # Ready-to-use image for pubweb first, then the one for HoT.
    targets = (
        (RESOLUTION_PUBWEB, pubweb_base),
        (RESOLUTION_HOT, hot_base),
    )
    for resolution, base in targets:
        checkout_hot_poli.convert(
            source,
            '-resize', resolution + '^>',
            '-strip',
            # It shouldn't ever be necessary to actually cut down the image
            # vertically. However, the code should still do something
            # reasonable.
            '-gravity', anchor,
            '-extent', resolution + '>',
            base + '.jpg',
        )

    if not has_drawing:
        copyfile(hot_base + '.jpg', hot_base + '-drawing.jpg')
    else:
        print(
            '[INFO] skipped {}-drawing.jpg: we have an actual drawing!'.format(
                bid))

    result = dict(filename=bid + '.jpg', license=fields['license'])
    if 'copyright' in fields:
        result['copyright'] = fields['copyright']
    return result