Example No. 1
def get_page_for(name, expect_party):
    # Minuses and Umlauts can stay.  Hooray!
    urlish_name = name.replace(' ', '_')
    url = 'https://de.wikipedia.org/wiki/' + urlish_name
    if url in url_override:
        print('[WARN] Using override for ' + name)
        url = url_override[url]
    path = nice.get(url)
    soup = as_soup(path)
    if is_not_found(soup):
        if name in known_missing:
            print('[WARN] Not found (and whitelisted): ' + name)
        else:
            print('[ERR!] Unexpectedly not found: ' + name)
            raise AssertionError(name)
        return None
    disambig_url = get_disambiguated_url(soup, expect_party)
    if disambig_url is None:
        return url, soup
    url = disambig_url
    path = nice.get(url)
    soup = as_soup(path)
    if is_not_found(soup):
        # This really, really should not happen.
        # Let's hope that female politicians don't have 'Politikerin' as disambiguation.
        print('[ERR!] Confused about: ' + name)
        raise AssertionError(path)
    # This wouldn't even make sense, or at least there is hopefully only one
    # politician for each name.  Note that other parts of this toolchain fail
    # horribly in this case anyway.
    assert get_disambiguated_url(soup, expect_party) is None, name
    return url, soup
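
Example No. 1 leans on scaffolding that the snippet does not show: `nice` (a polite, caching downloader), `as_soup`, `is_not_found`, the `url_override` and `known_missing` tables, and `get_disambiguated_url`. Below is a minimal sketch of what that scaffolding might look like; every name and behaviour here is an assumption, not the project's actual code, and `get_disambiguated_url` is left out because it depends on Wikipedia's disambiguation markup.

# Hypothetical scaffolding for Example No. 1; names and behaviour are assumptions.
import hashlib
import os
import time
import urllib.request

import bs4  # BeautifulSoup 4

CACHE_DIR = 'cache'    # assumed cache location
url_override = {}      # URL -> replacement URL, maintained by hand
known_missing = set()  # names that are expected to have no article


class _Nice:
    """Polite fetcher: caches every URL on disk and rate-limits requests."""

    def get(self, url):
        os.makedirs(CACHE_DIR, exist_ok=True)
        path = os.path.join(CACHE_DIR, hashlib.sha256(url.encode()).hexdigest())
        if not os.path.exists(path):
            time.sleep(1)  # be nice to the server
            with urllib.request.urlopen(url) as resp, open(path, 'wb') as fp:
                fp.write(resp.read())
        return path


nice = _Nice()


def as_soup(path):
    with open(path, 'rb') as fp:
        return bs4.BeautifulSoup(fp.read(), 'html.parser')


def is_not_found(soup):
    # MediaWiki marks missing articles with a 'noarticletext' container.
    return soup.find(class_='noarticletext') is not None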
Example No. 2
def get_page_for(bird):
    url_override = {
        # There's both the Familie and the Art.
        # The Familie has a better photo.
        'Kleiber': 'https://de.wikipedia.org/wiki/Kleiber_%28Familie%29',
    }
    de_name = bird['de_name']
    if de_name in url_override:
        url = url_override[de_name]
    else:
        # Minuses and Umlauts can stay.  Hooray!
        urlish_name = de_name.replace(' ', '_')
        url = 'https://de.wikipedia.org/wiki/' + urlish_name
    path = nice.get(url)
    soup = wikify_each.as_soup(path)
    if wikify_each.is_not_found(soup):
        print('[ERR!] Unexpectedly not found: ' + de_name)
        raise AssertionError(de_name)
    disambig_url = get_disambiguated_url(soup, bird['latin_name'])
    if disambig_url is None:
        return url, soup
    url = disambig_url
    path = nice.get(url)
    soup = wikify_each.as_soup(path)
    if wikify_each.is_not_found(soup):
        # This really, really should not happen.
        # Let's hope the disambiguated bird page is never itself missing.
        print('[ERR!] Confused about: ' + de_name)
        raise AssertionError(path)
    # This wouldn't even make sense.
    assert soup.find('table', id='Vorlage_Begriffsklaerung') is None
    return url, soup
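
For illustration, a driver loop, assuming the bird records come from a JSON file; the file name and the surrounding structure are made up here, and only `de_name` and `latin_name` are fields the function itself reads.

import json


def crawl_birds(json_path='birds.json'):
    # Hypothetical driver; 'birds.json' is an assumed file name.
    with open(json_path) as fp:
        birds = json.load(fp)
    pages = {}
    for bird in birds:
        # get_page_for() returns (url, soup) or raises on an unexpected 404.
        url, soup = get_page_for(bird)
        pages[bird['de_name']] = url
    return pages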
Example No. 3
def get_img_desc(img_desc_url):
    path = nice.get(img_desc_url)
    soup = as_soup(path)
    return {
        'copyright': parse_copyright(soup),
        'license': parse_license(soup),
        'url': parse_img_url(soup),
    }
def checkout(pid, fields):
    img_prefix = os.path.join(DIR_PREFIX, pid)
    dl_path = nice.get(fields['url'])
    freshest_path = dl_path

    # Provide '_raw' for intermediate processing
    raw_dst_path = img_prefix + '_raw.jpg'
    if fields.get('is_compressed'):
        with open(raw_dst_path, 'wb') as raw_fp:
            subprocess.run(['unzip', '-p', dl_path],
                           stdout=raw_fp, stderr=subprocess.PIPE, check=True)
        freshest_path = raw_dst_path
    else:
        # Need '../' to get out of 'preview/'
        os.symlink('../' + dl_path, raw_dst_path)

    # Something about digitally rotated images (Michael Grosse-Brömer, 154)
    # doesn't work as it should.
    inject = []
    if '154' in pid:
        inject = ['-rotate', '-90']

    # Provide ready-to-use image
    convert(freshest_path,
            '-resize', '330x330^',
            '-gravity', 'north',
            '-extent', '330x330',
            *inject,
            '-strip',
            img_prefix + '.jpg')

    if not CHOICE_MODE:
        # Provide thumbnail
        convert(freshest_path,
                '-thumbnail', '75x75^',
                '-gravity', 'north',
                '-extent', '75x75',
                *inject,
                img_prefix + '_t.jpg')
        # TODO: Use '-strip'.
        # Don't do it right now in order to
        # avoid blowing up 'heavy' even more.

    # Retract '_raw'
    os.remove(raw_dst_path)

    entry = {
        'pathToImage': pid + '.jpg',
        'pathToThumb': pid + '_t.jpg',
        'license': fields['license'],
    }
    if 'copyright' in fields:
        entry['copyright'] = fields['copyright']
    return entry
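
Both `checkout` variants (and Example No. 8, via `checkout_hot_poli.convert`) call a `convert` helper that the snippets do not define. Judging by the arguments, it most likely wraps ImageMagick's `convert` command line; a minimal sketch under that assumption:

import subprocess


def convert(*args):
    # Assumed implementation: forward all arguments verbatim to ImageMagick's
    # `convert` binary and fail loudly if it exits non-zero.
    subprocess.run(['convert', *args], check=True)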
def checkout(pid, fields):
    img_prefix = os.path.join(DIR_PREFIX, pid)
    dl_path = nice.get(fields['url'])
    freshest_path = dl_path

    # Provide '_raw' for intermediate processing
    raw_dst_path = img_prefix + '_raw.jpg'
    if fields.get('is_compressed'):
        with open(raw_dst_path, 'wb') as raw_fp:
            subprocess.run(['unzip', '-p', dl_path],
                           stdout=raw_fp,
                           stderr=subprocess.PIPE,
                           check=True)
        freshest_path = raw_dst_path
    else:
        # Need '../' to get out of 'preview/'
        os.symlink('../' + dl_path, raw_dst_path)

    # Something about digitally rotated images (Michael Grosse-Brömer, 154)
    # doesn't work as it should.
    inject = []
    if '154' in pid:
        inject = ['-rotate', '-90']

    # Provide ready-to-use image
    convert(freshest_path, '-resize', '330x330^', '-gravity', 'north',
            '-extent', '330x330', *inject, '-strip', img_prefix + '.jpg')

    if not CHOICE_MODE:
        # Provide thumbnail
        convert(freshest_path, '-thumbnail', '75x75^', '-gravity', 'north',
                '-extent', '75x75', *inject, img_prefix + '_t.jpg')
        # TODO: Use '-strip'.
        # Don't do it right now in order to
        # avoid blowing up 'heavy' even more.

    # Retract '_raw'
    os.remove(raw_dst_path)

    entry = {
        'pathToImage': pid + '.jpg',
        'pathToThumb': pid + '_t.jpg',
        'license': fields['license'],
    }
    if 'copyright' in fields:
        entry['copyright'] = fields['copyright']
    return entry
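
A sketch of how `get_img_desc` (Example No. 3) and `checkout` might fit together; the `img_desc_urls` mapping and the output file name are invented for illustration.

import json


def checkout_all(img_desc_urls):
    # Hypothetical glue code: img_desc_urls maps a politician id to the
    # Wikimedia description page of their photo.
    index = {}
    for pid, desc_url in img_desc_urls.items():
        fields = get_img_desc(desc_url)     # copyright, license, direct image URL
        index[pid] = checkout(pid, fields)  # download, resize, record paths
    with open('image-index.json', 'w') as fp:  # assumed output file name
        json.dump(index, fp, indent=2)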
Example No. 6
def crawl_pages_inplace(data):
    for entry in data:
        assert 'page_file' not in entry
        entry['page_file'] = nice.get(entry['page'])
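
A minimal illustration of the in-place mutation: each entry only needs a `page` URL, and afterwards it additionally carries the local `page_file` path returned by `nice.get`.

data = [{'page': 'https://de.wikipedia.org/wiki/Kleiber'}]  # illustrative entry
crawl_pages_inplace(data)
print(data[0]['page_file'])  # local path of the downloaded page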
Example No. 7
def construct_root(party, url):
    return {'party': party, 'link': url, 'filename': nice.get(url)}
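
Usage might look as follows; the party-to-URL mapping is purely illustrative and not the project's actual list.

PARTY_URLS = {  # hypothetical input
    'SPD': 'https://de.wikipedia.org/wiki/Sozialdemokratische_Partei_Deutschlands',
    'CDU': 'https://de.wikipedia.org/wiki/Christlich_Demokratische_Union_Deutschlands',
}
roots = [construct_root(party, url) for party, url in PARTY_URLS.items()]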
Example No. 8
def checkout(bid, fields, has_drawing: bool):
    GRAVITY_OVERRIDE = {
        'buntspecht': 'north',
        'girlitz': 'north',
        'grauschnaepper': 'north',
        'mehlschwalbe': 'northeast',
        'amsel': 'west',
        'haussperling': 'west',
        'rauchschwalbe': 'west',
        'rotkehlchen': 'east',
        'weisskopfseeadler': 'west',
        'hausrotschwanz': 'east',
        'star': 'east',
        'stieglitz': 'west',
        'tuerkentaube': 'east',
    }
    gravity = 'center'
    if bid in GRAVITY_OVERRIDE:
        gravity = GRAVITY_OVERRIDE[bid]

    pubweb_prefix = os.path.join(checkout_hot_poli.DIR_PREFIX, bid)
    hot_prefix = os.path.join(HOT_DIR_PREFIX, bid)
    dl_path = nice.get(fields['url'])

    # Provide ready-to-use image for pubweb
    checkout_hot_poli.convert(
        dl_path,
        '-resize',
        RESOLUTION_PUBWEB + '^>',
        '-strip',
        # It shouldn't ever be necessary to actually cut down the image vertically.
        # However, the code should still do something reasonable.
        '-gravity',
        gravity,
        '-extent',
        RESOLUTION_PUBWEB + '>',
        pubweb_prefix + '.jpg')

    # Provide ready-to-use images for HoT
    checkout_hot_poli.convert(
        dl_path,
        '-resize',
        RESOLUTION_HOT + '^>',
        '-strip',
        # It shouldn't ever be necessary to actually cut down the image vertically.
        # However, the code should still do something reasonable.
        '-gravity',
        gravity,
        '-extent',
        RESOLUTION_HOT + '>',
        hot_prefix + '.jpg')
    if has_drawing:
        print(
            '[INFO] skipped {}-drawing.jpg: we have an actual drawing!'.format(
                bid))
    else:
        copyfile(hot_prefix + '.jpg', hot_prefix + '-drawing.jpg')

    entry = {
        'filename': bid + '.jpg',
        'license': fields['license'],
    }
    if 'copyright' in fields:
        entry['copyright'] = fields['copyright']

    return entry
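
Example No. 8 reuses the politicians' `convert` wrapper (`checkout_hot_poli.convert`, sketched after the first `checkout` above) and two resolution constants that the snippet does not define. The values below are assumptions, shown only to make the geometry strings concrete; in ImageMagick geometry, `^` treats the dimensions as minimums (fill the box) and `>` applies the operation only when the source image is larger than the target.

RESOLUTION_PUBWEB = '330x330'  # assumed value
RESOLUTION_HOT = '800x800'     # assumed value

# The geometry arguments passed to ImageMagick then read, for example:
#   -resize 330x330^>   fill a 330x330 box, but never enlarge a smaller image
#   -extent 330x330>    crop to 330x330 only if the image is larger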