def capitales_etudiee_oecd():
    """
    Retrieve the capitals of the studied OECD countries via CountryInfo.

    Only countries present both in the module-level ``nom_des_pays`` list
    (the OECD countries) and in CountryInfo's database are kept, so the
    result can later be cross-referenced with OpenWeatherMap data.

    Returns:
        dict: upper-cased country name -> capital city name, e.g.
        {
            ...
            'SRI LANKA': 'Colombo',
            'SUDAN': 'Khartoum',
            ...
        }
    """
    # NOTE(review): the original version also loaded
    # 'app/static/json/city.list.json' into a variable that was never used;
    # that dead file read has been removed.

    # Upper-case both country-name collections so they compare reliably.
    pays_ocde = {pays.upper() for pays in nom_des_pays}
    pays_open_weather = {pays.upper() for pays in CountryInfo().all()}

    # Sort the intersection so the output order is deterministic.
    pays_communs_pour_projet = sorted(pays_ocde & pays_open_weather)

    capitales_oecd = {}
    for pays in pays_communs_pour_projet:
        capitales_oecd[pays] = CountryInfo(pays).capital()

    return capitales_oecd
# Example 2
def geo_lng(dataloader, args):
    """
    Analyze tag languages per country and collect AlexNet features for a
    sample of "tourist" vs "local" images.

    For every (data, target) batch in ``dataloader``:
      - detect the language of each tag (length >= 3) with fastText and keep
        the most common one, preferring a confidently-classified (> .5)
        second language over English, since people often tag in English even
        when it is not their native language;
      - label the image 'local' (tagged in a local language, with neither the
        country name nor 'travel' among the tags), 'tourist' (non-local
        language or 'travel' tag), or leave it unlabeled when unsure;
      - for up to 500 images per country per class, store AlexNet features
        (final classifier layer removed) together with target[3].

    Results are pickled to results/{args.folder}/geo_lng.pkl as a dict with
    keys 'lang_counts', 'country_with_langs', 'country_with_imgs'.

    Args:
        dataloader: yields (data, target); target[0] is a list of tag dicts
            with a 'label' key, target[2][0] is the '+'-separated country
            name, target[3] is an image identifier. data may be None
            (skipped). -- assumed from usage below; confirm against dataset.
        args: namespace with a ``folder`` attribute naming the output dir.
    """
    with open('util_files/country_lang_mappings.pkl', 'rb') as f:
        mappings = pickle.load(f)
    iso3_to_lang = mappings['iso3_to_lang']
    # Country -> ISO3 mappings that pycountry's fuzzy search cannot resolve.
    missing = {'South+Korea': 'KOR',
            'North+Korea': 'PRK',
            'Laos': 'LAO',
            'Caribbean+Netherlands': 'BES',
            'St.+Lucia': 'LCA',
            'East+Timor': 'TLS',
            'Democratic+Republic+of+Congo': 'COD',
            'Swaziland': 'SWZ',
            'Cape+Verde': 'CPV',
            'C%C3%B4te+d%C2%B4Ivoire': 'CIV',
            'Ivory+Coast': 'CIV',
            'Channel+Islands': 'GBR'
            }

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Drop AlexNet's last classifier layer so forward() yields features
    # instead of class scores.
    model = models.alexnet(pretrained=True).to(device)
    new_classifier = nn.Sequential(*list(model.classifier.children())[:-1])
    model.classifier = new_classifier

    country_with_langs = {}
    country_with_imgs = {}  # per country: [tourist images, local images]
    lang_counts = Counter()

    detecter = fasttext.load_model('util_files/lid.176.bin')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    for i, (data, target) in enumerate(tqdm(dataloader)):
        if data is None:
            continue
        # Very short tags are too ambiguous for language identification.
        this_tags = [tag['label'] for tag in target[0] if len(tag['label']) >= 3]
        if len(this_tags) > 0:
            srcz = []
            conf = []
            for tag in this_tags:
                classify = detecter.predict(tag)
                # fastText labels look like '__label__en'; strip the prefix.
                srcz.append(classify[0][0][9:])
                conf.append(classify[1][0])

            # Pick out the most common language
            commons = Counter(srcz).most_common()
            the_src = commons[0][0]
            # If the most common language is English, look at the second most
            # common language since people oftentimes use English even when
            # it's not their native language
            if the_src == 'en' and len(commons) > 1:
                the_src_maybe = commons[1][0]
                words = [i for i in range(len(srcz)) if srcz[i] == the_src_maybe]
                # If this second most common language has been classified with
                # more than .5 probability, then choose this language for the image
                for word in words:
                    if conf[word] > .5:
                        the_src = the_src_maybe
            lang_counts[the_src] += 1

            country = target[2][0]
            iso3 = None
            local = None
            try:
                iso3 = pycountry.countries.search_fuzzy(country.replace('+', ' '))[0].alpha_3
            except LookupError:
                # NOTE(review): raises KeyError if the country is in neither
                # pycountry nor the `missing` table — intentional fail-fast?
                iso3 = missing[country]
            try:
                country_info = CountryInfo(country.replace('+', ' ')).info()
            except KeyError:
                country_info = {}
            # Build the set of name aliases used to detect "country name is a tag".
            country_name = country.split('+')
            # BUG FIX: the original used `+=` with a string operand, which
            # extends the list with single characters, so these aliases could
            # never match a tag (tags are >= 3 chars). Append the whole name.
            if 'name' in country_info:
                country_name.append(country_info['name'])
            if 'nativeName' in country_info:
                country_name.append(country_info['nativeName'])

            # When comparing images to distinguish between tourist and local,
            # we further look into the content of the tags, allowing some
            # images to be categorized as 'unknown' if we are not that sure
            # if it's tourist or local.
            # Local: in a local language, country's name isn't a tag, and
            # 'travel' isn't a tag.
            # Tourist: in a non-local language, or 'travel' is a tag.
            try:
                if the_src in iso3_to_lang[iso3] and len(set(country_name) & set(this_tags)) == 0 and 'travel' not in this_tags:
                    local = 1
                elif the_src not in iso3_to_lang[iso3] or 'travel' in this_tags:
                    local = 0
            except KeyError:
                print("This iso3 can't be found in iso3_to_lang: {}".format(iso3))

            if country not in country_with_langs.keys():
                country_with_langs[country] = []
                country_with_imgs[country] = [[], []]
            country_with_langs[country].append(the_src)
            if local is not None:
                # Cap feature extraction at 500 images per country per class
                # to bound memory use.
                if len(country_with_imgs[country][local]) < 500:
                    data = normalize(data).to(device)
                    big_data = F.interpolate(data.unsqueeze(0), size=224, mode='bilinear').to(device)
                    this_features = model.forward(big_data)
                    country_with_imgs[country][local].append((this_features.data.cpu().numpy(), target[3]))

    info = {}
    info['lang_counts'] = dict(lang_counts)
    info['country_with_langs'] = country_with_langs
    info['country_with_imgs'] = country_with_imgs

    with open("results/{}/geo_lng.pkl".format(args.folder), "wb") as out:
        pickle.dump(info, out)