示例#1
0
def get_wc_jsons(data_path):
    """
    Find all JSONs in event data that are world cup related.

    A file is considered world cup related when the team named in either of
    its first two events is a country name (the English names returned by
    ``countries_for_language('en')`` plus 'England').

    inputs
    ----
    data_path: relative path to where events data is stored

    returns
    -----
    wc_jsons: list of json names (file names only, each listed at most once
        per file found)
    """
    # A set gives constant-time membership tests for every event checked.
    country_names = set(dict(countries_for_language('en')).values())
    country_names.add('England')

    def _load_events(path):
        # Read with the platform default encoding first and retry as UTF-8
        # only when decoding/parsing fails (both surface as ValueError).
        try:
            with open(path) as handle:
                return json.load(handle)
        except ValueError:
            with open(path, encoding='utf-8') as handle:
                return json.load(handle)

    wc_jsons = []
    for root, _dirs, file_names in os.walk(data_path):
        for file_name in file_names:
            # Join with the directory os.walk is currently visiting, so files
            # in sub-directories are opened from where they actually live.
            events = _load_events(os.path.join(root, file_name))
            if any(event['team']['name'] in country_names
                   for event in events[:2]):
                wc_jsons.append(file_name)
    return wc_jsons
                        
示例#2
0
 def select_word(self, array_number):
     """Return a random, lower-cased word when ``array_number`` is 1.

     The English country names are appended to ``self.word_list`` and the
     word is drawn from everything that list then holds. For any other
     ``array_number`` nothing is selected and ``None`` is returned.
     """
     if array_number == 1:
         self.word_list.extend(dict(countries_for_language('en')).values())
         # random.randint includes both end points, so the former
         # randint(0, len(word_list)) could index one past the end of the
         # list; random.choice always picks a valid element.
         return random.choice(self.word_list).lower()
示例#3
0
文件: app.py 项目: brysadler/npiapp
def get_country_df(df):
    """Count, for every English country name, the matching rows of ``df``.

    A row matches when the single-element set holding the lower-cased
    country name is a subset of that row's ``'countries'`` value.

    Returns a DataFrame with the columns ``'country'`` (lower-cased name)
    and ``'count'`` (number of matching rows).
    """
    names = [
        name.lower()
        for name in dict(countries_for_language('en')).values()
    ]
    counts = []
    for name in names:
        wanted = set([name])
        matching_rows = df[df['countries'].map(wanted.issubset)]
        counts.append(len(matching_rows))
    return pd.DataFrame({'country': names, 'count': counts})
示例#4
0
def signup():
    """Render the sign-up page and validate the ``name`` field on POST.

    The template always receives ``countries`` (ISO code -> English name).
    When a POST arrives with an empty ``name``, the page is re-rendered with
    the validation message passed as ``name``.
    """
    countries = dict(countries_for_language('en'))
    if request.method == "POST":
        name = request.form["name"]

        # The message asks the user to fill the field in, so it belongs to
        # the EMPTY case; the check used to be inverted.
        if not name:
            return render_template('signup.html',
                                   countries=countries,
                                   name="Please fill out this field.")
    return render_template('signup.html', countries=countries)
示例#5
0
def translate_countries(lang="en"):
    """Return the lower-cased country names in language ``lang``.

    If the names cannot be produced for ``lang``, the English names are
    returned instead. If English itself fails, the error is re-raised.
    """
    try:
        return [
            country.lower()
            for country in dict(countries_for_language(lang)).values()
        ]
    except Exception:
        # Fall back to English exactly once. Re-raise when English is what
        # failed; retrying it again would recurse without end.
        if lang == "en":
            raise
        return translate_countries(lang="en")
示例#6
0
def submit(uname):
  """Show the submission form and, on POST, store the submitted records.

  Redirects to '/login' when the session holds no 'user'. On GET the
  'submit.html' template is rendered with the occurrence types and the
  English country names as location choices. On POST the form values are
  written to the database and the user is redirected to the dashboard.

  `uname` is not read in the body; the username always comes from the session.
  """
  if not 'user' in session:
    return redirect('/login')
  error = None
  country = dict(countries_for_language('en'))
  # Location choices for the form; only used on the non-POST render below
  # (the POST branch rebinds `location` to the submitted value and returns).
  location = country.values()
  otype = ['preserved specimen', 'human observation', 'machine observation']

  if request.method == 'POST':
    # Taxonomy of the organism
    kingdom = request.form['kingdom']
    phylum = request.form['phylum']
    org_class = request.form['org_class']
    order = request.form['order']
    family = request.form['family']
    genus = request.form['genus']
    species = request.form['species']

    # Sequence data and the reference it was published in
    seq_type = request.form['seq_type']
    bp = request.form['bp']
    seq = request.form['seq']
    acc_no = request.form['acc_no']
    title = request.form['title']
    doi = request.form['doi']
    author = request.form['author']
    journal = request.form['journal']
    volume = request.form['volume']
    issue = request.form['issue']
    journal_date = request.form['journal_date']
    page_from = request.form['page_from']
    page_to = request.form['page_to']
    # Occurrence record data
    time = request.form['time']
    occ_type = request.form['occ_type']
    location = request.form['location']
    latitude = request.form['latitude']
    longitude = request.form['longitude']
    # Flags: True when the corresponding field is present in the form
    sequence = 'sequence' in request.form
    occurrence = 'occurrence' in request.form

    # Insert the organism only if no row with this genus/species exists yet
    cursor = g.conn.execute("SELECT * FROM Organism WHERE genus=%s AND species=%s", genus, species)
    org = []
    for result in cursor:
      org.append(result)
    cursor.close()
    if not org:
      g.conn.execute('INSERT INTO Organism VALUES(%s, %s, %s, %s, %s, %s, %s)', kingdom, phylum, org_class, order, family, genus, species)

    if sequence == True:
      # Reference first, then the sequence that cites it by DOI, then the
      # organism link and the record of who submitted it.
      g.conn.execute('INSERT INTO Reference VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)', title, doi, author, journal, volume, issue, journal_date, page_from, page_to)
      g.conn.execute('INSERT INTO Sequence_Source VALUES(%s, %s, %s, %s, NOW()::date, %s)', seq_type, bp, seq, acc_no, doi)
      g.conn.execute('INSERT INTO Has VALUES(%s, %s, %s)', genus, species, acc_no)
      g.conn.execute('INSERT INTO Submit_Sqn VALUES(%s, %s, %s, %s, NOW()::date)', session['user']['email'], genus, species, acc_no)
    if occurrence == True:
      # Occurrence record plus the record of who submitted it
      g.conn.execute('INSERT INTO Occ_records VALUES(%s, %s, %s, %s, %s, %s, %s)', time, occ_type, location, latitude, longitude, genus, species)
      g.conn.execute('INSERT INTO Submit_Occ VALUES(%s, %s, %s, %s, %s, %s, NOW()::date)', session['user']['email'], time, latitude, longitude, genus, species)
    return redirect(url_for('dashboard', uname=session['user']['username']))
  return render_template('submit.html', error=error, otype = otype, location=location, uname=session['user']['username'])
示例#7
0
    def validate(self, data):
        """
        Reject equipment data whose location is not a known country name,
        then run the parent validation.
        :param data: Equipment data dict
        :return: whatever the parent ``validate`` returns
        :raises serializers.ValidationError: if ``data['location']`` is present
            but is not a country name in ``LANGUAGE_CODE``
        """
        if 'location' in data:
            location = data['location']
            known_countries = dict(countries_for_language(LANGUAGE_CODE)).values()
            if location not in known_countries:
                raise serializers.ValidationError(
                    f'Location "{location}" is invalid. A country is expected')

        return super().validate(data)
示例#8
0
    def __init__(self):
        """Create the geocoder and the list of German country names.

        ``country_names_german`` holds every German country name followed by
        the two extra entries 'EU' and 'Europäische Union'.
        """
        self.geolocator = Nominatim(user_agent="xxx")

        german_names = list(dict(countries_for_language('de')).values())
        german_names.extend(['EU', 'Europäische Union'])
        self.country_names_german = german_names
示例#9
0
def country_name_for_isocode(iso_code, lang=ICELANDIC_LANG_ISOCODE):
    """ Look up a country's name in language `lang` by its ISO 3166-1
        alpha-2 code. Returns None if the language is not available or
        the code is unknown. Both arguments are matched case-insensitively. """
    assert len(iso_code) == 2
    assert len(lang) == 2
    country_code = iso_code.upper()
    lang_code = lang.lower()
    if lang_code in available_languages():
        names_by_code = dict(countries_for_language(lang_code))
        return names_by_code.get(country_code)
    return None
示例#10
0
def locations_worldmap():
    """ Render the world map locations page for the requested period.
        The "period" query argument selects 7 days when it is "week",
        otherwise the default period is used. """
    period = request.args.get("period")
    if period == "week":
        days = 7
    else:
        days = _TOP_LOC_PERIOD

    context = {
        "country_data": world_map_data(days=days),
        "country_names": dict(countries_for_language("is")),
        "period": period,
    }
    return render_template("locations/locations-worldmap.html", **context)
示例#11
0
def locations_worldmap():
    """ Render the world map locations page for the requested period.
        The "period" query argument selects 7 days when it is "week",
        otherwise the default period is used. """
    period = request.args.get("period")
    if period == "week":
        days = 7
    else:
        days = _TOP_LOC_PERIOD

    context = {
        "country_data": world_map_data(days=days),
        "country_names": dict(countries_for_language("is")),
        "period": period,
    }
    return render_template("locations/locations-worldmap.html", **context)
示例#12
0
文件: geo.py 项目: Loknar/Greynir
def isocode_for_country_name(country_name, lang=ICELANDIC_LANG_ISOCODE):
    """ Find the ISO 3166-1 alpha-2 code of the country called
        `country_name` in language `lang` (two-char ISO 639-1).
        The library's country list is searched first, then the
        hand-maintained additions for that language. Returns None
        if the language is unavailable or the name is unknown. """
    assert len(lang) == 2
    lang_code = lang.lower()
    if lang_code in available_languages():
        # The country list is cached by the module providing it
        for iso_code, name in countries_for_language(lang_code):
            if name == country_name:
                return iso_code
        if lang_code in COUNTRY_NAME_TO_ISOCODE_ADDITIONS:
            extra_names = COUNTRY_NAME_TO_ISOCODE_ADDITIONS[lang_code]
            return extra_names.get(country_name)
    return None
示例#13
0
def locations_worldmap():
    """ Render the world map locations page for the period given
        in the "period" query argument. """
    period = request.args.get("period")
    days = days_from_period_arg(period, _TOP_LOC_PERIOD)

    context = {
        "title": "Heimskort",
        "country_data": world_map_data(days=days),
        "country_names": dict(countries_for_language("is")),
        "period": period,
    }
    return render_template("locations/worldmap.html", **context)
示例#14
0
def getcountries():
    """Map each country code to its distinct names in four chosen languages.

    The languages are picked by their position (7, 96, 204 and 263) in the
    sequence produced by ``available_languages()``; the codes are the keys of
    the language at position 96. Each value is a list of the distinct names.
    """
    per_language = [
        dict(countries_for_language(language))
        for language in available_languages()
    ]
    # Positions of the selected languages in available_languages()
    chosen = (7, 96, 204, 263)

    dictionary = {}
    for country in per_language[96]:
        names = [per_language[position][country] for position in chosen]
        dictionary[country] = list(set(names))
    return dictionary
示例#15
0
def isocode_for_country_name(country_name, lang=ICELANDIC_LANG_ISOCODE):
    """ Find the ISO 3166-1 alpha-2 code of the country called
        `country_name` in language `lang` (two-char ISO 639-1).
        Hand-maintained mappings win over the library's country list;
        the list is matched against the name both as given and after
        `capitalize_placename`. Returns None when nothing matches. """
    assert len(lang) == 2
    lang_code = lang.lower()
    if lang_code not in available_languages():
        return None

    # Hardcoded mappings take precedence
    overrides = COUNTRY_NAME_TO_ISOCODE_ADDITIONS
    if lang_code in overrides and country_name in overrides[lang_code]:
        return overrides[lang_code][country_name]

    countries = countries_for_language(lang_code)  # Cached by the module
    capitalized = capitalize_placename(country_name)
    for iso_code, name in countries:
        matches_as_given = name == country_name
        if matches_as_given or name == capitalized:
            return iso_code
    return None
示例#16
0
def signup():
    """Sign Up page route.

    GET renders the empty form. POST validates the submitted fields (none
    empty, matching emails, matching passwords), then stores a new ``User``.
    Every outcome re-renders 'signup.html' with ``countries`` plus one of
    ``feedback`` / ``dont_match`` / ``success`` describing what happened.
    """
    countries = dict(countries_for_language('en'))
    if request.method != "POST":
        return render_template('signup.html', countries=countries)

    req = request.form
    missing = [field for field, value in req.items() if value == ""]
    if missing:
        feedback = " {}".format(', '.join(missing))
        return render_template('signup.html',
                               countries=countries,
                               feedback=feedback)
    if req["Email"] != req["Confirm email"]:
        return render_template('signup.html',
                               countries=countries,
                               dont_match="Emails don't match")
    if req["Password"] != req["Confirm password"]:
        return render_template('signup.html',
                               countries=countries,
                               dont_match="Passwords don't match")

    # Add the new user to the database through SQLAlchemy.
    # NOTE(review): the password is handed to User exactly as submitted;
    # confirm the model hashes it before it is persisted.
    new_user = User(first_name=req["First name"],
                    last_name=req["Last name"],
                    email=req["Email"],
                    password=req["Password"],
                    country=req["countries"])
    db.session.add(new_user)
    try:
        db.session.commit()
    except Exception as e:
        print(e)
        db.session.rollback()
        return render_template("signup.html",
                               countries=countries,
                               feedback="email already exists")
    finally:
        # Only clean up here. A `return` inside `finally` replaces the
        # return value of the `except` branch, which made every failed
        # commit report a successful registration.
        db.session.close()
    return render_template("signup.html",
                           countries=countries,
                           success="Successful Registration")
示例#17
0
def getRelationships(request, departure_country):
    """
    Send back the relationships registered in the db for a departure country,
    together with their status.
    :param request: the incoming HTTP request; only GET is served
    :param departure_country: English name of the departure country
    :return: JSON response with the matching relationships, or a 404 response
        for non-GET requests and for country names that are not known
    """
    if request.method != "GET":
        return HttpResponseNotFound('<h1>Page not found</h1>')

    # Reverse lookup name -> ISO code. An unknown name used to raise an
    # uncaught ValueError from list.index(); answer with a 404 instead.
    country_iso = None
    for iso_code, name in dict(countries_for_language('en')).items():
        if name == departure_country:
            country_iso = iso_code
            break
    if country_iso is None:
        return HttpResponseNotFound('<h1>Page not found</h1>')

    query = Relationship.objects.filter(
        departure_country__startswith=country_iso)

    return HttpResponse(serializers.serialize("json", query),
                        content_type='application/json')
示例#18
0
class CreateNewList(forms.ModelForm):
    """Model form for a ``Person``: name, age, gender, preference and nationality.

    All widgets carry the 'form-control' CSS class.
    """
    person_name = forms.CharField(widget=forms.TextInput(
        attrs={
            'class': 'form-control',
            'placeholder': 'Person Name *'
        }))
    person_age = forms.IntegerField(widget=forms.NumberInput(
        attrs={
            'class': 'form-control',
            'placeholder': 'Enter Age *'
        }))
    # (value, label) choices for the `preference` select
    class_preference = (
        ("No Preference*", "No Preference*"),
        ("Lower", "Lower"),
        ("Middle", "Middle"),
        ("Upper", "Upper"),
        ("Side Lower", "Side Lower"),
        ("Side Upper", "Side Upper"),
    )
    # (value, label) choices for the `gender` select
    class_gender = (
        ("Male", "Male"),
        ("Female", "Female"),
        ("Transgender", "Transgender"),
    )
    # Choices for the `nationality` select, evaluated once at class creation.
    # Presumably (ISO code, English name) pairs; verify against country_list.
    countries = countries_for_language('en')

    gender = forms.CharField(widget=forms.Select(
        choices=class_gender, attrs={'class': 'form-control'}))
    preference = forms.CharField(widget=forms.Select(
        choices=class_preference, attrs={'class': 'form-control'}))
    # Pre-selects the choice whose value is 'IN'
    nationality = forms.CharField(widget=forms.Select(
        choices=countries, attrs={'class': 'form-control'}),
                                  initial='IN')

    class Meta:
        model = Person
        fields = [
            "person_name",
            "person_age",
            "gender",
            "preference",
            "nationality",
        ]
示例#19
0
def getRelations(request, departure_country):
    """
    Lighter variant of getRelationships(): groups the arrival countries of
    all relationships leaving ``departure_country`` by their status (1-4).
    :param request: the incoming HTTP request (not inspected)
    :param departure_country: English name of the departure country
    :return: JsonResponse mapping each status 1..4 to a dict of arrival countries
    """
    names_by_code = dict(countries_for_language('en'))
    iso_codes = list(names_by_code.keys())
    names = list(names_by_code.values())
    # Reverse lookup: the code sits at the same position as its name
    country_iso = iso_codes[names.index(departure_country)]

    relations = Relationship.objects.filter(
        departure_country__startswith=country_iso)

    # One (initially empty) bucket per status value
    resp = {status: {} for status in range(1, 5)}
    for relation in relations:
        bucket = resp[int(relation.status)]
        bucket[relation.arrival_country] = relation.arrival_country

    return JsonResponse(resp)
示例#20
0
class Relationship(models.Model):
    """Travel status between a departure country and an arrival country.

    Countries are stored by the keys of ``COUNTRY_DICTIONARY``; the special
    key '*' stands for "all countries".
    """
    # Code -> English name, plus the '*' wildcard entry
    COUNTRY_DICTIONARY = dict(countries_for_language('en'))
    COUNTRY_DICTIONARY['*'] = 'all'
    COUNTRIES = [('*', 'all')
                 ]  # to signal the same status for all the countries
    # NOTE(review): '*' is already in COUNTRY_DICTIONARY at this point, so
    # the loop appends ('*', 'all') a second time at the end of the choices;
    # removing the duplicate would change the field's `choices` (migration).
    for key in list(COUNTRY_DICTIONARY.keys()):
        COUNTRIES += [(key, COUNTRY_DICTIONARY[key])]
    COUNTRIES = tuple(COUNTRIES)
    # Stored value -> human-readable status
    OPENNESS = (
        ('1', 'open'),
        ('2', 'open with restrictions'),
        ('3', 'closed'),
        ('4', 'unknown'),
    )
    departure_country = models.CharField(max_length=30, choices=COUNTRIES)
    arrival_country = models.CharField(max_length=30, choices=COUNTRIES)
    status = models.CharField(max_length=1, choices=OPENNESS, default='4')
    info = models.CharField(max_length=160, default="")

    def __str__(self):
        # Shows the country names rather than the stored codes
        return f"from {self.COUNTRY_DICTIONARY[self.departure_country]} to {self.COUNTRY_DICTIONARY[self.arrival_country]}"
示例#21
0
def translate_countries(countries, filename):
    """Turn German country names into country patterns.

    Each name in ``countries`` is mapped to its code via the German country
    list, falling back to ``parse_mistyped_countries``. The distinct results
    are then turned into patterns: the known pattern for that alpha-2 code,
    or ``country_pattern(value)`` when there is none. A warning naming
    ``filename`` is logged for every pattern without an ``'alpha2'`` value.

    Returns the list of patterns (one per distinct translated country).
    """
    code_by_german_name = {}
    for code, name in countries_for_language('de'):
        code_by_german_name[name] = code

    pattern_by_alpha2 = {}
    for al2, country in countries_by_alpha2.items():
        pattern_by_alpha2[al2] = country_pattern(country.name, al2,
                                                 country.alpha3,
                                                 country.numeric)

    # A set: every translated country is processed once
    translated = set()
    for country in countries:
        fallback = parse_mistyped_countries(country, code_by_german_name)
        translated.add(code_by_german_name.get(country, fallback))

    parsed_countries = []
    for country in translated:
        default_pattern = country_pattern(country)
        parsed_countries.append(
            pattern_by_alpha2.get(country, default_pattern))

    for parsed in parsed_countries:
        if not parsed['alpha2']:
            logger.warning(f'unrecognized country in report '
                           f'{filename}: {parsed["name"]}')
    return parsed_countries
示例#22
0
    def getCountry(self):
        """Return the country code for this object, deriving it on first use.

        The text after the first comma of ``self.location`` is interpreted as
        follows: 'UK' -> 'GB', 'NZ' -> 'NZ', any other two-character value ->
        'US', otherwise it is looked up among the English country names. The
        result is stored in ``self.country``, which stays ``None`` when
        nothing matches.
        """
        if self.country is not None:
            return self.country

        try:
            location = self.location.split(',')[1].strip(' ')
        except (AttributeError, IndexError, TypeError):
            # No usable location: missing, not a string, or without a comma
            location = ''

        if location == 'UK':
            self.country = 'GB'
        elif location == 'NZ':
            self.country = 'NZ'
        elif len(location) == 2:
            # presumably a US state abbreviation; verify against the data
            self.country = 'US'
        else:
            # Build the country table only when a lookup is actually needed
            for cCode, country in dict(countries_for_language('en')).items():
                if country == location:
                    self.country = cCode

        return self.country
示例#23
0
def publishers_to_csv(publist):
    '''
    Write the list of publishers to data/publisher-dim.csv.

    Each publisher gets a running id (starting at 1), its name, a randomly
    chosen English country name and a randomly chosen region. Commas in the
    publisher and country names are replaced by "-".
    '''
    country_names = list(dict(countries_for_language('en')).values())
    regions = [
        'Africa', 'Asia', 'The Caribbean', 'Central America', 'Europe',
        'North America', 'Oceania', 'South America'
    ]
    # The context manager closes (and flushes) the file; newline='' is what
    # the csv module requires to avoid extra blank lines on some platforms.
    with open('data/publisher-dim.csv', 'w', newline='') as csv_file:
        filewriter = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['id', 'name', 'country', 'region'])
        for index, element in enumerate(publist, start=1):
            randcountry = random.choice(country_names)
            filewriter.writerow([
                index,
                element.replace(",", "-"),
                randcountry.replace(",", "-"),
                random.choice(regions)
            ])
示例#24
0
from PyQt5 import QtCore, QtGui, QtWidgets, Qt
from PyQt5.QtCore import QStringListModel, QDate
from PyQt5.QtWidgets import QCompleter, QLineEdit
from country_list import countries_for_language
from array import *
import numpy as np
import csv

#####################################################################################################################
#                                                                                                                   #
#   The following code are global functions and data/variables used by Clinical Data form completer components      #
#                                                                                                                   #
#####################################################################################################################

# Load the list of countries: `countries` maps code -> English name and
# `data` is the plain list of those names, in the same order.
countries = dict(countries_for_language('en'))
data = []
for i, v in enumerate(countries):
    # v is the country code (dict key); i is the running index and is not used
    data.append(countries[v])

# reading the csv files containing the available diseases
# Each file is read completely as a list of rows; pop(0) then drops the
# first row (presumably the header line; verify against the CSV files).
with open('src/data/ICD10_Topography.csv', 'r') as f:
    reader = csv.reader(f)
    icd = list(reader)
    icd.pop(0)
with open('src/data/ICD10_Topography_C.csv', 'r') as f:
    reader = csv.reader(f)
    icdc = list(reader)
    icdc.pop(0)
with open('src/data/ICD10_Morphology.csv', 'r') as f:
    reader = csv.reader(f)
示例#25
0
from django.db import models
from django.contrib.auth.models import User

from country_list import countries_for_language

# Countries names API
init_country_list = dict(countries_for_language("en"))


def countries_names(countries, all_countries_names=None):
    """
    Build the tuple of tuples used as a model field's `choices`.

    `countries` is an iterable of (code, name) pairs; every pair becomes
    ("<name>-<code>", name). `all_countries_names` is accepted but ignored.
    """
    return tuple((f"{value}-{key}", value) for key, value in countries)


# Values for model field `choices` attributes, as (stored value, label) pairs.
# NOTE(review): the stored value "billig" looks like a typo for "billing";
# changing it alters what is saved in the database, so confirm before fixing.
ADDRESS_TYPE = (("billig", "Billing"), ("shipping", "Shipping"))
# One ("<name>-<code>", name) pair per country in init_country_list
COUNTRY_LIST = countries_names(init_country_list.items())


class Billing(models.Model):
    """Customer billing information class"""
示例#26
0
    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


w = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Keep only the country names the embedding contains. The labels passed to
# the dendrogram must line up one-to-one with the clustered vectors, so both
# lists are built from the same filtered names.
countries = [
    country
    for country in dict(countries_for_language('en')).values()
    if country in w
]
countries_vec = [w[country] for country in countries]

model = AgglomerativeClustering(n_clusters=5, linkage='ward')
model.fit(countries_vec)

fig = plt.figure()
ax = fig.add_subplot(111)

# Plot the dendrogram with one leaf label per clustered country
plot_dendrogram(model, labels=countries)

plt.show()
示例#27
0
def get_detail(cat, tmdb_id):
    """Fetch one TMDB record and flatten it into a dict of display values.

    `cat` is the TMDB category; the body branches on 'movie', 'tv' and
    'person'. The record is requested from api.themoviedb.org with
    language zh-CN. For 'tv', a rating is additionally requested from
    api.trakt.tv. Entries that do not apply to `cat` are returned as ''.

    NOTE(review): `genres` is only assigned for 'movie'/'tv', while the
    'genres' entry reads it for every `cat` other than 'person'; any other
    category value would raise NameError.
    """
    request_url = 'https://api.themoviedb.org/3/{}/{}?append_to_response=credits,alternative_titles,external_ids,combined_credits&api_key={}&include_image_language=en,null&language=zh-CN'.format(
        cat, tmdb_id, tmdb_key)
    res = requests.get(request_url).json()
    # From here on tmdb_id is the id reported by the response
    tmdb_id = res.get('id')
    imdb_id = res.get('external_ids', {}).get('imdb_id', '')
    if cat == 'person':
        zh_name = get_zh_name(tmdb_id)
    else:
        zh_name = res.get('title', res.get('name', ''))
    name = res.get('original_title') or res.get('original_name') or res.get(
        'name')
    # Defaults for values that are only filled in for some categories
    cast = []
    season_info = []
    trakt_rating = '0.0'
    yt_key = ''
    date = ''
    imdb_rating = ''
    if cat == 'movie' or cat == 'tv':
        date = res.get('release_date') or res.get('first_air_date') or ''
        # '#'-prefixed genre tags, translated through genres_dic when possible
        genres = [
            '#' + (genres_dic.get(i.get('name')) or i.get('name'))
            for i in res.get('genres', [])
        ]
        # First five cast members, by get_zh_name() with the TMDB name as fallback
        cast = [
            get_zh_name(item.get('id')) or item.get('name')
            for item in res.get('credits', {}).get('cast', [])[:5]
        ]
        if cat == 'movie':
            imdb_rating = get_imdb_rating(imdb_id) if cat == 'movie' else ''
        if cat == 'tv':
            trakt_headers = {'trakt-api-key': trakt_key}
            # First three characters of the rating; '0.0' when there is no imdb id
            trakt_rating = str(
                requests.get(
                    'https://api.trakt.tv/shows/{}/ratings'.format(imdb_id),
                    headers=trakt_headers).json()
                ['rating'])[:3] if imdb_id else '0.0'
            # One line per season, skipping season number 0
            season_info = [
                '第{}季 - 共{}集'.format(item.get('season_number'),
                                     item.get('episode_count'))
                for item in res.get('seasons', [])
                if not item.get('season_number') == 0
            ]
    birthday = res.get('birthday', '')
    deathday = res.get('deathday', '')
    a_works = []
    d_works = []
    if cat == 'person':
        # Cast credits, sorted descending by get_year; the first ten are
        # listed, skipping those for which get_year is falsy
        a_credits = res.get('combined_credits', {}).get('cast', [])
        a_credits.sort(reverse=True, key=get_year)
        a_works = [
            '{} - {}'.format(get_year(item), item.get('name',
                                                      item.get('title')))
            for item in a_credits[:10] if get_year(item)
        ]
        # Same listing for crew credits whose job is 'Director'
        d_credits = res.get('combined_credits', {}).get('crew', [])
        d_credits.sort(reverse=True, key=get_year)
        d_credits_fixed = [
            item for item in d_credits if item.get('job') == 'Director'
        ]
        d_credits_fixed.sort(reverse=True, key=get_year)
        d_works = [
            '{} - {}'.format(get_year(item), item.get('name',
                                                      item.get('title')))
            for item in d_credits_fixed[:10] if get_year(item)
        ]
    # Flat result; keys are part of the interface (including 'lenth', sic)
    dic = {
        'poster':
        '' if cat == 'person' else res.get('poster_path'),
        'profile':
        '' if not cat == 'person' else res.get('profile_path'),
        'zh_name':
        zh_name,
        'name':
        name,
        'year':
        '' if cat == 'person' else date[:4],
        'des':
        res.get('overview', ''),
        'trailer':
        '' if cat == 'person' else get_trailer(cat, tmdb_id),
        # First crew member whose job is 'Director'
        'director':
        '' if cat == 'person' else get_zh_name(
            next((item for item in res.get('credits', {}).get('crew', [])
                  if item.get('job') == 'Director'), {}).get('id', '')),
        'genres':
        '' if cat == 'person' else ' '.join(genres[:2]),
        # zh_CN name of the first production country, '' if there is none
        'country':
        dict(countries_for_language('zh_CN')).get(
            next((item for item in res.get('production_countries', [])),
                 {}).get('iso_3166_1'), '') if not cat == 'person' else '',
        'lang':
        ''
        if cat == 'person' else langcode.get(res.get('original_language'), ''),
        'date':
        date,
        # 'runtime', else the first entry of 'episode_run_time', else ''
        'lenth':
        res.get('runtime', '') or next(
            (i for i in res.get('episode_run_time', [])), ''),
        # First entry of 'created_by' (tv only)
        'creator':
        '' if not cat == 'tv' else get_zh_name(
            next((item
                  for item in res.get('created_by', [])), {}).get('id', '')),
        'cast':
        '' if cat == 'person' else '\n         '.join(cast),
        # Tag built from the first character of the rating, then the rating
        'imdb_rating':
        '' if not imdb_rating else '#IMDB_{} {}'.format(
            imdb_rating[:1], imdb_rating),
        'trakt_rating':
        '' if trakt_rating == '0.0' else '#Trakt_' + trakt_rating[:1] + ' ' +
        trakt_rating,
        # Name of the first network, spaces replaced by underscores (tv only)
        'network':
        '' if not cat == 'tv' else re.sub(
            ' ', '_',
            next((i for i in res.get('networks', [])), {}).get('name', '')),
        'status':
        status_dic.get(res.get('status'), ''),
        'season_info':
        '' if not cat == 'tv' else '\n'.join(season_info),
        'birthday':
        birthday,
        'deathday':
        deathday,
        'age':
        get_age(birthday, deathday) if birthday else '',
        'a_works':
        '' if not cat == 'person' else '\n'.join(a_works),
        'd_works':
        '' if not cat == 'person' else '\n'.join(d_works),
    }
    return dic
示例#28
0
from country_list import countries_for_language

countries = dict(countries_for_language("en"))


def countrylist():
    """Return one "<code> - <name>" string per entry of `countries`."""
    return [f"{code} - {name}" for code, name in countries.items()]


def countryconvert(ISO):
    """Return the country name stored in `countries` for the code `ISO`.

    Raises KeyError when the code is not in `countries`.
    """
    country_name = countries[ISO]
    return country_name
""" This file takes a .json file of strings comprised of addresses and/or locations and outputs
a list of dicts of {address:A, date_iso:B, ranking:C}. """

import json

from tqdm import tqdm
import datefinder
from country_list import countries_for_language
import pandas as pd

import config as args

# Time-of-day suffix (midnight, "Z" zone designator)
TZ_FORMAT = 'T00:00:00Z'
# Country code -> country name, in English and in German
EN_COUNTRY_DICT = dict(countries_for_language('en'))
DE_COUNTRY_DICT = dict(countries_for_language('de'))
# City location tables, loaded once at import time; the paths are relative
# to the current working directory.
EN_CITY_PANDAS = pd.read_csv('../data_json/GeoLite2-City-Locations-en.csv')
DE_CITY_PANDAS = pd.read_csv('../data_json/GeoLite2-City-Locations-de.csv')


def string_parser(unparsed_str):
    """ takes an unparsed string and returns a dict of the wanted format """
    date_iso = ''
    address = ''
    address_ranking = 0
    date_ranking = 0

    for datefinder_output in datefinder.find_dates(unparsed_str, index=True):
        # there should only be one couple but the output of datefinder is a generator
        # so using a for is mostly for convenience
        date, date_idx = datefinder_output
        date_iso = str(date.date())
示例#30
0
def main():
    """Merge the LSPQ metadata table with the Nextstrain metadata table.

    Reads both .tsv files named on the command line, renames the LSPQ
    columns to the Nextstrain names, translates the (French) exposure
    countries to English, adds the constant Quebec columns, fills the
    'rss'/'rta' columns of the Nextstrain table and writes the concatenation
    to ``--output``. With ``--subsample N`` a second file
    ``sampled_<output name>`` is written next to the output; it holds the
    LSPQ rows, N random Nextstrain rows and every strain listed in
    ../config/include.txt.
    """
    # A single parser: the description and the defaults-showing formatter
    # used to be thrown away by a second ArgumentParser() call.
    parser = argparse.ArgumentParser(
        description="concat_meta_tsv",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--inspq_meta", default='data/sgil_extract.tsv', help="The LNSPQ .tsv")
    parser.add_argument("--nextstrain_metadata", help="The .tsv that comes by "
                                                      "with nextstrain")
    parser.add_argument("--fasta_dir", default=None, help="The modified .tsv")
    parser.add_argument("--output", default='data/merged_metadata.tsv', help="The modified .tsv")
    parser.add_argument("--subsample", '-s', type=int,
                        default=None, help="A metadata file with N sample and all QC ones")

    args = parser.parse_args()
    in_path = args.nextstrain_metadata
    out_path = args.output
    subsample = args.subsample
    lnspq_path = args.inspq_meta
    fasta_dir = args.fasta_dir

    gsaid_df = pd.read_csv(in_path, sep='\t')
    lnspq_df = pd.read_csv(lnspq_path, sep='\t')

    # Column names: French (LSPQ) -> English (Nextstrain)
    traduc = {'NO_LSPQ': 'strain',
              'AGE': 'age',
              'SEX': 'sex',
              'RSS_PATIENT': 'rss',
              'VOYAGE_PAYS_1': 'country_exposure',
              'DATE_PRELEV': 'date',
              'DATE_RECU': 'date_submitted',
              'CH': 'originating_lab',
              'POSTAL_CODE': 'rta'}
    lnspq_df.rename(columns=traduc, inplace=True)

    # Normalise the exposure countries the same way as the French reference
    # names below (title case, accents stripped) so they can be matched.
    lnspq_df['country_exposure'] = [unidecode.unidecode(p.title()) for p in lnspq_df['country_exposure']]
    pays_qc = {k: unidecode.unidecode(v) for k, v in countries_for_language('fr_CA')}

    pays_anglo = dict(countries_for_language('en'))
    # Fix non standard names
    pays_anglo['US'] = 'USA'
    pays_anglo['HK'] = 'Hong Kong'
    pays_anglo['CZ'] = 'Czech Republic'
    pays_anglo['CD'] = 'Democratic Republic of the Congo'

    # French name -> English name, joined on the country code
    trans = {fr_name: pays_anglo[code] for code, fr_name in pays_qc.items()}
    trans['Aucun_Voyage'] = '?'
    # Assign the result back: an in-place replace on a selected column is
    # chained assignment and is not guaranteed to reach the DataFrame.
    lnspq_df['country_exposure'] = lnspq_df['country_exposure'].replace(trans)
    lnspq_df['rta_exposure'] = lnspq_df['country_exposure']

    # Without --fasta_dir there is nothing to scan (previously the pattern
    # "None/*fasta" was globbed).
    fastas = glob.glob("{}/*fasta".format(fasta_dir)) if fasta_dir else []
    fasta_id_len = count_fasta_len(fastas)

    for fid, l in fasta_id_len.items():
        lnspq_df.loc[lnspq_df['strain'] == fid, 'lenth'] = l

    # Constant columns for every LSPQ row
    lnspq_df['virus'] = 'ncov'
    lnspq_df['title'] = 'CoVSeQ - Covid Sequencing Quebec'
    lnspq_df['country'] = 'Quebec'
    lnspq_df['location'] = 'Quebec'
    lnspq_df['division'] = 'Quebec'
    lnspq_df['region'] = 'North America'
    lnspq_df['submitting_lab'] = 'LSPQ'

    if fasta_dir:
        lnspq_df['url'] = 'http://www.covseq.ca/data/{}'.format(os.path.basename(fasta_dir.strip('/')))
    else:
        lnspq_df['url'] = ''

    # add rta and rss entry to world

    # still need to fix Iles 'Turques-Caiques' and 'Iles Vierges (E-U)',

    neighbourg = ['New York', 'Ontario', 'Vermont', 'New Hampshire',
                  "Massachusetts", 'Maine', 'New Brunswick', 'Grand Princess']

    gsaid_df.loc[gsaid_df['region'] != 'North America', 'rss'] = gsaid_df['country']
    gsaid_df.loc[gsaid_df['region'] != 'North America', 'rta'] = gsaid_df['country']
    gsaid_df.loc[gsaid_df['region'] == 'North America', 'rss'] = gsaid_df['country']

    gsaid_df.loc[gsaid_df['region'] == 'North America', 'rta'] = gsaid_df['country']
    # Neighbouring divisions are reported by division rather than by country
    gsaid_df.loc[gsaid_df['division'].isin(neighbourg), 'rss'] = gsaid_df['division']

    gsaid_df['rta_exposure'] = gsaid_df['country_exposure']

    pd.concat([lnspq_df, gsaid_df], sort=False).to_csv(out_path, sep='\t', index=False)

    if subsample:
        name = os.path.basename(out_path)
        path = os.path.dirname(out_path)
        # os.path.join keeps the file relative when out_path has no
        # directory part ('{}/sampled_{}' produced '/sampled_<name>').
        s_path = os.path.join(path, 'sampled_{}'.format(name))
        print('subsample with {} point in {}'.format(subsample, s_path))
        # Never ask for more rows than exist (random.sample would raise)
        sample_size = min(subsample, len(gsaid_df))
        s_df = gsaid_df.iloc[random.sample(range(len(gsaid_df)), sample_size)]

        # make sure root virus is in data
        # `x in series` tests the index labels, not the values, so compare
        # against the set of sampled strain names instead.
        sampled_strains = set(s_df['strain'])
        with open('../config/include.txt') as include_file:
            required_strains = include_file.read().splitlines()
        extra = [
            gsaid_df.loc[gsaid_df['strain'] == s]
            for s in required_strains
            if s not in sampled_strains
        ]

        pd.concat([lnspq_df, s_df] + extra).to_csv(s_path, sep='\t', index=False)
# Turn the code table into a plain dict: 'Code' column -> 'Country' column
countrycodes = dict(zip(countrycodes['Code'], countrycodes['Country']))
# Presumably Namibia's code "NA" was read as a missing value, hence the
# explicit entry and the removal of the NaN key; verify against the loader.
countrycodes['NA'] = 'Namibia'
del countrycodes[np.nan]
countrycodes = {k.lower(): v.lower() for k, v in countrycodes.items()}

# Start the output file with just the header row
pd.DataFrame(columns=['date', 'country', 'count']).to_csv('tweetcounts.csv',
                                                          index=False)
S3 = boto3.resource('s3')
BUCKET = 'coronavirus-analysis'
conn = S3.Bucket(BUCKET)
# Keys of all objects in the bucket under the "TweetPickles/" prefix
fns = [
    object_summary.key
    for object_summary in conn.objects.filter(Prefix="TweetPickles/")
]

# Lower-cased English country names, used for matching against text
imported_countries = dict(countries_for_language('en'))
countries = [x.lower() for x in list(imported_countries.values())]


def countrycheck(row):
    if not any(country in row for country in countries):
        return row
    else:
        for country in countries:
            if country in row:
                idx = row.find(country)
                nextchar = row[idx:idx + len(country) + 1]
                if len(row) > idx + len(country) + 1:
                    continue
                else:
                    return country