def get_wc_jsons(data_path):
    """Find all JSONs in event data that are world cup related.

    A file qualifies when one of its first two events belongs to a team
    whose name is a country name (plus 'England', which fields a national
    team but is not in the ISO country list).

    inputs
    ----
    data_path: relative path to where events data is stored

    returns
    -----
    wc_jsons: list of json names
    """
    countries = dict(countries_for_language('en'))
    # Set for O(1) membership tests.
    country_names = set(countries.values()) | {'England'}
    wc_jsons = []
    for root, _dirs, files in os.walk(data_path):
        for fname in files:
            # BUG FIX: join against the directory actually being walked —
            # the original joined against data_path, which crashed for any
            # nested directory.  Opening as UTF-8 directly also removes the
            # duplicated retry-in-bare-except branch.
            with open(os.path.join(root, fname), encoding='utf-8') as fh:
                event_json = json.load(fh)
            for event in event_json[:2]:
                if event['team']['name'] in country_names:
                    wc_jsons.append(fname)
    return wc_jsons
def select_word(self, array_number):
    """Pick a random word from self.word_list and return it lower-cased.

    When array_number == 1 the English country names are appended to
    self.word_list before picking.

    :param array_number: 1 to seed the word list with country names
    :return: a randomly chosen word, lower-cased
    """
    if array_number == 1:
        countries = dict(countries_for_language('en'))
        self.word_list.extend(countries.values())
    # BUG FIX: the original used random.randint(0, len(self.word_list)),
    # whose upper bound is inclusive, so it could index one past the end
    # and raise IndexError.  random.choice avoids the off-by-one.
    return random.choice(self.word_list).lower()
def get_country_df(df):
    """Count, per country, how many rows of *df* mention that country.

    A row counts for a country when the lower-cased name is a subset of
    the row's 'countries' collection.

    :param df: DataFrame with a 'countries' column of set-like values
    :return: DataFrame with 'country' and 'count' columns
    """
    names = []
    totals = []
    for raw_name in dict(countries_for_language('en')).values():
        lowered = raw_name.lower()
        mask = df['countries'].map({lowered}.issubset)
        names.append(lowered)
        totals.append(len(df[mask]))
    return pd.DataFrame({'country': names, 'count': totals})
def signup():
    """Sign-up page: render the form on GET, validate the name on POST."""
    countries = dict(countries_for_language('en'))
    if request.method == "POST":
        name = request.form["name"]
        # BUG FIX: the original tested `if name:`, showing the
        # "please fill out" message only when the field WAS filled in.
        if not name:
            return render_template('signup.html',
                                   name="Please fill out this field.")
    return render_template('signup.html', countries=countries)
def translate_countries(lang="en"):
    """Return the lower-cased country names for *lang*.

    Falls back to English when *lang* fails, but re-raises for English
    itself: the original recursed unconditionally, so a failing English
    lookup recursed without bound (RecursionError).

    :param lang: two-letter language code
    :return: list of lower-cased country names
    """
    try:
        return [
            country.lower()
            for country in dict(countries_for_language(lang)).values()
        ]
    except Exception:
        if lang == "en":
            raise
        return translate_countries(lang="en")
def submit(uname):
    """Handle the specimen-submission form.

    GET renders the empty form; POST inserts the organism (if new) plus,
    depending on the 'sequence'/'occurrence' checkboxes, the sequence
    reference and/or occurrence record, then redirects to the dashboard.

    :param uname: username from the URL.  NOTE(review): unused — the
        session's username is used everywhere instead; confirm intent.
    """
    # Require an authenticated session.
    if not 'user' in session:
        return redirect('/login')
    error = None
    # Country list for the location <select> on the form (GET path).
    country = dict(countries_for_language('en'))
    location = country.values()
    otype = ['preserved specimen', 'human observation', 'machine observation']
    if request.method == 'POST':
        # Taxonomy fields
        kingdom = request.form['kingdom']
        phylum = request.form['phylum']
        org_class = request.form['org_class']
        order = request.form['order']
        family = request.form['family']
        genus = request.form['genus']
        species = request.form['species']
        # Sequence fields
        seq_type = request.form['seq_type']
        bp = request.form['bp']
        seq = request.form['seq']
        acc_no = request.form['acc_no']
        # Reference / publication fields
        title = request.form['title']
        doi = request.form['doi']
        author = request.form['author']
        journal = request.form['journal']
        volume = request.form['volume']
        issue = request.form['issue']
        journal_date = request.form['journal_date']
        page_from = request.form['page_from']
        page_to = request.form['page_to']
        # Occurrence fields.  Note: rebinds `location` from the country
        # list to the submitted form value.
        time = request.form['time']
        occ_type = request.form['occ_type']
        location = request.form['location']
        latitude = request.form['latitude']
        longitude = request.form['longitude']
        # Checkboxes are present in the form only when ticked.
        sequence = 'sequence' in request.form
        occurrence = 'occurrence' in request.form
        # Insert the organism only if it is not already present.
        cursor = g.conn.execute("SELECT * FROM Organism WHERE genus=%s AND species=%s", genus, species)
        org = []
        for result in cursor:
            org.append(result)
        cursor.close()
        if not org:
            g.conn.execute('INSERT INTO Organism VALUES(%s, %s, %s, %s, %s, %s, %s)', kingdom, phylum, org_class, order, family, genus, species)
        if sequence == True:
            # Reference first, then the sequence itself and its link rows.
            g.conn.execute('INSERT INTO Reference VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)', title, doi, author, journal, volume, issue, journal_date, page_from, page_to)
            g.conn.execute('INSERT INTO Sequence_Source VALUES(%s, %s, %s, %s, NOW()::date, %s)', seq_type, bp, seq, acc_no, doi)
            g.conn.execute('INSERT INTO Has VALUES(%s, %s, %s)', genus, species, acc_no)
            g.conn.execute('INSERT INTO Submit_Sqn VALUES(%s, %s, %s, %s, NOW()::date)', session['user']['email'], genus, species, acc_no)
        if occurrence == True:
            g.conn.execute('INSERT INTO Occ_records VALUES(%s, %s, %s, %s, %s, %s, %s)', time, occ_type, location, latitude, longitude, genus, species)
            g.conn.execute('INSERT INTO Submit_Occ VALUES(%s, %s, %s, %s, %s, %s, NOW()::date)', session['user']['email'], time, latitude, longitude, genus, species)
        return redirect(url_for('dashboard', uname=session['user']['username']))
    return render_template('submit.html', error=error, otype = otype, location=location, uname=session['user']['username'])
def validate(self, data):
    """Ensure the equipment's location, when present, is a valid country name.

    :param data: Equipment data dict
    :raises serializers.ValidationError: if the location is not a country
    :return: the data validated by the parent serializer
    """
    if 'location' in data:
        valid_names = dict(countries_for_language(LANGUAGE_CODE)).values()
        if data['location'] not in valid_names:
            raise serializers.ValidationError(
                f'Location "{data["location"]}" is invalid. A country is expected'
            )
    return super().validate(data)
def __init__(self):
    """Set up the geocoder and the German country-name lookup list."""
    self.geolocator = Nominatim(user_agent="xxx")
    # German country names, extended with two EU aliases the ISO list
    # does not contain.
    names = list(dict(countries_for_language('de')).values())
    names += ['EU', 'Europäische Union']
    self.country_names_german = names
def country_name_for_isocode(iso_code, lang=ICELANDIC_LANG_ISOCODE):
    """Return country name for an ISO 3166-1 alpha-2 code.

    :param iso_code: two-letter country code (any case)
    :param lang: two-letter ISO 639-1 language code
    :return: country name, or None for an unsupported language/unknown code
    """
    assert len(iso_code) == 2
    assert len(lang) == 2
    normalized_lang = lang.lower()
    if normalized_lang not in available_languages():
        return None
    return dict(countries_for_language(normalized_lang)).get(iso_code.upper())
def locations_worldmap():
    """Render world map locations page"""
    period = request.args.get("period")
    if period == "week":
        days = 7
    else:
        days = _TOP_LOC_PERIOD
    return render_template(
        "locations/locations-worldmap.html",
        country_data=world_map_data(days=days),
        # Icelandic country names for map labels.
        country_names=dict(countries_for_language("is")),
        period=period,
    )
def isocode_for_country_name(country_name, lang=ICELANDIC_LANG_ISOCODE):
    """Return the ISO 3166-1 alpha-2 code for a country name in the
    specified language (two-char ISO 639-1)."""
    assert len(lang) == 2
    code = lang.lower()
    if code not in available_languages():
        return None
    # countries_for_language is cached by its module.
    hit = next(
        (iso for iso, name in countries_for_language(code)
         if name == country_name),
        None,
    )
    if hit is not None:
        return hit
    # Fall back to the hardcoded additions for this language, if any.
    if lang in COUNTRY_NAME_TO_ISOCODE_ADDITIONS:
        return COUNTRY_NAME_TO_ISOCODE_ADDITIONS[lang].get(country_name)
    return None
def locations_worldmap():
    """Render world map locations page."""
    period = request.args.get("period")
    days = days_from_period_arg(period, _TOP_LOC_PERIOD)
    # Icelandic country names for map labels.
    country_names = dict(countries_for_language("is"))
    country_data = world_map_data(days=days)
    return render_template(
        "locations/worldmap.html",
        title="Heimskort",
        country_data=country_data,
        country_names=country_names,
        period=period,
    )
def getcountries(language_indices=(7, 96, 204, 263)):
    """Map each country code to its distinct names in a few languages.

    Generalizes the original's hard-coded language indices (7, 96, 204,
    263) into a parameter; the defaults preserve the original behaviour.
    The key set comes from the language at the second index (96 in the
    default), exactly as before.

    :param language_indices: indices into available_languages() to sample
    :return: dict of {country_code: [distinct names across the languages]}
    """
    all_countries = [
        dict(countries_for_language(language))
        for language in available_languages()
    ]
    key_source = all_countries[language_indices[1]]
    dictionary = {}
    for code in key_source:
        # set() de-duplicates names that coincide across languages.
        dictionary[code] = list({all_countries[i][code]
                                 for i in language_indices})
    return dictionary
def isocode_for_country_name(country_name, lang=ICELANDIC_LANG_ISOCODE):
    """Return the ISO 3166-1 alpha-2 code for a country name in the
    specified language (two-char ISO 639-1)."""
    assert len(lang) == 2
    code = lang.lower()
    if code not in available_languages():
        return None
    # Hardcoded mappings take precedence over the generated list.
    if code in COUNTRY_NAME_TO_ISOCODE_ADDITIONS:
        additions = COUNTRY_NAME_TO_ISOCODE_ADDITIONS[code]
        if country_name in additions:
            return additions[country_name]
    capitalized = capitalize_placename(country_name)
    # countries_for_language is cached by its module; accept either the
    # name as given or its capitalized form.
    for iso_code, name in countries_for_language(code):
        if name in (country_name, capitalized):
            return iso_code
    return None
def signup():
    """Sign Up page route.

    GET renders the form; POST checks that every field is filled and the
    email/password confirmations match, then stores the user.
    """
    countries = dict(countries_for_language('en'))
    if request.method == "POST":
        req = request.form
        # Collect every field left empty.  (The original loop variable was
        # named `input`, shadowing the builtin.)
        missing = [field for field, value in req.items() if value == ""]
        if missing:
            feedback = " {}".format(', '.join(missing))
            return render_template('signup.html', countries=countries,
                                   feedback=feedback)
        if req["Email"] != req["Confirm email"]:
            return render_template('signup.html', countries=countries,
                                   dont_match="Emails don't match")
        if req["Password"] != req["Confirm password"]:
            return render_template('signup.html', countries=countries,
                                   dont_match="Passwords don't match")
        # Add the user via SQLAlchemy (the original used a stray string
        # literal as this comment).
        new_user = User(first_name=req["First name"],
                        last_name=req["Last name"],
                        email=req["Email"],
                        password=req["Password"],
                        country=req["countries"])
        db.session.add(new_user)
        try:
            db.session.commit()
        except Exception as e:
            # Commit failure is treated as a duplicate-email signup.
            print(e)
            db.session.rollback()
            return render_template("signup.html", countries=countries,
                                   feedback="email already exists")
        finally:
            db.session.close()
        return render_template("signup.html", countries=countries,
                               success="Successful Registration")
    return render_template('signup.html', countries=countries)
def getRelationships(request, departure_country):
    """Send back the relationships registered in the db and their status.

    :param request: the HTTP request (only GET is served)
    :param departure_country: full English country name
    :return: json response, or 404 for non-GET / unknown country names
    """
    if request.method == "GET":
        country_dictionary = dict(countries_for_language('en'))
        # Reverse lookup name -> ISO code.  BUG FIX: the original used
        # list(values()).index(...), which raised an uncaught ValueError
        # (HTTP 500) for unknown names; answer 404 instead.
        country_iso = next(
            (code for code, name in country_dictionary.items()
             if name == departure_country),
            None,
        )
        if country_iso is None:
            return HttpResponseNotFound('<h1>Page not found</h1>')
        query = Relationship.objects.filter(
            departure_country__startswith=country_iso)
        return HttpResponse(serializers.serialize("json", query),
                            content_type='application/json')
    else:
        return HttpResponseNotFound('<h1>Page not found</h1>')
class CreateNewList(forms.ModelForm):
    """Form for adding a Person: name, age, gender, berth preference and
    nationality."""

    person_name = forms.CharField(widget=forms.TextInput(
        attrs={
            'class': 'form-control',
            'placeholder': 'Person Name *'
        }))
    person_age = forms.IntegerField(widget=forms.NumberInput(
        attrs={
            'class': 'form-control',
            'placeholder': 'Enter Age *'
        }))
    # (value, label) choices for the berth-preference dropdown.
    class_preference = (
        ("No Preference*", "No Preference*"),
        ("Lower", "Lower"),
        ("Middle", "Middle"),
        ("Upper", "Upper"),
        ("Side Lower", "Side Lower"),
        ("Side Upper", "Side Upper"),
    )
    class_gender = (
        ("Male", "Male"),
        ("Female", "Female"),
        ("Transgender", "Transgender"),
    )
    # (iso_code, country_name) pairs feeding the nationality dropdown.
    countries = countries_for_language('en')
    gender = forms.CharField(widget=forms.Select(
        choices=class_gender, attrs={'class': 'form-control'}))
    preference = forms.CharField(widget=forms.Select(
        choices=class_preference, attrs={'class': 'form-control'}))
    # Defaults to 'IN' (India).
    nationality = forms.CharField(widget=forms.Select(
        choices=countries, attrs={'class': 'form-control'}), initial='IN')

    class Meta:
        model = Person
        fields = [
            "person_name",
            "person_age",
            "gender",
            "preference",
            "nationality",
        ]
def getRelations(request, departure_country):
    """Same as getRelationships() but gives less info and is harder to
    scale and maintain.

    :param request:
    :param departure_country: full English country name
    :return: JsonResponse mapping status (1-4) to arrival countries
    """
    country_dict = dict(countries_for_language('en'))
    codes = list(country_dict.keys())
    names = list(country_dict.values())
    # Reverse lookup: name -> ISO code (ValueError for unknown names,
    # same as the original).
    country_iso = codes[names.index(departure_country)]
    query = Relationship.objects.filter(
        departure_country__startswith=country_iso)
    # One bucket per openness status (1..4), keyed by arrival country.
    resp = {status: {} for status in range(1, 5)}
    for item in query:
        resp[int(item.status)][item.arrival_country] = item.arrival_country
    return JsonResponse(resp)
class Relationship(models.Model):
    """Travel-openness status between a departure and an arrival country."""

    # ISO code -> English name, with '*' added as a wildcard meaning 'all'.
    COUNTRY_DICTIONARY = dict(countries_for_language('en'))
    COUNTRY_DICTIONARY['*'] = 'all'
    COUNTRIES = [('*', 'all')
                 ]  # to signal the same status for all the countries
    # NOTE(review): '*' is already in COUNTRY_DICTIONARY here, so the
    # loop adds ('*', 'all') a second time — confirm whether intended.
    for key in list(COUNTRY_DICTIONARY.keys()):
        COUNTRIES += [(key, COUNTRY_DICTIONARY[key])]
    COUNTRIES = tuple(COUNTRIES)
    # Openness status codes stored as single characters.
    OPENNESS = (
        ('1', 'open'),
        ('2', 'open with restrictions'),
        ('3', 'closed'),
        ('4', 'unknown'),
    )
    departure_country = models.CharField(max_length=30, choices=COUNTRIES)
    arrival_country = models.CharField(max_length=30, choices=COUNTRIES)
    # Defaults to '4' = unknown.
    status = models.CharField(max_length=1, choices=OPENNESS, default='4')
    info = models.CharField(max_length=160, default="")

    def __str__(self):
        return f"from {self.COUNTRY_DICTIONARY[self.departure_country]} to {self.COUNTRY_DICTIONARY[self.arrival_country]}"
def translate_countries(countries, filename):
    """Translate German country names into structured country records.

    :param countries: iterable of (possibly mistyped) German country names
    :param filename: unused here; kept for interface compatibility
    :return: list of country records; unrecognized ones are logged
    """
    # German name -> alpha-2 code.
    de_name_to_code = {name: code
                       for code, name in countries_for_language('de')}
    # alpha-2 code -> full country record.
    records_by_alpha2 = {}
    for alpha2, country in countries_by_alpha2.items():
        records_by_alpha2[alpha2] = country_pattern(
            country.name, alpha2, country.alpha3, country.numeric)
    # Translate each name to a code (fuzzy-matching fallback), using a set
    # to de-duplicate.
    codes = set()
    for name in countries:
        codes.add(de_name_to_code.get(
            name, parse_mistyped_countries(name, de_name_to_code)))
    parsed_countries = []
    for code in codes:
        parsed_countries.append(
            records_by_alpha2.get(code, country_pattern(code)))
    # Warn about entries that could not be resolved to an alpha-2 code.
    for record in parsed_countries:
        if not record['alpha2']:
            logger.warning(f'unrecognized country in report '
                           f'(unknown): {record["name"]}')
    return parsed_countries
def getCountry(self):
    """Resolve and cache an ISO country code from self.location.

    The location is assumed to look like "City, Region" — the part after
    the first comma is matched against special cases ('UK', 'NZ', any
    other two-letter code is assumed to be a US state) and then against
    full English country names.

    :return: the cached/derived ISO code, or None when nothing matched
    """
    # BUG FIX: the original fell off the end and returned None when
    # self.country was already set; return the cached value instead.
    if self.country is not None:
        return self.country
    try:
        location = self.location.split(',')[1].strip(' ')
    except (AttributeError, IndexError):
        # No location string at all, or no comma-separated region part.
        location = ''
    if location == 'UK':
        self.country = 'GB'
    elif location == 'NZ':
        self.country = 'NZ'
    elif len(location) == 2:
        # Any other two-letter region is taken to be a US state code.
        self.country = 'US'
    else:
        countries = dict(countries_for_language('en'))
        for code, name in countries.items():
            if name == location:
                self.country = code
                break
    return self.country
def publishers_to_csv(publist):
    '''Index the list of Publishers into data/publisher-dim.csv.

    Each publisher gets a sequential id, a random country and a random
    region; commas in values are replaced so the CSV stays well-formed.

    :param publist: iterable of publisher names
    '''
    countries = dict(countries_for_language('en'))
    regions = [
        'Africa', 'Asia', 'The Caribbean', 'Central America', 'Europe',
        'North America', 'Oceania', 'South America'
    ]
    country_names = list(countries.values())
    # BUG FIX: the original passed an open() result straight to
    # csv.writer, leaking the file handle; use a context manager and
    # newline='' as the csv module requires.
    with open('data/publisher-dim.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['id', 'name', 'country', 'region'])
        for index, element in enumerate(publist, start=1):
            randcountry = random.choice(country_names)
            filewriter.writerow([
                index,
                element.replace(",", "-"),
                randcountry.replace(",", "-"),
                random.choice(regions)
            ])
from PyQt5 import QtCore, QtGui, QtWidgets, Qt
from PyQt5.QtCore import QStringListModel, QDate
from PyQt5.QtWidgets import QCompleter, QLineEdit
from country_list import countries_for_language
from array import *
import numpy as np
import csv

#####################################################################################################################
#                                                                                                                   #
#   The following code are global functions and data/variables used by Clinical Data form completer components      #
#                                                                                                                   #
#####################################################################################################################

# Load the list of countries
countries = dict(countries_for_language('en'))
# Flatten the mapping into the country-name list used by the completer
# (iterating the dict yields its keys, i.e. the ISO codes; `i` is unused).
data = []
for i, v in enumerate(countries):
    data.append(countries[v])

# reading the csv files containing the available diseases
# Each file's header row is removed with pop(0).
with open('src/data/ICD10_Topography.csv', 'r') as f:
    reader = csv.reader(f)
    icd = list(reader)
    icd.pop(0)

with open('src/data/ICD10_Topography_C.csv', 'r') as f:
    reader = csv.reader(f)
    icdc = list(reader)
    icdc.pop(0)

# NOTE(review): this block appears truncated in this view — the reader is
# presumably consumed on the following lines of the original file.
with open('src/data/ICD10_Morphology.csv', 'r') as f:
    reader = csv.reader(f)
from django.db import models
from django.contrib.auth.models import User
from country_list import countries_for_language

# Countries names API
init_country_list = dict(countries_for_language("en"))


def countries_names(countries, all_countries_names=None):
    """
    Transform countries to tuple of tuples for model field `choices` attribute

    :param countries: iterable of (iso_code, name) pairs
    :param all_countries_names: NOTE(review): overwritten below, so this
        parameter is effectively unused — confirm whether it can be removed
    :return: tuple of ("<name>-<code>", "<name>") pairs
    """
    all_countries = []
    for key, value in countries:
        all_countries.append((f"{value}-{key}", value))
    all_countries_names = tuple(all_countries)
    return all_countries_names


# Variable for model field `choices` attribute
ADDRESS_TYPE = (("billig", "Billing"), ("shipping", "Shipping"))
COUNTRY_LIST = countries_names(init_country_list.items())


class Billing(models.Model):
    """Customer billing information class"""
# Distances between each pair of children # Since we don't have this information, we can use a uniform one for plotting distance = np.arange(children.shape[0]) # The number of observations contained in each cluster level no_of_observations = np.arange(2, children.shape[0]+2) # Create linkage matrix and then plot the dendrogram linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float) # Plot the corresponding dendrogram dendrogram(linkage_matrix, **kwargs) w = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) countries = list(dict(countries_for_language('en')).values()) # print(countries) countries_vec = [w[country] for country in countries if country in w] model = AgglomerativeClustering(n_clusters=5, linkage='ward') model.fit(countries_vec) fig = plt.figure() ax = fig.add_subplot(111) # plot the top three levels of the dendrogram plot_dendrogram(model, labels=countries) plt.show()
def get_detail(cat, tmdb_id):
    """Fetch a movie/tv/person payload from TMDB and condense it into the
    flat dict consumed by the templates.

    :param cat: 'movie', 'tv' or 'person'
    :param tmdb_id: TMDB numeric id
    :return: dict of display-ready fields (Chinese-localized via zh-CN
        payloads and get_zh_name where possible)
    """
    request_url = 'https://api.themoviedb.org/3/{}/{}?append_to_response=credits,alternative_titles,external_ids,combined_credits&api_key={}&include_image_language=en,null&language=zh-CN'.format(
        cat, tmdb_id, tmdb_key)
    res = requests.get(request_url).json()
    tmdb_id = res.get('id')
    imdb_id = res.get('external_ids', {}).get('imdb_id', '')
    # Chinese display name: persons go through get_zh_name, titles come
    # from the zh-CN localized payload.
    if cat == 'person':
        zh_name = get_zh_name(tmdb_id)
    else:
        zh_name = res.get('title', res.get('name', ''))
    name = res.get('original_title') or res.get('original_name') or res.get(
        'name')
    # Defaults for fields that only some categories fill in.
    # NOTE(review): yt_key is assigned here but never used below.
    cast = []
    season_info = []
    trakt_rating = '0.0'
    yt_key = ''
    date = ''
    imdb_rating = ''
    if cat == 'movie' or cat == 'tv':
        date = res.get('release_date') or res.get('first_air_date') or ''
        # Genre names, mapped through genres_dic and prefixed as hashtags.
        genres = [
            '#' + (genres_dic.get(i.get('name')) or i.get('name'))
            for i in res.get('genres', [])
        ]
        # Top five cast members, localized where possible.
        cast = [
            get_zh_name(item.get('id')) or item.get('name')
            for item in res.get('credits', {}).get('cast', [])[:5]
        ]
    if cat == 'movie':
        # NOTE(review): the trailing ternary is redundant inside this branch.
        imdb_rating = get_imdb_rating(imdb_id) if cat == 'movie' else ''
    if cat == 'tv':
        trakt_headers = {'trakt-api-key': trakt_key}
        # Trakt rating, truncated to one decimal place (3 chars).
        trakt_rating = str(
            requests.get(
                'https://api.trakt.tv/shows/{}/ratings'.format(imdb_id),
                headers=trakt_headers).json()
            ['rating'])[:3] if imdb_id else '0.0'
        # Per-season episode counts, skipping specials (season 0).
        season_info = [
            '第{}季 - 共{}集'.format(item.get('season_number'),
                                  item.get('episode_count'))
            for item in res.get('seasons', [])
            if not item.get('season_number') == 0
        ]
    birthday = res.get('birthday', '')
    deathday = res.get('deathday', '')
    a_works = []
    d_works = []
    if cat == 'person':
        # Acting credits, newest first, top 10 with a known year.
        a_credits = res.get('combined_credits', {}).get('cast', [])
        a_credits.sort(reverse=True, key=get_year)
        a_works = [
            '{} - {}'.format(get_year(item),
                             item.get('name', item.get('title')))
            for item in a_credits[:10] if get_year(item)
        ]
        # Directing credits only, newest first, top 10 with a known year.
        # NOTE(review): d_credits is sorted once before filtering and the
        # filtered list is sorted again — the first sort is redundant.
        d_credits = res.get('combined_credits', {}).get('crew', [])
        d_credits.sort(reverse=True, key=get_year)
        d_credits_fixed = [
            item for item in d_credits if item.get('job') == 'Director'
        ]
        d_credits_fixed.sort(reverse=True, key=get_year)
        d_works = [
            '{} - {}'.format(get_year(item),
                             item.get('name', item.get('title')))
            for item in d_credits_fixed[:10] if get_year(item)
        ]
    # Assemble the flat, template-ready dict; person-only and tv-only
    # fields collapse to '' for the other categories.
    dic = {
        'poster': '' if cat == 'person' else res.get('poster_path'),
        'profile': '' if not cat == 'person' else res.get('profile_path'),
        'zh_name': zh_name,
        'name': name,
        'year': '' if cat == 'person' else date[:4],
        'des': res.get('overview', ''),
        'trailer': '' if cat == 'person' else get_trailer(cat, tmdb_id),
        'director': '' if cat == 'person' else get_zh_name(
            next((item for item in res.get('credits', {}).get('crew', [])
                  if item.get('job') == 'Director'), {}).get('id', '')),
        'genres': '' if cat == 'person' else ' '.join(genres[:2]),
        # Production country name in Chinese, from the first listed country.
        'country': dict(countries_for_language('zh_CN')).get(
            next((item for item in res.get('production_countries', [])),
                 {}).get('iso_3166_1'), '') if not cat == 'person' else '',
        'lang': '' if cat == 'person' else langcode.get(
            res.get('original_language'), ''),
        'date': date,
        'lenth': res.get('runtime', '') or next(
            (i for i in res.get('episode_run_time', [])), ''),
        'creator': '' if not cat == 'tv' else get_zh_name(
            next((item for item in res.get('created_by', [])),
                 {}).get('id', '')),
        'cast': '' if cat == 'person' else '\n '.join(cast),
        'imdb_rating': '' if not imdb_rating else '#IMDB_{} {}'.format(
            imdb_rating[:1], imdb_rating),
        'trakt_rating': '' if trakt_rating == '0.0' else '#Trakt_' + trakt_rating[:1] + ' ' + trakt_rating,
        'network': '' if not cat == 'tv' else re.sub(
            ' ', '_',
            next((i for i in res.get('networks', [])), {}).get('name', '')),
        'status': status_dic.get(res.get('status'), ''),
        'season_info': '' if not cat == 'tv' else '\n'.join(season_info),
        'birthday': birthday,
        'deathday': deathday,
        'age': get_age(birthday, deathday) if birthday else '',
        'a_works': '' if not cat == 'person' else '\n'.join(a_works),
        'd_works': '' if not cat == 'person' else '\n'.join(d_works),
    }
    return dic
from country_list import countries_for_language

# ISO alpha-2 code -> English country name.
countries = dict(countries_for_language("en"))


def countrylist():
    """Return ['<ISO> - <name>', ...] for every known country.

    (The original accumulated into a local named `set`, shadowing the
    builtin; a comprehension avoids that.)
    """
    return [f"{key} - {value}" for key, value in countries.items()]


def countryconvert(ISO):
    """Return the English country name for an ISO alpha-2 code.

    :raises KeyError: if the code is unknown
    """
    return countries[ISO]
""" This file takes a .json file of strings comprised of addresses and/or locations and outputs a list of dicts of {address:A, date_iso:B, ranking:C}. """ import json from tqdm import tqdm import datefinder from country_list import countries_for_language import pandas as pd import config as args TZ_FORMAT = 'T00:00:00Z' EN_COUNTRY_DICT = dict(countries_for_language('en')) DE_COUNTRY_DICT = dict(countries_for_language('de')) EN_CITY_PANDAS = pd.read_csv('../data_json/GeoLite2-City-Locations-en.csv') DE_CITY_PANDAS = pd.read_csv('../data_json/GeoLite2-City-Locations-de.csv') def string_parser(unparsed_str): """ takes an unparsed string and returns a dict of the wanted format """ date_iso = '' address = '' address_ranking = 0 date_ranking = 0 for datefinder_output in datefinder.find_dates(unparsed_str, index=True): # there should only be one couple but the output of datefinder is a generator # so using a for is mostly for convenience date, date_idx = datefinder_output date_iso = str(date.date())
def main():
    """Merge the LNSPQ metadata extract with the Nextstrain/GISAID metadata
    into one TSV, optionally writing a subsampled companion file.

    BUG FIX: the original constructed ArgumentParser twice, discarding the
    first (and with it the description and formatter_class); only one
    parser is created now.  Also fixes the pandas Series membership test
    in the subsample branch (`in` tests the index, not the values).
    """
    parser = argparse.ArgumentParser(
        description="concat_meta_tsv",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--inspq_meta", default='data/sgil_extract.tsv',
                        help="The LNSPQ .tsv")
    parser.add_argument("--nextstrain_metadata",
                        help="The .tsv that comes by "
                             "with nextstrain")
    parser.add_argument("--fasta_dir", default=None,
                        help="The modified .tsv")
    parser.add_argument("--output", default='data/merged_metadata.tsv',
                        help="The modified .tsv")
    parser.add_argument("--subsample", '-s', type=int, default=None,
                        help="A metadata file with N sample and all QC ones")
    args = parser.parse_args()
    in_path = args.nextstrain_metadata
    out_path = args.output
    subsample = args.subsample
    lnspq_path = args.inspq_meta
    fasta_dir = args.fasta_dir
    # exluded = args.nextstrain_exclude
    gsaid_df = pd.read_csv(in_path, sep='\t')
    lnspq_df = pd.read_csv(lnspq_path, sep='\t')
    # traduction from fr to en
    traduc = {'NO_LSPQ': 'strain',
              'AGE': 'age',
              'SEX': 'sex',
              'RSS_PATIENT': 'rss',
              'VOYAGE_PAYS_1': 'country_exposure',
              'DATE_PRELEV': 'date',
              'DATE_RECU': 'date_submitted',
              'CH': 'originating_lab',
              'POSTAL_CODE': 'rta'}
    lnspq_df.rename(columns=traduc, inplace=True)
    # Normalize accents/case in the travel-country column.
    lnspq_df['country_exposure'] = [unidecode.unidecode(p.title())
                                    for p in lnspq_df['country_exposure']]
    # lnspq_df['date_submitted'] = "{}s".format(datetime.datetime.today())
    # French (Canada) -> English country-name translation table.
    pays_qc = {k: unidecode.unidecode(v)
               for k, v in countries_for_language('fr_CA')}
    pays_anglo = dict(countries_for_language('en'))
    # Fix non standard names
    pays_anglo['US'] = 'USA'
    pays_anglo['HK'] = 'Hong Kong'
    pays_anglo['CZ'] = 'Czech Republic'
    pays_anglo['CD'] = 'Democratic Republic of the Congo'
    trans = {pays_qc[code]: pays_anglo[code] for code in pays_qc.keys()}
    trans['Aucun_Voyage'] = '?'
    lnspq_df['country_exposure'].replace(trans, inplace=True)
    lnspq_df['rta_exposure'] = lnspq_df['country_exposure']
    # Record sequence lengths for every local fasta.
    fastas = glob.glob("{}/*fasta".format(fasta_dir))
    fasta_id_len = count_fasta_len(fastas)
    for fid, l in fasta_id_len.items():
        lnspq_df.loc[lnspq_df['strain'] == fid, 'lenth'] = l
    # Constant columns for the Quebec extract.
    lnspq_df['virus'] = 'ncov'
    lnspq_df['title'] = 'CoVSeQ - Covid Sequencing Quebec'
    lnspq_df['country'] = 'Quebec'
    lnspq_df['location'] = 'Quebec'
    lnspq_df['division'] = 'Quebec'
    lnspq_df['region'] = 'North America'
    lnspq_df['submitting_lab'] = 'LSPQ'
    if fasta_dir:
        lnspq_df['url'] = 'http://www.covseq.ca/data/{}'.format(
            os.path.basename(fasta_dir.strip('/')))
    else:
        lnspq_df['url'] = ''
    # add rta and rss entry to world
    # still need to fix Iles 'Turques-Caiques' and 'Iles Vierges (E-U)',
    neighbourg = ['New York', 'Ontario', 'Vermont', 'New Hampshire',
                  "Massachusetts", 'Maine', 'New Brunswick', 'Grand Princess']
    gsaid_df.loc[gsaid_df['region'] != 'North America', 'rss'] = gsaid_df['country']
    gsaid_df.loc[gsaid_df['region'] != 'North America', 'rta'] = gsaid_df['country']
    gsaid_df.loc[gsaid_df['region'] == 'North America', 'rss'] = gsaid_df['country']
    gsaid_df.loc[gsaid_df['region'] == 'North America', 'rta'] = gsaid_df['country']
    gsaid_df.loc[gsaid_df['division'].isin(neighbourg), 'rss'] = gsaid_df['division']
    gsaid_df['rta_exposure'] = gsaid_df['country_exposure']
    # neighbourg
    # rta_country
    # table.assign(region=)
    pd.concat([lnspq_df, gsaid_df], sort=False).to_csv(out_path, sep='\t',
                                                       index=False)
    if subsample:
        name = os.path.basename(out_path)
        path = os.path.dirname(out_path)
        s_path = '{}/sampled_{}'.format(path, name)
        print('subsample with {} point in {}'.format(subsample, s_path))
        s_df = gsaid_df.iloc[random.sample(range(len(gsaid_df)), subsample)]
        # make sure root virus is in data
        extra = []
        for s in open('../config/include.txt').read().splitlines():
            # BUG FIX: `s in s_df['strain']` tests membership in the Series
            # *index*, not its values; use .values so the strain names
            # themselves are checked.
            if s not in s_df['strain'].values:
                extra = extra + [gsaid_df.loc[gsaid_df['strain'] == s]]
        pd.concat([lnspq_df, s_df] + extra).to_csv(s_path, sep='\t',
                                                   index=False)
# NOTE(review): this chunk starts mid-script — `countrycodes` is first a
# DataFrame loaded outside this view.

# Turn the Code/Country columns into a lookup dict.
countrycodes = dict(zip(countrycodes['Code'], countrycodes['Country']))
# 'NA' was parsed as NaN, so restore Namibia and drop the NaN key.
countrycodes['NA'] = 'Namibia'
del countrycodes[np.nan]
countrycodes = {k.lower(): v.lower() for k, v in countrycodes.items()}

# Seed (overwrite) the output CSV with just the header row.
pd.DataFrame(columns=['date', 'country', 'count']).to_csv('tweetcounts.csv', index=False)

# List all tweet pickles stored in the S3 bucket.
S3 = boto3.resource('s3')
BUCKET = 'coronavirus-analysis'
conn = S3.Bucket(BUCKET)
fns = [
    object_summary.key
    for object_summary in conn.objects.filter(Prefix="TweetPickles/")
]

# Lower-cased English country names used for substring matching.
imported_countries = dict(countries_for_language('en'))
countries = [x.lower() for x in list(imported_countries.values())]


def countrycheck(row):
    """Return a country name found at the end of *row*, else the row itself.

    A country only counts when its first occurrence ends within the last
    character of the string; otherwise the next country is tried.
    NOTE(review): if every matching country fails that position test the
    function falls through and returns None (not the row) — confirm
    whether that is intended.
    """
    if not any(country in row for country in countries):
        return row
    else:
        for country in countries:
            if country in row:
                idx = row.find(country)
                # NOTE(review): nextchar is computed but never used.
                nextchar = row[idx:idx + len(country) + 1]
                if len(row) > idx + len(country) + 1:
                    # More text follows this occurrence — not a terminal
                    # match; try the next country.
                    continue
                else:
                    return country