def load_incidence_data(db): url = 'http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1' output_file = 'incidences.xls' download_xls.get_file(url, output_file) contents = download_xls.read_file(output_file) print 'Inserting incidence data to db' c = db.cursor() for sheet in contents: if sheet in ['Readme Incidence', 'Reg_&_Global_Incidence']: continue rows = contents.get(sheet) headers = [] for i in range(len(rows)): if i == 0: headers = rows[i] else: data = rows[i] region = data[0] iso_code = data[1] country = data[2] disease = data[3] for j in range(4, len(data)): year = headers[j] value = data[j] if value: SQLite3.insert(c, 'incidents', (region, iso_code, country, disease, year, value)) db.commit()
def save_complete_map(): db = SQLite3('data.db') c = db.cursor() df = pd.read_csv('country_map_complete.csv') for i in df.index: who_country = df.ix[i]['who_country'] p_country = df.ix[i]['p_country'] SQLite3.insert(c, 'countries', [who_country, p_country]) db.commit()
def get_populations(): populations = [] db = SQLite3('data.db') c = db.cursor().execute(""" SELECT DISTINCT year, population_country_name as country FROM ( SELECT year, country_name FROM coverage UNION ALL SELECT year, country_name FROM incidents ) LEFT OUTER JOIN countries ON who_country_name = country_name WHERE population_country_name IS NOT NULL ORDER BY population_country_name, year; """) rows = c.fetchall() bar = pyprind.ProgBar(len(rows), title='Downloading population data') i = 0 for row in rows: country = row['country'] year = row['year'] url = 'http://api.population.io:80/1.0/population/%s/%s/' % (year, country) r = requests.get(url) r.raise_for_status() entries = r.json() total = 0 for entry in entries: total += entry['total'] data = (country, year, total) populations.append(data) bar.update() print 'Inserting population data into db' SQLite3.insertmany(c, 'populations', populations) db.commit()
def cluster(similarity=pearson): db = SQLite3().cursor() pins = Pins() words = Words() maxcount = pins.size() labels = [r[0] for r in words.set()] offset = 0 n = 1 sum_num = sum([i for i in range(1, maxcount)]) while True: pin_a = pins.find_by_offset(offset) pin_a_words = [w[0] for w in words.find_by_pinid(pin_a[0])] pin_a_wordcount = [pin_a_words.count(w) for w in list(set(labels))] # calculate distance of two pins. for i in range(offset + 1, maxcount): pin_b = pins.find_by_offset(i) pin_b_words = [w[0] for w in words.find_by_pinid(pin_b[0])] pin_b_wordcount = [pin_b_words.count(w) for w in list(set(labels))] # calculate distance of two pins print '[%s] %s / %s calculate score of %s and %s' % ( datetime.today().strftime('%Y-%m-%d %H:%M:%S'), n, sum_num, pin_a[0], pin_b[0]) sim = 1.0 - similarity(pin_a_wordcount, pin_b_wordcount) # save distance to database for cache clusters = Clusters() clusters.data['pin_id_a'] = pin_a[0] clusters.data['pin_id_b'] = pin_b[0] clusters.data['score'] = sim clusters.save() n += 1 if offset >= maxcount - 1: break offset += 1
def get_candidates(self, offset=0, n=5, threshold=0.8): db = SQLite3().cursor() m = [r for r in db.execute(''' select pins_a.pin_id, pins_a.image_url from clusters inner join pins pins_a on clusters.pin_id_a = pins_a.pin_id where clusters.score > ? order by clusters.score desc ''', (threshold,))] m = list(set(m)) m.sort() m.reverse() return m[offset:offset+n]
def get_top_matches(self, pin_id, n=5, threshold=0.8): q1 = ''' select clusters.score, pins_a.image_url, pins_b.image_url, pins_b.pin_id from clusters inner join pins pins_a on clusters.pin_id_a = pins_a.pin_id inner join pins pins_b on clusters.pin_id_b = pins_b.pin_id where clusters.pin_id_a = ? AND clusters.score > ? order by clusters.score desc limit ? ''' q2 = ''' select clusters.score, pins_b.image_url, pins_a.image_url, pins_a.pin_id from clusters inner join pins pins_a on clusters.pin_id_a = pins_a.pin_id inner join pins pins_b on clusters.pin_id_b = pins_b.pin_id where clusters.pin_id_b = ? AND clusters.score > ? order by clusters.score desc limit ? ''' db = SQLite3().cursor() m = [r for r in db.execute(q1, (pin_id, threshold, n))] +\ [r for r in db.execute(q2, (pin_id, threshold, n))] m = list(set(m)) m.sort() m.reverse() return m[:n]
def exists(self): data = (self.data['pin_id'],) db = SQLite3().cursor() db.execute("select count(*) from pins where pin_id=?", data) return db.fetchone()[0]
def __init__(self): self.db = SQLite3()
def exists(self): data = (self.data['pin_id_a'], self.data['pin_id_b']) db = SQLite3().cursor() db.execute("select count(*) from clusters where pin_id_a=? and pin_id_b=?", data) return db.fetchone()[0]
def exists(self): data = (self.data['pin_id'], self.data['user'], self.data['word']) db = SQLite3().cursor() db.execute("select count(*) from words where pin_id=? and user=? and word=?", data) return db.fetchone()[0]