def simscore(test_set="score"): ''' Select the embedding scoring highest on the `test_set` word comparison task. Score is calculated by measuring the average Spearman correlation of the word vector cosine similarities and human-rated similarity for each word pair. Missing words are substituted with random words Args: test_set (str,opt): Embedding will be returned with highest score on `test_set` If test_set is not specified, embedding will be returned with highest average score over all similarity tasks (excluding MC-30) Returns: emb_name: Name of Embedding with highest simscore Word Pair Similarity Tasks: * WS-353: Finkelstein et. al, 2002 * MC-30: Miller and Charles, 1991 (Excluded from composite score, already included in SimLex-999 Test Set) * MEN: Bruni et. al, 2012 * MTurk-287: Radinsky et. al, 2011 * MTurk-771: Halawi and Dror, 2012 * Rare-Word: Luong et. al, 2013 * SimLex-999: Hill et. al, 2014 * SimVerb-3500: Gerz et. al, 2016 * Verb-144: Baker et. al, 2014 ''' max_query = "SELECT MAX(" + test_set +") FROM " +info.INDEX_FILE max_simscore = np.floor(100*dw.query(info.INDEXER, max_query).dataframe.iloc[0][0])/100 emb_query = "SELECT embedding_name FROM " + info.INDEX_FILE + " WHERE " +test_set +" >= " + str(max_simscore) top_emb = dw.query(info.INDEXER, emb_query).dataframe return top_emb.iloc[0][0]
def test_toplevel_query(dw_instances, profile): datadotworld.query('agentid/datasetid', 'SELECT * FROM Tables', profile=profile) assert_that( dw_instances[profile].query, called().times(1).with_args(equal_to('agentid/datasetid'), equal_to('SELECT * FROM Tables'), query_type=equal_to('sql')))
def maxtkn(): '''Returns the embedding trained on the most tokens. Args: None Returns: emb_name ''' max_query = "SELECT MAX(token_count) FROM " +info.INDEX_FILE token_count = dw.query(info.INDEXER, max_query).dataframe.iloc[0][0] emb_query = "SELECT embedding_name FROM " + info.INDEX_FILE + " WHERE token_count >= " + str(token_count) top_emb = dw.query(info.INDEXER, emb_query).dataframe return top_emb.iloc[0][0]
def join_population_age_range(df): # Load population data population_query = dw.query( 'fryanpan13/covid-tracking-racial-data', 'SELECT * FROM population_estimates_age_ranges') population_df = population_query.dataframe population_df = population_df.rename( columns={ 'state': COL_STATE, 'state_name': 'State Name', 'age_range': COL_AGE_GROUP, 'race_ethnicity': COL_RACE_ETHNICITY, 'dataset': 'Dataset', 'geo_state_name': 'Geo State Name', 'standard_weight': COL_STANDARD_WEIGHT, 'population': COL_POPULATION }) population_df[COL_POPULATION] = population_df[COL_POPULATION].astype( 'Int64') # Load region data region_query = dw.query('fryanpan13/covid-tracking-racial-data', 'SELECT * FROM regions') region_df = region_query.dataframe region_df = region_df.rename(columns={'state': COL_STATE, 'region': 'Region', 'subregion': 'Subregion'}) \ .drop(columns=['state_name']) population_df = population_df.set_index([COL_STATE]) region_df = region_df.set_index([COL_STATE]) population_df = population_df.merge(region_df, on=[COL_STATE]) population_index = [ COL_DATASET, COL_STATE, COL_AGE_GROUP, COL_RACE_ETHNICITY ] population_df = population_df.reset_index( drop=False).set_index(population_index) # Set expected deaths to a dummy value population_df['Expected Deaths'] = 0 print(population_df.columns) print(df.columns) # Join population return df.join(population_df, population_index)
def query(words, emb_name, set_name=None, case_sensitive=False): """Query a set of word vectors from an indexed embedding. Args: words (List of strings): The set of word vectors being queried emb_name (str): The embedding from which word vectors are being queried set_name (str, opt): Name of dataset being queried (format: owner/id) case_sensitive (bool, opt): Flag for matching exact case in query Returns: Pandas Dataframe, each row specifying a word vector. """ set_name = _error_check(emb_name, set_name) if case_sensitive: title = 'text' else: words = [word.lower() for word in words] title = 'lower(text)' try: if len(words) > 1: query = 'SELECT * FROM ' + emb_name + ' where ' + title + ' in' + str( tuple(words)) else: query = 'SELECT * FROM ' + emb_name + ' where ' + title + ' = "' + words[ 0] + '"' results = dw.query(set_name, query).dataframe return results except: raise RuntimeError( "Embedding is formatted improperly. Check headers at: " + query) raise RuntimeError("No matching word vector found.")
def _emb_rank(): query = 'SELECT embedding_name, dataset_name, contributor, embedding_type, dimension, score \ FROM ' + info.INDEX_FILE results = dw.query(info.INDEXER,query).dataframe results = results.nlargest(10, 'score') for ind,row in results.iterrows(): results.loc[ind, 'embedding_name'] = \ "[`"+row['embedding_name']+"`]("+ info.BASE_URL + row['dataset_name']+")" results = results.drop('dataset_name', axis=1) md_table = tabulate(results, headers=list(results), tablefmt="pipe",showindex=False) with io.open('../README.md', 'r', encoding='utf-8') as readme: pre, post = True,False pre_table,post_table = '','' for line in readme: if pre: pre_table += line if line == '[comment]: <> (Leaderboard Start)\n': pre = False if post: post_table+= line if line == '[comment]: <> (Leaderboard End)\n': post_table = line post = True with io.open('../README.md', 'w', encoding='utf-8') as readme: readme.write(pre_table+'\n') readme.write(md_table+'\n\n') readme.write(post_table)
def college_score(name): dataset_key = 'https://data.world/education/university-rankings-2017' try: results = dw.query( dataset_key=dataset_key, query= "SELECT *FROM national_universities_rankings WHERE name = '%s'" % name) df = results.dataframe rank = (df.iloc[0]['rank']) score = 1 if rank <= 5: score = 6 if rank <= 10: score = 5 if rank > 50 & rank < 100: score = 4 if rank > 100 & rank < 150: score = 3 if rank > 150 & rank < 200: score = 2 if rank > 200: score = 1 return score except: print("Your College Isn't In Here!!")
def _error_check(emb_name, set_name=None, vs_format=None): if set_name and vs_format: return set_name, vs_format emb_list = dw.query(info.INDEXER_URL, 'SELECT * FROM ' + info.INDEX_FILE).dataframe emb_names = emb_list.embedding_name if len(emb_list.loc[emb_list['embedding_name'] == emb_name]) > 1 and set_name == None: raise ValueError("More than one embedding exists with name: " + emb_name + "Try again specifying set_name.") if emb_name not in emb_names.values: raise ValueError("No embedding exists with name: " + emb_name) if emb_list.file_format[emb_names == emb_name].iloc[0] not in [ 'csv', 'tsv', 'xls', 'rdf', 'json' ]: raise TypeError( emb_name + " was not uploaded in a queryable format. Try reuploading as: csv, tsv, xls, rdf, or json." ) set_name = emb_list.loc[emb_list['embedding_name'] == emb_name].dataset_name.iloc[0] vs_format = emb_list.loc[emb_list['vs_format'] == vs_format].dataset_name.iloc[0] return set_name, vs_format
def download(emb_name, set_name=None, vs_format=None): '''Loads the full shared embedding `emb_name` and saves the embedding to the current working directory. Args: emb_name(str): Name of the selected embedding set_name(opt, str): Specify if multiple embeddings exist with same name Returns: .csv embedding saved to the current working directory ''' DW_API_TOKEN = os.environ['DW_AUTH_TOKEN'] multiproc = pp.ProcessPool(proc_cnt) def emb_download(appx_num): query_url = "https://query.data.world/file_download/" + set_name + "-appx" + str( appx_num) + "/" + emb_name + "-appx" + str(appx_num) + '.csv' payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN} emb_text = requests.request("GET", query_url, data=payload, headers=headers).text with io.open(emb_name + "-appx" + str(appx_num) + '.csv', 'wb') as download_emb: download_emb.write(emb_text) set_name, vs_format = _error_check(emb_name, set_name=set_name, vs_format=vs_format) if vs_format == 'large': num_appx = "SELECT app_num FROM " + info.INDEX_FILE + " WHERE embedding_name = " + emb_name + " and dataset_name = " + set_name app_count = dw.query(info.INDEXER, num_appx).dataframe.iloc[0][0] multiproc.map(emb_download, list(range(num_appx))) with io.open(emb_name + '.csv', 'wb') as compiled: first_appx = io.open(emb_name + '-appx0.csv', 'r', encoding='utf-8') compiled.write(first_appx.read()) for i in range(1, app_count): with io.open(emb_name + "-appx" + str(i) + '.csv', 'r', encoding='utf-8') as appx: appx.next() for line in appx: compiled.write(line) os.remove(emb_name + "-appx" + str(i) + '.csv') else: query_url = "https://query.data.world/file_download/" + set_name + "/" + emb_name + '.csv' payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN} emb_text = requests.request("GET", query_url, data=payload, headers=headers).text with io.open(emb_name + '.csv', 'wb') as download_emb: download_emb.write(emb_text) return pd.read_csv(emb_name + '.csv')
def query_dataworld(): save_name = 'raw/dataworld/all_'+str(datetime.now().date()) if os.path.exists('data/'+save_name+'.pkl.tgz'): print('data found in',save_name) return pd.read_pickle('data/'+save_name+'.pkl.tgz') else: results = dw.query('covid-19-data-resource-hub/covid-19-case-counts', 'SELECT * FROM covid_19_cases') df = results.dataframe save(df,name='raw/dataworld/all_'+str(datetime.now().date())) return df
def check(): """Displays indexed word embeddings and associated metadata. Args: NONE Returns: Pandas Dataframe containing avaiable word embeddings and metadata """ embedding_list = dw.query(info.INDEXER_URL, 'SELECT * FROM ' + info.INDEX_FILE) df = embedding_list.dataframe cols = df.columns.tolist() title = ["embedding_name", "dataset_name", "contributor"] return df[title + [field for field in cols if field not in title]]
# datadotworld module has been imported as dw import datadotworld as dw # We've written a SPARQL query for you and assigned it to the `sparql_query` variable: sparql_query = "PREFIX GOT: <https://tutorial.linked.data.world/d/sparqltutorial/> SELECT ?FName ?LName WHERE {?person GOT:col-got-house \"Stark\" . ?person GOT:col-got-fname ?FName . ?person GOT:col-got-lname ?LName .}" # Use the pre-defined SPARQL query to query dataset http://data.world/tutorial/sparqltutorial and return the results to a queryResults variable queryResults = dw.query('http://data.world/tutorial/sparqltutorial', sparql_query, query_type='sparql') # Use the dataframe property of the resulting query to create a dataframe variable named `houseStark` houseStark = queryResults.dataframe # Use pp.pprint() to print the dataframe to the screen. pp.pprint(houseStark)
import os import geoviews as gv from cartopy import crs as ccrs gv.extension('bokeh', 'matplotlib') # Define project root directory # ROOT_DIR = Path(__file__).parent.parent ROOT_DIR = Path(os.getcwd()).parent # %% [markdown] # We next query and download the latest covid data from data.world servers. # %% # Get latest covid data results = dw.query('covid-19-data-resource-hub/covid-19-case-counts', 'SELECT * FROM covid_19_cases') results_df = results.dataframe.drop(['prep_flow_runtime', 'difference'], axis="columns") results_df = (results_df[results_df["table_names"] == "Time Series"].drop( ["table_names"], axis='columns')) results_df['date'] = pd.to_datetime(results_df['date']) # %% ### US STATISTICS (county level) # Import shapefile of US states and states_shp = gpd.read_file( ROOT_DIR.joinpath("data/tl_2017_us_states/tl_2017_us_state.shp")) states_shp = states_shp[["NAME", "geometry"]].rename(columns={'NAME': 'State'})
import datadotworld as d td = intro_dataset = d.load_dataset('rfabbri/test1') q = """SELECT * WHERE {?s ?p ?o}""" q2 = """ PREFIX po: <http://purl.org/socialparticipation/po/> SELECT ?s WHERE {?s a po:Participant} """ r = d.query('rfabbri/test1', q2, query_type='sparql')
def download_dataframe(): query = """SELECT does_this_tweet_contain_hate_speech, tweet_text FROM twitter_hate_speech_classifier_dfe_a845520""" result = dw.query('crowdflower/hate-speech-identification', query) df = result.dataframe return df
# datadotworld module has been imported as dw import datadotworld as dw ## Complete the SQL query to select state, the count of farmers markets (fmid), and average obesity rate from agriculture.`national-farmers-markets`.export, LEFT JOINED against health.`obesity-by-state-2014`.adult_obese on state and location sql_query = "SELECT state, count(fmid) as count, Avg(obesity.Value) as obesityAvg FROM Export LEFT JOIN health.`obesity-by-state-2014`.`adult_obese` as obesity ON state = obesity.location GROUP BY state ORDER BY count desc" ## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/agriculture/national-farmers-markets` dataset. Assign the results to a `queryResults` variable. queryResults = dw.query('https://data.world/agriculture/national-farmers-markets', sql_query) ## Use the dataframes property of the resulting query to create a dataframe variable named `stateStats` stateStats = queryResults.dataframe ## Plot the stateStats results using state as the x-axis (matplotlib is already imported) stateStats.plot(x='state') plt.show()
def join_population_all(df): # Load population data population_query = dw.query('fryanpan13/covid-tracking-racial-data', 'SELECT * FROM population_data') population_df = population_query.dataframe population_df = population_df.rename( columns={ 'state': COL_STATE, 'state_name': 'State Name', 'race': COL_RACE_ETHNICITY, 'dataset': 'Dataset', 'geo_state_name': 'Geo State Name', 'population': COL_POPULATION }) population_df[COL_POPULATION] = population_df[COL_POPULATION].astype( 'Int64') print('unknown ethnicity data') print(population_df[population_df[COL_RACE_ETHNICITY] == 'Unknown Ethnicity']) # Load region data region_query = dw.query('fryanpan13/covid-tracking-racial-data', 'SELECT * FROM regions') region_df = region_query.dataframe region_df = region_df.rename(columns={'state': COL_STATE, 'region': 'Region', 'subregion': 'Subregion'}) \ .drop(columns=['state_name']) # Load expected deaths data expected_deaths_query = dw.query('fryanpan13/covid-tracking-racial-data', 'SELECT * FROM expected_deaths') expected_deaths_df = expected_deaths_query.dataframe[[ 'state', 'race', 'dataset', 'expected_deaths' ]] expected_deaths_df = expected_deaths_df.rename( columns={ 'state': COL_STATE, 'dataset': COL_DATASET, 'race': COL_RACE_ETHNICITY, 'expected_deaths': 'Expected Deaths' }) expected_deaths_df['Expected Deaths'] = expected_deaths_df[ 'Expected Deaths'].astype('float') population_df = population_df.set_index([COL_STATE]) region_df = region_df.set_index([COL_STATE]) population_df = population_df.merge(region_df, on=[COL_STATE]) population_index = [COL_DATASET, COL_STATE, COL_RACE_ETHNICITY] population_df = population_df.reset_index( drop=False).set_index(population_index) expected_deaths_df = expected_deaths_df.reset_index( drop=False).set_index(population_index) population_df = population_df.merge(expected_deaths_df, on=population_index) print(population_df.columns) print(df.columns) # Join population return df.join(population_df, population_index)
# datadotworld module has been imported as dw import datadotworld as dw ## Complete the SQL query to select all rows from the `unhcr_all` table where `Year` equals 2010. Assign the query string to a `sql_query` variable. sql_query = "SELECT * FROM `unhcr_all` WHERE Year = 2010" ## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/nrippner/refugee-host-nations` dataset. Assign the results to a `query2010` variable. query2010 = dw.query("https://data.world/nrippner/refugee-host-nations", sql_query) ## Use the dataframe property of the resulting query to create a dataframe variable named `unhcr2010` unhcr2010 = query2010.dataframe ## Print the first 5 rows using the head method. pp.pprint(unhcr2010.head(5))
# userid: berkj # Email: [email protected] # Assignment Number: assignment1 # Honor statement: I pledge on my honor that I have neither given nor # received unauthorized aid on this assignment. # Exercise 8: # datadotworld module has been imported as dw import datadotworld as dw # We've written a SPARQL query for you and assigned it to the `sparql_query` variable: sparql_query = "PREFIX GOT: <https://tutorial.linked.data.world/d/sparqltutorial/> SELECT ?FName ?LName WHERE {?person GOT:col-got-house \"Stark\" . ?person GOT:col-got-fname ?FName . ?person GOT:col-got-lname ?LName .}" # Use the pre-defined SPARQL query to query dataset http://data.world/tutorial/sparqltutorial and return the results to a queryResults variable queryResults = dw.query('http://data.world/tutorial/sparqltutorial', sparql_query, query_type='sparql') # Use the dataframe property of the resulting query to create a dataframe variable named `houseStark` houseStark = queryResults.dataframe # Use pp.pprint() to print the dataframe to the screen. pp.pprint(houseStark)
def get_state_reporting_category(): state_reporting_df = dw.query('fryanpan13/covid-tracking-racial-data', 'SELECT * FROM state_reporting_category') return state_reporting_df.dataframe
def get_age_ranges(): # Load standard weights by age range age_range_query = dw.query('fryanpan13/covid-tracking-racial-data', 'SELECT * FROM standard_population_weights') return age_range_query.dataframe
email : [email protected] [email protected] DataDotWorldBBallStats.name pointspergame assistspergame \ 0 Jon 20.4 1.3 1 Rob 15.5 8.0 2 Sharon 30.1 11.2 3 Alex 8.2 0.5 4 Rebecca 12.3 17.0 5 Ariane 18.1 3.0 6 Bryon 16.0 8.5 7 Matt 13.0 2.1 DataDotWorldBBallTeam.name height handedness 0 Jon 6'5" Right 1 Rob 6'7.5" Left 2 Sharon 6'3" Right 3 Alex 6'2" Right 4 Rebecca 7' Right 5 Ariane 5'8" Left 6 Bryon 7' Right 7 Matt 5'5" Right """ import datadotworld as dw name='jonloyens/an-intro-to-dataworld-dataset' results = dw.query(name, 'SELECT * FROM `DataDotWorldBBallStats`, `DataDotWorldBBallTeam` ' 'WHERE DataDotWorldBBallTeam.Name = DataDotWorldBBallStats.Name') df = results.dataframe print(df)
def refresh(force_update=False): ''' Crawls for new embeddings with the tag and update the index file with new embedding sets, or changes to existing shared embeddings. Args: force_update(bool, opt): Hard reset, re-index ALL available embeddings. If False, only scrape metadata or new embedding sets. Returns: None. Uploads new index_file.csv to indexer on data store. ''' # Retrieve source for data.world:vecshare search results display = Display(visible=0, size=(800, 600)) display.start() wd= webdriver.Firefox(executable_path="/usr/bin/firefox", capabilities= {"marionette": False }) page_num, set_count, sets = 1, 1000, [] while set_count > len(sets): wd.get(info.DATASETS_URL + "?page="+str(page_num)) try: WebDriverWait(wd,5).until(EC.visibility_of_element_located((By.CLASS_NAME, info.DW_CLASS_TAG))) except: pass soup = BeautifulSoup(wd.page_source, 'lxml') set_txt = soup.find('h1','TopicView__headline___2_0-1').text set_count = [int(s) for s in set_txt.split() if s.isdigit()][0] sets.extend([s["href"][1:] for s in soup.find_all('a', info.DW_CLASS_TAG)]) page_num += 1 dw_api = dw.api_client() wd.close() print ("Found " + str(len(sets)) + " sets with the " + info.EMB_TAG + " tag.") embeddings, prev_indexed, updated = [], [], False if not force_update: prev_query = dw.query(info.INDEXER, 'SELECT dataset_name, embedding_name FROM '+ info.INDEX_FILE).dataframe for ind, row in prev_query.iterrows(): prev_indexed.append("/".join(row.values)) for set_name in sets: curr_set = dw.load_dataset(set_name,force_update = True) # Embedding curr_meta = dw_api.get_dataset(set_name) set_updated = parse(curr_meta['updated']) meta_dict = dict() contrib = curr_meta["owner"] resources = curr_set.describe()['resources'] summary = StringIO(curr_meta["summary"]) for line in summary: for field in line.split(","): for sent in field.split("."): try: meta_field = field.split(":") if len(meta_field) == 2: meta_dict[meta_field[0].strip().lower().replace(" ", "_").replace("-", "_")] = meta_field[1].strip() except: pass for each in curr_meta['files']: emb_name = each['name'][:-4] emb_updated = parse(each['updated']) try: ind_query = 'SELECT last_updated FROM '+ info.INDEX_FILE + \ ' WHERE dataset_name = "'+ set_name +'" and embedding_name = "'+emb_name+'"' query_results = dw.query(info.INDEXER, ind_query).dataframe.iloc[0].values[0] last_indexed = parse(query_results) if emb_updated > set_updated: last_updated = emb_updated else: last_updated = set_updated except: last_updated = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) last_indexed = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) pass # Index if new embedding or if metadata/embedding updated since last Index if (force_update) or (set_name + '/' + emb_name not in prev_indexed) or (last_indexed < last_updated) : try: curr_emb = curr_set.describe(emb_name.lower()) except: continue updated = True emb_dim = len(curr_emb['schema']['fields']) - 1 file_format = curr_emb['format'] try: vocab_size = dw.query(set_name , "SELECT COUNT(text) FROM " + emb_name).dataframe.iloc[0][0] except: vocab_size = "" emb_simset = vecshare.extract(emb_name,'sim_vocab', set_name=set_name, case_sensitive=True,progress=False) score_dict = sim_benchmark._eval_all(emb_simset) temp_0 ='original/'+emb_name.lower()+'.csv' temp_1 =emb_name.lower() for d in resources: if d['name'] == temp_0: try: description = StringIO(d['description']) for line in description: for sent in line.split("."): for field in sent.split(","): meta_field = field.split(":") if len(meta_field) == 2: meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip() except: pass if d['name'] == temp_1: try: description = StringIO(d['description']) for line in description: for sent in line.split('.'): for field in sent.split(","): meta_field = field.split(":") if len(meta_field) == 2: meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip() except: pass print ("Newly Indexed embedding: " + emb_name+ " from dataset " + set_name + ".") meta_dict.update(score_dict) meta_dict.update({ u'embedding_name': emb_name, u"dataset_name": set_name, u"contributor":contrib, u"dimension":emb_dim, u"vocab_size":vocab_size, u"file_format":file_format, u"last_updated": last_updated}) embeddings.append(deepcopy(meta_dict)) else: print ("Re-indexed embedding: " + emb_name+ " from dataset " + set_name + ".") query = 'SELECT * FROM '+ info.INDEX_FILE + ' WHERE dataset_name = "'+ \ set_name +'" and embedding_name = "'+ emb_name +'"' prev_row = dw.query(info.INDEXER, query).dataframe embeddings.extend(prev_row.to_dict(orient='records')) with io.open(info.INDEX_FILE_PATH, 'w', encoding="utf-8") as ind: meta_header = set().union(*embeddings) csv_writer = csv.DictWriter(ind, fieldnames = meta_header) csv_writer.writeheader() for emb in embeddings: csv_writer.writerow(emb) print ("Updating index file at " + info.INDEXER_URL) dw_api.upload_files(info.INDEXER, info.INDEX_FILE_PATH) if updated: #_emb_rank() print ("Updating avg_rank signatures") avgrank_refresh() return updated else: return False
# userid: berkj # Email: [email protected] # Assignment Number: assignment1 # Honor statement: I pledge on my honor that I have neither given nor # received unauthorized aid on this assignment. # Exercise 6: # datadotworld module has been imported as dw import datadotworld as dw ## Complete the SQL query to select all rows from the `unhcr_all` table where `Year` equals 2010. Assign the query string to a `sql_query` variable. sql_query = "SELECT * FROM `unhcr_all` WHERE Year = 2010" ## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/nrippner/refugee-host-nations` dataset. Assign the results to a `query2010` variable. query2010 = dw.query('https://data.world/nrippner/refugee-host-nations', sql_query) ## Use the dataframe property of the resulting query to create a dataframe variable named `unhcr2010` unhcr2010 = query2010.dataframe ## Print the first 5 rows using the head method. pp.pprint(unhcr2010.head(5))
def query(words, emb_name, set_name=None, case_sensitive=False, download=False, vs_format=None): """Query a set of word vectors from an indexed embedding. Args: words (List of strings): The set of word vectors being queried emb_name (str): The embedding from which word vectors are being queried set_name (str, opt): Name of dataset being queried (format: owner/id) case_sensitive (bool, opt): Flag for matching exact case in query download(bool,opt): Flag for saving query results Returns: Pandas Dataframe, each row specifying a word vector. """ # Set names should have '-' and file names should use "_" def partial_query(args): return dw.query(args[0], args[1]).dataframe if case_sensitive: title = 'text' else: words = [word.lower() for word in words] title = 'lower(text)' set_name, vs_format = _error_check(emb_name, set_name=set_name, vs_format=vs_format) query_list, proc_cnt = [], 16 ind_results, combined_vecs = pd.DataFrame(), pd.DataFrame() multiproc = pp.ProcessPool(proc_cnt) if vs_format == 'large': try: ind_query = 'SELECT * FROM ' + emb_name + ' where ' + title if len(words) == 1: cond = '"' + words[0] + '"' ind_results = dw.query(set_name, ind_query + cond).dataframe else: for i in range(0, len(words), 400): query_words = words[i:i + 400] query_list.append([ set_name, ind_query + ' in' + str(tuple(query_words)) ]) word_index = multiproc.map(partial_query, query_list) word_index = [word for word in word_index] for each in word_index: ind_results = ind_results.append(each) except: RuntimeError( "Embedding is formatted improperly. Check upload at: " + set_name) num_appx = "SELECT app_num FROM " + info.INDEX_FILE + " WHERE embedding_name = " + emb_name + " and dataset_name = " + set_name app_count = dw.query(info.INDEXER, num_appx).dataframe.iloc[0][0] query_list = [] if vs_format == 'large': for each in range(app_count): base_query = 'SELECT * FROM ' + emb_name.lower() + "_appx" + str( each) + ' where ' + title ind_appcnt = (ind_results[(ind_results['app_setname'] == set_name + "-appx" + str(each))])['text'] for i in range(0, len(ind_appcnt), 400): if len(words) > 1: cond = ' in' + str(tuple(words[i:i + 400])) else: cond = ' = "' + words[0] + '"' query_list.append( [set_name + "-appx" + str(each), base_query + cond]) else: base_query = 'SELECT * FROM ' + emb_name.lower().replace( '-', "_") + ' where ' + title for i in range(0, len(words), 400): if len(words) > 1: cond = ' in' + str(tuple(words[i:i + 400])) else: cond = ' = "' + words[0] + '"' query_list.append([set_name, base_query + cond]) try: word_vecs = multiproc.map(partial_query, query_list) except: import pdb pdb.set_trace() RuntimeError("Improper Query: " + query_list[0]) for each in word_vecs: combined_vecs = combined_vecs.append(each) if download == True: with io.open(emb_name + '_query.csv', 'wb', encoding='utf-8') as extract_csv: extract_emb.to_csv(extract_csv, encoding='utf-8', index=False) multiproc.terminate() return combined_vecs
from pymake.utils.common.prettymessaging import PrettyMessaging import datadotworld pm = PrettyMessaging('Workshop-Spark') results = datadotworld.query('jrm/traffic-violations', "SELECT * FROM traffic_violations") results_df = results.dataframe pm.print_dict(results_df) results_df.to_csv('data/raw.csv', index=False)
def partial_query(args): return dw.query(args[0], args[1]).dataframe
how='left', left_on='state', right_on='stusab') ## Add a 'citystate' column to the merged_dataframe dataframe, populating it with the concatinated values from the 'city' and 'state_name' columns, separated by ', '. merged_dataframe['citystate'] = merged_dataframe[ 'city'] + ',' + merged_dataframe['state_name'] ## Print first 5 rows of merged_dataframe pp.pprint(merged_dataframe.head()) ## Complete the SQL query to select all rows from the `unhcr_all` table where `Year` equals 2010. Assign the query string to a `sql_query` variable. sql_query = "SELECT * FROM `unhcr_all` WHERE Year = 2010" ## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/nrippner/refugee-host-nations` dataset. Assign the results to a `query2010` variable. query2010 = dw.query('https://data.world/nrippner/refugee-host-nations', query=sql_query) ## Use the dataframe property of the resulting query to create a dataframe variable named `unhcr2010` unhcr2010 = query2010.dataframe ## Print the first 5 rows using the head method. pp.pprint(unhcr2010.head()) ## Complete the SQL query to select state, the count of farmers markets (fmid), and average obesity rate from agriculture.`national-farmers-markets`.export, LEFT JOINED against health.`obesity-by-state-2014`.adult_obese on state and location sql_query = "SELECT state, count(FMID) as count, Avg(obesity.value) as obesityAvg FROM export LEFT JOIN health.`obesity-by-state-2014`.`adult_obese` as obesity ON state = obesity.location GROUP BY state ORDER BY count desc" ## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/agriculture/national-farmers-markets` dataset. Assign the results to a `queryResults` variable. queryResults = dw.query( "https://data.world/agriculture/national-farmers-markets", sql_query) ## Use the dataframes property of the resulting query to create a dataframe variable named `stateStats`
# userid: berkj # Email: [email protected] # Assignment Number: assignment1 # Honor statement: I pledge on my honor that I have neither given nor # received unauthorized aid on this assignment. # Exercise 6: # datadotworld module has been imported as dw import datadotworld as dw ## Complete the SQL query to select state, the count of farmers markets (fmid), and average obesity rate from agriculture.`national-farmers-markets`.export, LEFT JOINED against health.`obesity-by-state-2014`.adult_obese on state and location sql_query = "SELECT state, count(fmid) as count, Avg(obesity.Value) as obesityAvg FROM Export LEFT JOIN health.`obesity-by-state-2014`.`adult_obese` as obesity ON state = obesity.location GROUP BY state ORDER BY count desc" ## Use the `query` method of the datadotworld module to run the `sql_query` against the `https://data.world/agriculture/national-farmers-markets` dataset. Assign the results to a `queryResults` variable. queryResults = dw.query('https://data.world/agriculture/national-farmers-markets', sql_query) ## Use the dataframes property of the resulting query to create a dataframe variable named `stateStats` stateStats = queryResults.dataframe ## Plot the stateStats results using state as the x-axis (matplotlib is already imported) stateStats.plot(x='state') plt.show()
import datetime from datetime import date from datetime import timedelta from plotnine import * # open and format data world sql query. file = open("C:/Users/Riesser/Documents/python_popup/dw_pull.txt") line = file.read().replace("\n", " ") #remove white space and create a string value line = ' '.join(line.split()) file.close() #run query with "line" variable results = dw.query('associatedpress/johns-hopkins-coronavirus-case-tracker', line) df = results.dataframe #read temp file with data for testing # df = pd.read_csv('temp.csv') # df = df.drop('Unnamed: 0', 1) #create variable and column for today's date today = date.today() df['recorded_date'] = today old_data_date = today - datetime.timedelta(days=2) # append dataset into older data df.to_csv('C:/Users/Riesser/Documents/python_popup/df_storage.csv', mode='a',