def deleteDataWorld(tbl_def):
    '''
    Removes table from data.world

    tbl_def is {
        "owner_id": DW_USER,
        "dw_title": table_name,
        "gh_url": GH_URL + table_name,
        "visibility": "OPEN",
        "license": "Public Domain",
        "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
        "dw_url": DW_DB_URL + table_name + '.csv',
        "dw_dataset_id": DW_USER + "/" + table_name
    }
    '''
    dw.api_client().delete_dataset(tbl_def["dw_dataset_id"])
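# Usage sketch (hypothetical values): deleteDataWorld only reads the
# "dw_dataset_id" key, formed as "<owner>/<table_name>" per the docstring.
tbl_def = {"dw_dataset_id": "example-user/example_table"}
deleteDataWorld(tbl_def)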
def sync_dataset(DATASET_URL=DATASET_URL):
    sys.stdout.write("\n> Syncing files at: https://data.world/" + DATASET_URL + " -> ")
    with Spinner():
        api_client = dw.api_client()
        api_client.sync_files(DATASET_URL)
    print("\n")
def upload(set_name, emb_path="", metadata={}, summary=None): '''Upload a new embedding or update files and associated metadata. Args: set_name (str): Name of the dataset being created (format: owner/id) emb_path (str): Absolute path to local embedding metadata (dict, opt): Dictionary in the format '{metadata field: value}' summary (str, opt): Optional description of embedding and source Returns: None (Create a new/updated data.world dataset with the shared embedding) ''' if os.path.getsize(emb_path) > 1000000000: raise ValueError( "Uploads only supported for embeddings up to 1GB. Consider reducing file size with vecshare.format()." ) dw_api = dw.api_client() metadata_str = "" for key, val in metadata.items(): metadata_str += str(key) + ":" + str(val) + ", " try: usr_name, title = set_name.split("/") dw_api.create_dataset(usr_name, title = title, summary = metadata_str,\ description = summary ,license = 'Public Domain', tags = ['vecshare'], visibility = 'OPEN') except: dw_api.update_dataset(set_name, summary=metadata_str, description=summary) if emb_path: dw_api.upload_files(set_name, [emb_path])
def send_to_dw(doc):
    client = dw.api_client()

    username = '******'
    title = doc.find_first_value("Root.Title")
    key = join(username, slugify(title))

    d = dict(title=doc.find_first_value("Root.Title"),
             description=doc.find_first_value("Root.Description"),
             summary=doc.markdown,
             visibility='OPEN',
             files=get_resource_urls(doc))

    try:
        ds = client.get_dataset(key)  # Raise an error if the dataset does not exist
        ds = client.replace_dataset(key, **d)
        ds = client.get_dataset(key)
    except RestApiError:
        ds = client.create_dataset('ericbusboom', **d)
        ds = client.get_dataset(key)
def avgrank_refresh(tolerance=0.60, sig_cnt=5000, stopword_cnt=100):
    '''
    If there are changes to the set of shared embeddings, refresh the AvgRank signature.
    Generate a set of at most `stopword_cnt` stopwords that occur in at least
    `tolerance` * emb_cnt embeddings. Generate signatures for the embeddings using
    the `sig_cnt` most common remaining words.

    Args:
        tolerance (float): Frequency at which a stopword must occur
        sig_cnt (int): Size of AvgRank signature vocab_size
        stopword_cnt (int): Max number of stopwords

    Returns:
        None. Uploads new ar_sig.txt (serialized signatures) to data store.
    '''
    stopwords, emb_vocab, signatures = [], {}, {}
    DW_API_TOKEN = os.environ['DW_AUTH_TOKEN']
    #emb_list = dw.query(info.INDEXER, 'SELECT embedding_name, dataset_name FROM ' + info.INDEX_FILE).dataframe
    emb_list = pd.read_csv(info.INDEX_FILE_PATH)
    threshold = int(0.5 + tolerance * emb_list.shape[0])

    for ind, row in emb_list.iterrows():
        if row['vs_format'] == 'large':
            emb_name, set_name = row['embedding_name'] + "-appx0", row['dataset_name'] + "-appx0"
        else:
            emb_name, set_name = row['embedding_name'], row['dataset_name']

        query_url = "https://query.data.world/file_download/" + set_name + "/" + emb_name + '.csv'
        payload, headers = "{}", {'authorization': 'Bearer ' + DW_API_TOKEN}
        emb_text = StringIO(requests.request("GET", query_url, data=payload, headers=headers).text)
        # nrows must be an integer (the original passed the float 1.5 * sig_cnt)
        emb_df = pd.read_csv(emb_text, nrows=int(1.5 * sig_cnt))

        wordlist = emb_df.iloc[0:2 * stopword_cnt, 0].values
        signatures.update({emb_name: emb_df.iloc[:, 0].values})
        for word in wordlist:
            word = str(word).lower()
            if word not in emb_vocab:
                emb_vocab.update({word: 1})
            else:
                emb_vocab[word] += 1

    stopwords.extend(list(string.digits))
    stopwords.extend(list(string.punctuation))
    for key in emb_vocab:
        if emb_vocab[key] >= threshold:
            stopwords.append(key)

    for emb_name, emb_sig in signatures.items():
        emb_sig = emb_sig.tolist()
        for word in stopwords:
            if word in emb_sig:
                emb_sig.remove(word)
        emb_sig = emb_sig[:sig_cnt]
        print("Generated AvgRank signature for: " + emb_name)
        signatures.update({emb_name: emb_sig})

    signatures.update({'stopwords': stopwords})
    pickle.dump(signatures, io.open(info.AR_SIG_PATH, "wb"))

    dw_api = dw.api_client()
    print("Uploading AvgRank signatures")
    dw_api.upload_files(info.SIGNATURES, info.AR_SIG_PATH)
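# Usage sketch: avgrank_refresh() expects DW_AUTH_TOKEN in the environment and a
# locally cached index file at info.INDEX_FILE_PATH before it is called.
# (The token value below is a placeholder, not a real credential.)
os.environ['DW_AUTH_TOKEN'] = '<your data.world API token>'
avgrank_refresh(tolerance=0.60, sig_cnt=5000, stopword_cnt=100)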
def updateDataWorld(tbl_def):
    '''
    Updates an existing data.world dataset created from a csv file

    tbl_def is {
        "owner_id": DW_USER,
        "dw_title": table_name,
        "gh_url": GH_URL + table_name,
        "visibility": "OPEN",
        "license": "Public Domain",
        "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
        "dw_url": DW_DB_URL + table_name + '.csv',
        "dw_dataset_key": DW_USER + "/" + table_name
    }
    '''
    # owner_id=tbl_def["owner_id"],
    dw.api_client().update_dataset(tbl_def["dw_dataset_key"],
                                   title=tbl_def["dw_title"],
                                   visibility=tbl_def["visibility"],
                                   license=tbl_def['license'],
                                   files=tbl_def["files"])
def loadDataWorld(tbl_def):
    '''
    Takes a csv file and imports it into dataworld

    tbl_def is {
        "owner_id": DW_USER,
        "dw_title": table_name,
        "gh_url": GH_URL + table_name,
        "visibility": "OPEN",
        "license": "Public Domain",
        "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
        "dw_url": DW_DB_URL + table_name + '.csv'
    }
    '''
    # api_client.create_dataset(
    dw.api_client().create_dataset(owner_id=tbl_def["owner_id"],
                                   title=tbl_def["dw_title"],
                                   description=tbl_def["dw_desc"],
                                   visibility=tbl_def["visibility"],
                                   license=tbl_def['license'],
                                   files=tbl_def["files"])
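# Usage sketch (hypothetical values): build the tbl_def dict from the docstring
# and create the dataset. Note that the function also reads a "dw_desc" key,
# which the docstring example omits.
DW_USER = "example-user"
GH_URL = "https://raw.githubusercontent.com/example-user/example-repo/master/"
DW_DB_URL = "https://data.world/example-user/"
table_name = "example_table"
tbl_def = {
    "owner_id": DW_USER,
    "dw_title": table_name,
    "dw_desc": "Example table loaded from GitHub",
    "gh_url": GH_URL + table_name,
    "visibility": "OPEN",
    "license": "Public Domain",
    "files": {table_name + '.csv': {"url": GH_URL + table_name + '.csv'}},
    "dw_url": DW_DB_URL + table_name + '.csv',
}
loadDataWorld(tbl_def)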
def package_info(doc):
    client = dw.api_client()

    username = '******'
    title = doc.find_first_value("Root.Title")
    key = join(username, slugify(title))

    try:
        ds = client.get_dataset(key)
        prt(json.dumps(ds, indent=4))
    except RestApiError as e:
        err(e)
def submit_online_status(status_list: List[models.Status]):
    """
    After the online status of every endpoint has been ascertained, submit that
    information to the data.world Stream API.
    """
    logger.info('Submitting results of check to data.world...')
    api_client: RestApiClient = dw.api_client()
    for status in status_list:
        # FIXME we get error 429 here and I haven't found a way to send
        # multiple records at a time.
        time.sleep(1)
        api_client.append_records(
            dataset_key=settings.DATADOTWORLD['dataset'],
            stream_id=settings.DATADOTWORLD['status-stream'],
            body=dataclasses.asdict(status))
    return status_list
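# Hypothetical sketch of the pieces submit_online_status assumes (all names and
# field values below are invented, not part of the original module): models.Status
# is a dataclass whose fields match the stream schema, and settings.DATADOTWORLD
# names the target dataset and stream id.
import dataclasses

@dataclasses.dataclass
class Status:
    endpoint: str
    is_online: bool
    checked_at: str

DATADOTWORLD = {
    'dataset': 'example-user/endpoint-monitoring',
    'status-stream': 'status',
}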
def upload(set_name, emb_path="", metadata={}, summary="", sep=","): '''Upload a new embedding or update files and associated metadata. Args: set_name (str): Name of the dataset being created (format: owner/id) emb_path (str): Absolute path to local embedding metadata (dict, opt): Dictionary in the format '{metadata field: value}' summary (str, opt): Optional description of embedding and source Returns: None (Create a new/updated data.world dataset with the shared embedding) ''' dw_api = dw.api_client() set_name = set_name.replace(' ', '-').replace('_', '-') metadata_str, dimensions, app_num = "", 0, 0 usr_name, title = set_name.split("/") emb_name = os.path.basename(emb_path) for key, val in metadata.items(): metadata_str += str(key) + ":" + str(val) + ", " with io.open(emb_path, 'r', encoding='utf-8') as f: first_row = f.readline().split(sep) header = ['text'] header.extend([u"d" + str(n) for n in range(len(first_row) - 1)]) if os.path.getsize(emb_path) > 1E9 or True: emb_reader = pd.read_csv(emb_path, chunksize=4E5, names=header, encoding='utf-8', sep=sep) index_df = pd.DataFrame() for app_num, emb_chunk in enumerate(emb_reader): app_title = emb_name[:-4].lower().replace(' ', '-').replace( '_', '-') + "-appx" + str(app_num) app_setname = usr_name + "/" + app_title app_fname = app_title + ".csv" words = emb_chunk.ix[:, 0].reset_index(drop=True) app_sets = pd.Series(app_setname, index=np.arange(len(emb_chunk)), name="app_setname") app_file = pd.Series(app_fname, index=np.arange(len(emb_chunk)), name="app_fname") tmp_df = pd.concat((words, app_sets, app_file), axis=1, copy=False) index_df = index_df.append(tmp_df, ignore_index=True) emb_chunk = emb_chunk.round(4) try: dw_api.create_dataset(usr_name, title = app_title, description = summary,\ license = 'Public Domain', tags = ['vecshare appx'], visibility = 'OPEN') except: dw_api.update_dataset(app_setname, description=summary) with dw.open_remote_file(app_setname, app_fname, mode='wb') as app: emb_chunk.to_csv(app, index=False, mode='wb', encoding='utf-8') try: metadata_str += "app_num:" + str(app_num + 1) + ",vs_format:large" dw_api.create_dataset(usr_name, title = title, summary = metadata_str, description = summary,\ license = 'Public Domain', tags = ['vecshare large'], visibility = 'OPEN') except: dw_api.update_dataset( usr_name + '/' + title.lower().replace(' ', '-').replace('_', '-'), summary=metadata_str, description=summary) with dw.open_remote_file(set_name.lower().replace(' ', '-').replace( '_', '-'), emb_name, mode='wb') as index: index_df.to_csv(index, index=False, mode='wb', encoding='utf-8') else: emb = pd.read_csv(emb_path, names=header, encoding='utf-8', sep=sep) try: metadata_str += "app_num:" + str(1) + ",vs_format:small" dw_api.create_dataset(usr_name, title = title, summary = metadata_str, description = summary,\ license = 'Public Domain', tags = ['vecshare small'], visibility = 'OPEN') except: dw_api.update_dataset(set_name, summary=metadata_str, description=summary) with dw.open_remote_file(set_name, emb_name, mode='wb') as index: index_df.to_csv(index, index=False, mode='wb', encoding='utf-8')
import datadotworld as dw
import os

os.environ['DW_AUTH_TOKEN'] = (
    'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiJwcm9kLXVzZXItY2xpZW50OnRyaWxvZ3l'
    'lZCIsImlzcyI6ImFnZW50OnRyaWxvZ3llZDo6ZTVkMTBkNDgtODRmYy00ZTVjLTk'
    'zNTUtMGIwOGMzYjIxNGNlIiwiaWF0IjoxNTAzMTAxNDIzLCJyb2xlIjpbInVzZXJ'
    'fYXBpX3dyaXRlIiwidXNlcl9hcGlfcmVhZCJdLCJnZW5lcmFsLXB1cnBvc2UiOnR'
    'ydWV9.HpopfqxXh0VqNgb1b8tpP6G1bkr-WblRNeS3UlhF-05sSTxx1CHJgRuAjd'
    'nP8MoBIsHsysJANP27ioXqCKChgw')

url = 'trilogyed/dataviz-unit-11-hwk'
download_dir = 'Resources'

if os.path.isdir(download_dir):
    print("Resources Directory Already Exists!")
    print("Please Remove the existing Resources folder and re-run this script")
    exit()

client = dw.api_client()
print("Downloading Data...")
client.download_datapackage(url, download_dir)
print("Download Complete!")
def test_toplevel_api_client(dw_instances, profile):
    assert_that(datadotworld.api_client(),
                equal_to(dw_instances[profile].api_client))
def refresh(force_update=False):
    '''
    Crawls for new embeddings with the tag and updates the index file with new embedding
    sets, or changes to existing shared embeddings.

    Args:
        force_update(bool, opt): Hard reset, re-index ALL available embeddings.
            If False, only scrape metadata for new embedding sets.

    Returns:
        None. Uploads new index_file.csv to indexer on data store.
    '''
    # Retrieve source for data.world:vecshare search results
    display = Display(visible=0, size=(800, 600))
    display.start()
    wd = webdriver.Firefox(executable_path="/usr/bin/firefox", capabilities={"marionette": False})

    page_num, set_count, sets = 1, 1000, []
    while set_count > len(sets):
        wd.get(info.DATASETS_URL + "?page=" + str(page_num))
        try:
            WebDriverWait(wd, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, info.DW_CLASS_TAG)))
        except:
            pass
        soup = BeautifulSoup(wd.page_source, 'lxml')
        set_txt = soup.find('h1', 'TopicView__headline___2_0-1').text
        set_count = [int(s) for s in set_txt.split() if s.isdigit()][0]
        sets.extend([s["href"][1:] for s in soup.find_all('a', info.DW_CLASS_TAG)])
        page_num += 1

    dw_api = dw.api_client()
    wd.close()
    print("Found " + str(len(sets)) + " sets with the " + info.EMB_TAG + " tag.")

    embeddings, prev_indexed, updated = [], [], False
    if not force_update:
        prev_query = dw.query(info.INDEXER, 'SELECT dataset_name, embedding_name FROM ' + info.INDEX_FILE).dataframe
        for ind, row in prev_query.iterrows():
            prev_indexed.append("/".join(row.values))

    for set_name in sets:
        curr_set = dw.load_dataset(set_name, force_update=True)  # Embedding
        curr_meta = dw_api.get_dataset(set_name)
        set_updated = parse(curr_meta['updated'])
        meta_dict = dict()
        contrib = curr_meta["owner"]
        resources = curr_set.describe()['resources']
        summary = StringIO(curr_meta["summary"])
        for line in summary:
            for field in line.split(","):
                for sent in field.split("."):
                    try:
                        meta_field = field.split(":")
                        if len(meta_field) == 2:
                            meta_dict[meta_field[0].strip().lower().replace(" ", "_").replace("-", "_")] = meta_field[1].strip()
                    except:
                        pass

        for each in curr_meta['files']:
            emb_name = each['name'][:-4]
            emb_updated = parse(each['updated'])
            try:
                ind_query = 'SELECT last_updated FROM ' + info.INDEX_FILE + \
                    ' WHERE dataset_name = "' + set_name + '" and embedding_name = "' + emb_name + '"'
                query_results = dw.query(info.INDEXER, ind_query).dataframe.iloc[0].values[0]
                last_indexed = parse(query_results)
                if emb_updated > set_updated:
                    last_updated = emb_updated
                else:
                    last_updated = set_updated
            except:
                last_updated = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
                last_indexed = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
                pass

            # Index if new embedding or if metadata/embedding updated since last Index
            if (force_update) or (set_name + '/' + emb_name not in prev_indexed) or (last_indexed < last_updated):
                try:
                    curr_emb = curr_set.describe(emb_name.lower())
                except:
                    continue
                updated = True
                emb_dim = len(curr_emb['schema']['fields']) - 1
                file_format = curr_emb['format']
                try:
                    vocab_size = dw.query(set_name, "SELECT COUNT(text) FROM " + emb_name).dataframe.iloc[0][0]
                except:
                    vocab_size = ""
                emb_simset = vecshare.extract(emb_name, 'sim_vocab', set_name=set_name, case_sensitive=True, progress=False)
                score_dict = sim_benchmark._eval_all(emb_simset)

                temp_0 = 'original/' + emb_name.lower() + '.csv'
                temp_1 = emb_name.lower()
                for d in resources:
                    if d['name'] == temp_0:
                        try:
                            description = StringIO(d['description'])
                            for line in description:
                                for sent in line.split("."):
                                    for field in sent.split(","):
                                        meta_field = field.split(":")
                                        if len(meta_field) == 2:
                                            meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip()
                        except:
                            pass
                    if d['name'] == temp_1:
                        try:
                            description = StringIO(d['description'])
                            for line in description:
                                for sent in line.split('.'):
                                    for field in sent.split(","):
                                        meta_field = field.split(":")
                                        if len(meta_field) == 2:
                                            meta_dict[meta_field[0].strip().lower().replace(" ", "_")] = meta_field[1].strip()
                        except:
                            pass

                print("Newly Indexed embedding: " + emb_name + " from dataset " + set_name + ".")
                meta_dict.update(score_dict)
                meta_dict.update({
                    u'embedding_name': emb_name,
                    u"dataset_name": set_name,
                    u"contributor": contrib,
                    u"dimension": emb_dim,
                    u"vocab_size": vocab_size,
                    u"file_format": file_format,
                    u"last_updated": last_updated})
                embeddings.append(deepcopy(meta_dict))
            else:
                print("Re-indexed embedding: " + emb_name + " from dataset " + set_name + ".")
                query = 'SELECT * FROM ' + info.INDEX_FILE + ' WHERE dataset_name = "' + \
                    set_name + '" and embedding_name = "' + emb_name + '"'
                prev_row = dw.query(info.INDEXER, query).dataframe
                embeddings.extend(prev_row.to_dict(orient='records'))

    with io.open(info.INDEX_FILE_PATH, 'w', encoding="utf-8") as ind:
        meta_header = set().union(*embeddings)
        csv_writer = csv.DictWriter(ind, fieldnames=meta_header)
        csv_writer.writeheader()
        for emb in embeddings:
            csv_writer.writerow(emb)

    print("Updating index file at " + info.INDEXER_URL)
    dw_api.upload_files(info.INDEXER, info.INDEX_FILE_PATH)

    if updated:
        #_emb_rank()
        print("Updating avg_rank signatures")
        avgrank_refresh()
        return updated
    else:
        return False
def save_to_dw(df, filename):
    file_path = f'/tmp/{filename}'
    df.to_csv(file_path, index=True)

    client = dw.api_client()
    client.upload_files('fryanpan13/covid-tracking-racial-data', files=file_path)
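# Usage sketch (hypothetical DataFrame and filename): write a local CSV to /tmp
# and push it to the hard-coded data.world dataset above.
import pandas as pd

df = pd.DataFrame({'state': ['CA', 'NY'], 'cases': [100, 200]})
save_to_dw(df, 'daily_summary.csv')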