def parse_tableau(file_contents: str):
    """Parse a Tableau response blob into parsed site records.

    This is a weird blob containing *two* JSON encoded dictionaries.
    Each is preceded by its length in bytes, but using a regex (instead of
    counting) is simple enough. Follows the approach in tableau-scraping.
    See the links below:
    https://github.com/bertrandmartel/tableau-scraping/blob/9dba25af057ac29f921a75df374943060ab79b0a/tableauscraper/TableauScraper.py#L77-L84
    https://github.com/bertrandmartel/tableau-scraping/blob/9dba25af057ac29f921a75df374943060ab79b0a/tableauscraper/dashboard.py#L35

    Args:
        file_contents: Raw text of the Tableau response.

    Returns:
        A lazy generator yielding ``tableau_item_to_parsed_site(row)`` for
        every row of the "List" worksheet, where ``row`` is a dict mapping
        each column name to that row's value.

    Raises:
        ValueError: If ``file_contents`` does not contain the two
            length-prefixed JSON blobs.
    """
    # NOTE: re.MULTILINE only changes how ^ and $ match, so it has no effect
    # on this pattern; presumably kept to mirror tableau-scraping.
    info_and_data = re.search(r"\d+;({.*})\d+;({.*})", file_contents, re.MULTILINE)
    if info_and_data is None:
        # Fail with a clear message instead of an AttributeError on None.group.
        raise ValueError(
            "file contents do not look like a Tableau response: "
            "expected two length-prefixed JSON blobs"
        )

    # Only the second blob (the data) is needed; the first (the info) is ignored.
    data = json.loads(info_and_data.group(2))
    presModelMap = data["secondaryInfo"]["presModelMap"]
    dataSegments = presModelMap["dataDictionary"]["presModelHolder"][
        "genDataDictionaryPresModel"
    ]["dataSegments"]

    full_data = utils.getDataFull(presModelMap, dataSegments)
    indices_info = utils.getIndicesInfo(presModelMap, "List")
    data_dict = utils.getData(full_data, indices_info)

    # Transpose columns to rows (tableau-scraping uses pandas, but we don't
    # strictly need to do that), i.e.
    # {'a': [a1, a2, a3], 'b': [b1, b2, b3]}
    #   --> [{'a': a1, 'b': b1}, {'a': a2, 'b': b2}, {'a': a3, 'b': b3}]
    column_names = tuple(data_dict.keys())
    transposed_data = (
        dict(zip(column_names, row_values))
        for row_values in zip(*data_dict.values())
    )

    return (tableau_item_to_parsed_site(entry) for entry in transposed_data)
def parse_tableau(file_contents: str):
    """Parse a Tableau response blob into a list of parsed site records.

    This is a weird blob containing *two* JSON encoded dictionaries.
    Each is preceded by its length in bytes, but using a regex (instead of
    counting) is simple enough. Follows the approach in tableau-scraping.
    See the links below:
    https://github.com/bertrandmartel/tableau-scraping/blob/9dba25af057ac29f921a75df374943060ab79b0a/tableauscraper/TableauScraper.py#L77-L84
    https://github.com/bertrandmartel/tableau-scraping/blob/9dba25af057ac29f921a75df374943060ab79b0a/tableauscraper/dashboard.py#L35

    Args:
        file_contents: Raw text of the Tableau response.

    Returns:
        A list with one ``tableau_item_to_parsed_site(entry)`` result per
        pair of adjacent rows in the "Vaccination Sites" worksheet. Each
        ``entry`` is a ``(main_data, extra_data)`` tuple: ``main_data`` holds
        every column of the first row of the pair, ``extra_data`` holds the
        "Dimension-value" and "Value-alias" columns of the second row.

    Raises:
        ValueError: If ``file_contents`` does not contain the two
            length-prefixed JSON blobs.
    """
    # NOTE: re.MULTILINE only changes how ^ and $ match, so it has no effect
    # on this pattern; presumably kept to mirror tableau-scraping.
    info_and_data = re.search(r"\d+;({.*})\d+;({.*})", file_contents, re.MULTILINE)
    if info_and_data is None:
        # Fail with a clear message instead of an AttributeError on None.group.
        raise ValueError(
            "file contents do not look like a Tableau response: "
            "expected two length-prefixed JSON blobs"
        )

    # Only the second blob (the data) is needed; the first (the info) is ignored.
    data = json.loads(info_and_data.group(2))
    presModelMap = data["secondaryInfo"]["presModelMap"]
    dataSegments = presModelMap["dataDictionary"]["presModelHolder"][
        "genDataDictionaryPresModel"
    ]["dataSegments"]

    full_data = utils.getDataFull(presModelMap, dataSegments)
    indices_info = utils.getIndicesInfo(presModelMap, "Vaccination Sites")
    data_dict = utils.getData(full_data, indices_info)

    num_entries = len(data_dict["Site-value"])

    # Transpose columns to rows (tableau-scraping uses pandas, but we don't
    # strictly need to do that).
    # Rows are actually duplicated; some have map, some have website.
    # Rows are consumed two at a time, so this assumes an even row count
    # (the i + 1 lookup has no partner row otherwise).
    entries = [
        (
            {column: values[i] for column, values in data_dict.items()},
            {
                "Dimension-value": data_dict["Dimension-value"][i + 1],
                "Value-alias": data_dict["Value-alias"][i + 1],
            },
        )
        for i in range(0, num_entries, 2)
    ]

    return [tableau_item_to_parsed_site(entry) for entry in entries]
def parse_tableau(file_contents: str):
    """Parse a Tableau response blob into parsed site records.

    This is a weird blob containing *two* JSON encoded dictionaries.
    Each is preceded by its length in bytes, but using a regex (instead of
    counting) is simple enough. Follows the approach in tableau-scraping.
    See the links below:
    https://github.com/bertrandmartel/tableau-scraping/blob/9dba25af057ac29f921a75df374943060ab79b0a/tableauscraper/TableauScraper.py#L77-L84
    https://github.com/bertrandmartel/tableau-scraping/blob/9dba25af057ac29f921a75df374943060ab79b0a/tableauscraper/dashboard.py#L35

    Args:
        file_contents: Raw text of the Tableau response.

    Returns:
        A lazy generator yielding ``tableau_item_to_parsed_site(pair)`` for
        each pair of adjacent rows of the "Vaccination Sites" worksheet that
        remain after dropping rows whose "Site-value" is ``"%null%"``. Each
        ``pair`` is a 2-tuple of row dicts (column name -> value). A trailing
        row without a partner is silently dropped by ``zip``.

    Raises:
        ValueError: If ``file_contents`` does not contain the two
            length-prefixed JSON blobs.
    """
    # NOTE: re.MULTILINE only changes how ^ and $ match, so it has no effect
    # on this pattern; presumably kept to mirror tableau-scraping.
    info_and_data = re.search(r"\d+;({.*})\d+;({.*})", file_contents, re.MULTILINE)
    if info_and_data is None:
        # Fail with a clear message instead of an AttributeError on None.group.
        raise ValueError(
            "file contents do not look like a Tableau response: "
            "expected two length-prefixed JSON blobs"
        )

    # Only the second blob (the data) is needed; the first (the info) is ignored.
    data = json.loads(info_and_data.group(2))
    presModelMap = data["secondaryInfo"]["presModelMap"]
    dataSegments = presModelMap["dataDictionary"]["presModelHolder"][
        "genDataDictionaryPresModel"
    ]["dataSegments"]

    full_data = utils.getDataFull(presModelMap, dataSegments)
    indices_info = utils.getIndicesInfo(presModelMap, "Vaccination Sites")
    data_dict = utils.getData(full_data, indices_info)

    # Transpose columns to rows (tableau-scraping uses pandas, but we don't
    # strictly need to do that), i.e.
    # {'a': [a1, a2, a3], 'b': [b1, b2, b3]}
    #   --> [{'a': a1, 'b': b1}, {'a': a2, 'b': b2}, {'a': a3, 'b': b3}]
    column_names = tuple(data_dict.keys())
    transposed_data = (
        dict(zip(column_names, row_values))
        for row_values in zip(*data_dict.values())
    )

    # Data contains at least one bad value; filter it out.
    # See https://github.com/CAVaccineInventory/vaccine-feed-ingest/issues/621
    filtered_transposed_data = (
        row for row in transposed_data if row["Site-value"] != "%null%"
    )

    # Adjacent rows are actually duplicates; some have map, some have website.
    # Combine into one. Passing the SAME iterator twice makes zip pull two
    # consecutive rows per output tuple.
    doubled_filtered_transposed_data = zip(
        filtered_transposed_data, filtered_transposed_data
    )

    return (
        tableau_item_to_parsed_site(entry)
        for entry in doubled_filtered_transposed_data
    )
def test_getDataFull():
    """getDataFull merges the fixture's data segments without mutating them."""
    pres_model = utils.getPresModelVizData(data)
    merged = utils.getDataFull(pres_model, {})

    # check the extended list is not modified
    segments = data["secondaryInfo"]["presModelMap"]["dataDictionary"][
        "presModelHolder"
    ]["genDataDictionaryPresModel"]["dataSegments"]
    first_segment_values = segments["0"]["dataColumns"][0]["dataValues"]
    second_segment_values = segments["1"]["dataColumns"][0]["dataValues"]
    assert len(first_segment_values) == 6
    assert len(second_segment_values) == 3

    # Exactly the two data types from the fixture are present.
    assert len(merged.keys()) == 2
    assert "cstring" in merged
    assert "real" in merged

    # Values from every segment are concatenated per data type.
    cstring_values = merged["cstring"]
    real_values = merged["real"]
    assert len(cstring_values) == 9
    assert len(real_values) == 5
    assert cstring_values == ["1", "2", "3", "4", "5", "6", "7", "8", "9"]
    assert real_values == [1, 2, 3, 4, 5]
def test_getData():
    """getData resolves the [WORKSHEET1] column indices against the full data."""
    pres_model = utils.getPresModelVizData(data)
    merged = utils.getDataFull(pres_model, {})
    indices = utils.getIndicesInfo(pres_model, "[WORKSHEET1]")

    frame = utils.getData(merged, indices)

    # One entry per worksheet column, keyed "<field>-<value|alias>".
    assert len(frame.keys()) == 2
    assert "[FIELD1]-value" in frame
    assert "[FIELD2]-alias" in frame

    field1_values = frame["[FIELD1]-value"]
    field2_aliases = frame["[FIELD2]-alias"]
    assert len(field1_values) == 4
    assert len(field2_aliases) == 4
    assert field1_values == ["2", "3", "4", "5"]
    assert field2_aliases == ["6", "7", "8", "9"]
def getWorksheet(TS, data, info, worksheet) -> TableauWorksheet:
    """Build a TableauWorksheet for ``worksheet`` from a Tableau response.

    The presentation model is taken from ``data`` when available. Otherwise
    it is taken from ``info``, with story-point index lookup, and the data
    dictionary is fetched from ``data`` if the info model lacks one.

    Args:
        TS: Scraper instance; its ``dataSegments`` are passed to
            ``utils.getDataFull``.
        data: Parsed "data" part of the Tableau response.
        info: Parsed "info" part of the Tableau response.
        worksheet: Name of the worksheet to extract.

    Returns:
        A TableauWorksheet wrapping the extracted values as a DataFrame.
    """
    pres_model_map = utils.getPresModelVizData(data)
    if pres_model_map is not None:
        indices = utils.getIndicesInfo(pres_model_map, worksheet)
    else:
        # No viz data in ``data``: fall back to the info model.
        pres_model_map = utils.getPresModelVizInfo(info)
        indices = utils.getIndicesInfoStoryPoint(pres_model_map, worksheet)
        if "dataDictionary" not in pres_model_map:
            pres_model_map = utils.getPresModelVizDataWithoutViz(data)

    # Both paths end by resolving the full data against the final model.
    full_data = utils.getDataFull(pres_model_map, TS.dataSegments)
    frame_data = utils.getData(full_data, indices)

    # Columns may differ in length: build row-wise, fill the gaps with 0,
    # then transpose so each key becomes a column.
    by_column = pd.DataFrame.from_dict(frame_data, orient="index")
    frame = by_column.fillna(0).T

    return TableauWorksheet(
        scraper=TS,
        originalData=data,
        originalInfo=info,
        worksheetName=worksheet,
        dataFull=full_data,
        dataFrame=frame,
    )
def getWorksheet(TS, data, info, worksheet) -> TableauWorksheet:
    """Build a TableauWorksheet for ``worksheet`` from a Tableau response.

    Args:
        TS: Scraper instance, stored on the returned worksheet.
        data: Parsed "data" part of the response; must contain
            ``["secondaryInfo"]["presModelMap"]``.
        info: Parsed "info" part of the response, stored on the result.
        worksheet: Name of the worksheet to extract.

    Returns:
        A TableauWorksheet wrapping the extracted values as a DataFrame.
    """
    pres_model_map = data["secondaryInfo"]["presModelMap"]
    indices = utils.getIndicesInfo(pres_model_map, worksheet)
    full_data = utils.getDataFull(pres_model_map)
    frame_data = utils.getData(full_data, indices)

    # Columns may differ in length: build row-wise, fill the gaps with 0,
    # then transpose so each key becomes a column.
    by_column = pd.DataFrame.from_dict(frame_data, orient="index")
    frame = by_column.fillna(0).T

    return TableauWorksheet(
        scraper=TS,
        originalData=data,
        originalInfo=info,
        worksheetName=worksheet,
        dataFrame=frame,
    )
def _prompt_for_index(prompt, empty_message):
    """Read an index from stdin; raise Exception(empty_message) if blank."""
    answer = input(prompt)
    if not answer:
        raise Exception(empty_message)
    return int(answer)


def get(TS, data, info, logger):
    """Interactively pick a worksheet, field and value, then select it.

    The user is asked (via ``input``) to choose a field of the selected
    worksheet and then one of that field's values; the chosen value is sent
    with ``api.select`` and the response is converted to a dashboard.

    Args:
        TS: Scraper instance used for the select call.
        data: Parsed "data" part of the Tableau response.
        info: Parsed "info" part of the Tableau response.
        logger: Logger used to list the available choices.

    Returns:
        An empty TableauDashboard when no worksheet is selected, otherwise
        the result of ``dashboard.getCmdResponse`` for the select call.

    Raises:
        Exception: If a prompt is answered with an empty string, or if no
            data could be extracted for the chosen field.
    """
    worksheets = utils.selectWorksheet(data, logger, single=True)
    if len(worksheets) == 0:
        return TableauDashboard(
            scraper=TS, originalData=data, originalInfo=info, data=[]
        )
    selectedWorksheet = worksheets[0]

    # Step 1: choose which field of the worksheet to select on.
    presModel = utils.getPresModelVizData(data)
    fields = utils.getIndicesInfo(presModel, selectedWorksheet, noSelectFilter=False)
    for position, candidate in enumerate(fields):
        logger.info(f"[{position}] {candidate['fieldCaption']}")
    field_index = _prompt_for_index(
        "select field by index : ", "you must select at least one field"
    )
    field = fields[field_index]
    logger.info(f"you have selected {field['fieldCaption']}")

    # Step 2: list the values of the first column extracted for that field.
    dataFull = utils.getDataFull(presModel)
    frameData = utils.getData(dataFull, [field])
    column_names = list(frameData.keys())
    if not column_names:
        raise Exception("no data extracted")
    values = frameData[column_names[0]]
    for position, candidate in enumerate(values):
        logger.info(f"[{position}] {candidate}")
    value_index = _prompt_for_index(
        "select value by index : ", "you must select at least one value"
    )
    value = values[value_index]
    logger.info(f"you have selected {value}")

    # The select call is sent the displayed index plus one.
    r = api.select(TS, selectedWorksheet, [value_index + 1])
    return dashboard.getCmdResponse(TS, r, logger)