import csv
import logging
from datetime import datetime
from urllib.parse import urlparse

import utils


def write_messages_to_tsv(files, bucket_name, metadata_file=None):
    """
    Consume the subscription and write results to a tsv manifest

    Args:
        files(list(dict)): a list of file metadata dictionaries, e.g.
            {
                "url": "test_url",
                "md5": "test_md5",
                "size": 1
            }
        bucket_name(str): bucket for uploading the manifest to
        metadata_file(str): metadata file for merging
    """
    metadata_info = {}
    # Default fields without merging
    fields = ["url", "size", "md5"]

    # Merge extra metadata info from file
    if metadata_file:
        with open(metadata_file, "rt") as csvfile:
            csv_reader = csv.DictReader(csvfile, delimiter="\t")
            # Build a map with url as the key
            for row in csv_reader:
                if "url" in row:
                    metadata_info[row["url"]] = {
                        k: v for k, v in row.items() if k != "url"
                    }

    # Do the merging if possible, remembering the first merged row so its
    # keys can be used to extend the output fields
    need_merge = False
    first_row_need_merge = None
    for row_num, fi in enumerate(files):
        if fi["url"] in metadata_info:
            need_merge = True
            if first_row_need_merge is None:
                first_row_need_merge = row_num
            for k, v in metadata_info[fi["url"]].items():
                fi[k] = v

    if files and need_merge:
        # Add the merged metadata keys as new fields
        for k in files[first_row_need_merge].keys():
            if k not in ["url", "size", "md5"]:
                fields.append(k)

    if len(files) > 0:
        # Parse the url so the manifest can be named after its host
        parts = urlparse(files[0]["url"])

        # Generate a unique manifest filename
        now = datetime.now()
        current_time = now.strftime("%m_%d_%y_%H:%M:%S")
        filename = "manifest_{}_{}.tsv".format(parts.netloc, current_time)

        # Write the list of object metadata to a file
        utils.write_tsv(filename, files, fields)

        # Upload the file to the Google bucket
        utils.upload_file(bucket_name, filename, filename)

    logging.info("DONE!!!")
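# A minimal usage sketch (the bucket name, URLs, and metadata file below are
# hypothetical placeholders, not taken from the original source):
#
#   files = [
#       {"url": "https://example.org/data/object1.bam", "size": 1024, "md5": "abc123"},
#       {"url": "https://example.org/data/object2.bam", "size": 2048, "md5": "def456"},
#   ]
#   write_messages_to_tsv(files, "example-manifest-bucket",
#                         metadata_file="extra_metadata.tsv")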
def main():
    folder = "./data/transomcs/"
    file = folder + "TransOMCS_full.txt"
    data = read_csv(file, delimiter="\t")

    # Map each tuple key to its confidence score (the fourth column)
    confidences = {}
    for d in data:
        key = tuple_key(d)
        confidences[key] = float(d[3])

    human_eval_file = folder + "human_evaluation_tuples.tsv"
    tuples = read_csv(human_eval_file, delimiter="\t", skip_header=True)

    # Keep tuples with confidence >= 0.5 and collect the rest as dropped
    updated_t = [
        {"head_event": t[0], "relation": t[1], "tail_event": t[2]}
        for t in tuples
        if confidences[tuple_key(t)] >= 0.5
    ]
    dropped = [
        {"head_event": t[0], "relation": t[1], "tail_event": t[2]}
        for t in tuples
        if confidences[tuple_key(t)] < 0.5
    ]

    output_file = folder + "human_evaluation_tuples_v2.tsv"
    write_tsv(output_file, updated_t)

    output_file = folder + "dropped_human_evaluation_tuples_v2.tsv"
    write_tsv(output_file, dropped)
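# read_csv, write_tsv, and tuple_key are project helpers not shown in this
# snippet. A minimal sketch of tuple_key, assuming TransOMCS rows are
# (head, relation, tail, confidence) and the key is the first three columns:
#
#   def tuple_key(row):
#       return (row[0], row[1], row[2])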
def parse():
    r = requests.get(URL)
    if not r.ok:
        print(f"Failed to fetch {URL}", file=sys.stderr)
        r.close()
        exit(1)

    db = json.loads(r.text)
    r.close()

    # Convert to ready-made TSVs
    regions = defaultdict(list)
    for row in db:
        elt = [
            int(row[X[c]]) if i > 0 else row[X[c]].split()[0]
            for i, c in enumerate(cols)
        ]
        regions[row[X["region"]]].append(elt)
    regions = dict(regions)

    # Sum all regions to obtain Italian data
    dates = defaultdict(lambda: np.zeros(len(cols) - 1))
    for data in regions.values():
        for datum in data:
            dates[datum[0]] += np.array(datum[1:])

    regions["Italy"] = []
    for date, counts in dates.items():
        regions["Italy"].append([date] + [int(c) for c in counts])

    for region, data in regions.items():
        write_tsv(f"{LOC}/{region}.tsv", cols, data, "italy")
def parse():
    r = requests.get(URL)
    if not r.ok:
        print(f"Failed to fetch {URL}", file=sys.stderr)
        r.close()
        exit(1)

    db = json.loads(r.text)
    r.close()

    # Convert to ready-made TSVs
    regions = defaultdict(list)
    for row in db:
        # Reformat YYYYMMDD integer dates as YYYY-MM-DD
        date = str(row["date"])
        date = f"{date[0:4]}-{date[4:6]}-{date[6:8]}"
        elt = [date, row["positive"], row["death"], None, None, None]
        regions[acronyms[row["state"]]].append(elt)
    regions = dict(regions)

    for region, data in regions.items():
        write_tsv(f"{LOC}/{region}.tsv", cols, data, "unitedstates")
def parse():
    r = requests.get(URL)
    if not r.ok:
        print(f"Failed to fetch {URL}", file=sys.stderr)
        r.close()
        exit(1)

    regions = defaultdict(list)
    fd = io.StringIO(r.text)
    r.close()

    rdr = csv.reader(fd)
    hdr = next(rdr)  # skip the header row
    for row in rdr:
        date = row[0]
        canton = cantonal_codes[row[1]]
        regions[canton].append(
            [date, to_int(row[2]), to_int(row[5]), to_int(row[6]), None, to_int(row[7])]
        )

    for region, data in regions.items():
        if region != "Liechtenstein":
            write_tsv(f"{LOC}/{region}.tsv", cols, data, "switzerland")
        else:
            write_tsv(f"{LOC2}/{region}.tsv", cols, data, "switzerland")
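# to_int and cantonal_codes are module-level helpers not shown here. A minimal
# sketch of to_int, assuming empty CSV cells should become None:
#
#   def to_int(x):
#       return int(x) if x not in ("", None) else None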
def calculate_all_outliers(samples, thresholds, filtered_genelist_path,
                           outliers_path, verbose=False):
    """For each sample, calculate up and down outlier genes based on the thresholds.
    Then, apply expression & variance filters to mark 'noise' genes, and save as a
    result file."""
    print_v = print if verbose else lambda *a, **k: None

    print_v("Calculating outliers for all samples.")
    outliers = samples.apply(single_sample_outliers, args=[thresholds], axis=0)

    # Apply the expression & variance filters that we calculated in step 1 to mark
    # filtered genes as dropped due to Expression or Variance, or as Retained.
    # gene_filter_status is a pd.Series with values eg ["E"]
    gene_filter_status = utils.read_feather(
        filtered_genelist_path)["status"].apply(list)
    outliers = outliers.add(gene_filter_status, axis="index")
    outliers = outliers.applymap("".join)  # Concatenate all values to a string

    print_v("Writing outlier results to {}".format(outliers_path))
    utils.write_tsv(outliers, outliers_path)
    return outliers
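# A minimal usage sketch (file paths and variable names are hypothetical
# placeholders; samples is assumed to be a genes-by-samples pandas DataFrame
# and thresholds the per-gene thresholds computed in an earlier step):
#
#   outliers = calculate_all_outliers(
#       samples=expression_df,
#       thresholds=per_gene_thresholds,
#       filtered_genelist_path="results/filtered_genelist.feather",
#       outliers_path="results/outliers.tsv",
#       verbose=True,
#   )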
def parse():
    cases = retrieve_case_data()
    cases = flatten(cases)
    write_tsv(f"{LOC}/World.tsv", cols, cases, "world")
from utils import get_paragraph_words
from utils import make_transposition_pair_dataset
from utils import write_tsv


if __name__ == '__main__':
    paragraphs = get_paragraph_words(500, 20, 60, 3, tags=[[0, 1, 2]])
    train_data, validation_data = make_transposition_pair_dataset(paragraphs, 128)
    write_tsv(train_data, 'train.tsv')
    write_tsv(validation_data, 'dev.tsv')
import pandas as pd

import utils

match_ids = get_match_ids(fn=match_ids_fn)

data_fn = '../output/match_player_data.tsv'
data = pd.read_csv(data_fn, sep='\t')

# Inspect duplicated match/player rows
dupes = data[data['match_id_slot'].duplicated()]
print(dupes)
print(data[data['match_id'] == 5536512014])
print(len(data))
print(len(data.drop_duplicates()))

# Write the de-duplicated data back out
match_kills_out_fn = '../output/match_player_data.tsv'
utils.write_tsv(data=data.drop_duplicates(), fn=match_kills_out_fn)

# Find match ids that are missing from the player data
data_ids = set(data['match_id'])
missing_mIDs = set(match_ids) - data_ids
print(len(missing_mIDs))
print(match_ids.index(list(missing_mIDs)[0]))
print(match_ids.index(list(missing_mIDs)[1]))

# import csv
# cw = csv.writer(open('../input/missing_mIDs.csv', 'w'))
# cw.writerow(list(missing_mIDs))
# with open('../input/missing_mIDs.txt', 'w') as outfile:
#     # write(list(missing_mIDs))
#     writer = csv.writer(outfile)
    # Update df with first blood killers
    rm_no_kills.loc[rm_no_kills['match.id.slot'].isin(fb_kill_mID_slots),
                    'first.blood'] = 1

    return rm_no_kills


if __name__ == '__main__':
    json_in_fn = '../input/match_data.json'
    matches = utils.load_match_json(json_in_fn=json_in_fn)
    match_kills = get_player_data(matches=matches)

    match_kills_out_fn = '../output/match_player_data_v2.tsv'
    utils.write_tsv(data=match_kills, fn=match_kills_out_fn)

    print("Player level data for each match:")
    print(f" - Saved {len(matches)} matches to output folder")

    # Junk:
    # print(matches[0]['start_time'])
    # matches = json.loads(json_file)
    # print(matches[0]['match_id'])
    # print(matches[1]['match_id'])
    # matches[0].keys()
    # type(matches)
    # len(matches)