import multiprocessing
import os
import sys
from collections import defaultdict
from pprint import pprint
from time import time
from typing import Dict, List

import Levenshtein
from tqdm import tqdm

# `data_io` (JSON / line-file read-write helpers) and the other undeclared
# names below are project-local code; the snippets assume they are importable.


def calc_distances(tati_data: List[Dict], tilo_data: List[Dict]) -> Dict[str, Dict]:
    distances = defaultdict(dict)
    distances_json = "/tmp/distances.json"
    if not os.path.isfile(distances_json):  # cache the pairwise distances on disk
        for i, tilo in enumerate(tqdm(tilo_data)):
            for ii, tati in enumerate(tati_data):
                distances[str(i)][str(ii)] = Levenshtein.distance(str(tilo), str(tati))
        data_io.write_json(distances_json, distances)
    else:
        distances = data_io.read_json(distances_json)
    return distances
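# Hypothetical usage sketch; the two record lists below are made up and only
# illustrate the call signature (any dict records work, they are stringified
# before comparison):
tilo_records = [{"name": "Ana Maria"}, {"name": "Jose"}]
tati_records = [{"name": "Ana María"}, {"name": "José"}]
dists = calc_distances(tati_records, tilo_records)
print(dists["0"]["0"])  # edit distance between the first record of each list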
def read_lines_from_files(self, path, mode="b", encoding="utf-8", limit=sys.maxsize):
    c = 0
    for file in os.listdir(path):
        if self.state.get(file, 0) == "all":
            continue  # this file was fully consumed in a previous run
        for line_idx, line in enumerate(
            data_io.read_lines(path + "/" + file, mode, encoding)
        ):
            if line_idx < self.state.get(file, 0):
                continue  # already yielded in a previous run
            c += 1
            if c > limit:
                return  # stop early; do NOT mark the current file as done
            yield line
            self.state[file] = line_idx
            if c % self.write_interval == 0:
                data_io.write_json(self.state_file, self.state)
        self.state[file] = "all"  # file exhausted, skip it next time
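# Minimal sketch of the class these methods presumably live on (class name and
# constructor are assumptions; only `state`, `state_file` and `write_interval`
# are actually referenced by read_lines_from_files and __exit__ below):
class ResumableLineReader:
    def __init__(self, state_file="reader_state.json", write_interval=1000):
        self.state_file = state_file
        self.write_interval = write_interval
        # file name -> last yielded line index, or "all" once exhausted
        self.state = (
            data_io.read_json(state_file) if os.path.isfile(state_file) else {}
        )

    def __enter__(self):
        return self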
def calc_write_learning_curve(exp: Experiment, max_num_workers=40):
    num_workers = min(max_num_workers, multiprocessing.cpu_count() - 1, exp.num_folds)
    name = exp.name
    print("got %d evaluations to calculate" % len(exp.jobs))
    results_path = results_folder + "/" + name
    os.makedirs(results_path, exist_ok=True)

    start = time()
    scores = calc_scores(
        exp.score_task,
        [split for train_size, split in exp.jobs],
        n_jobs=num_workers,
    )
    duration = time() - start
    meta_data = {
        "duration": duration,
        "num-workers": num_workers,
        "experiment": str(exp),
    }
    data_io.write_json(results_path + "/meta_datas.json", meta_data)
    print("calculating learning-curve for %s took %0.2f seconds" % (name, duration))
    pprint(scores)

    results = groupandsort_by_first(
        zip([train_size for train_size, _ in exp.jobs], scores)
    )
    data_io.write_json(results_path + "/learning_curve.json", results)
    trainsize_to_mean_std_scores = {
        train_size: tuple_2_dict(calc_mean_and_std(m))
        for train_size, m in results.items()
    }
    data_io.write_json(
        results_path + "/learning_curve_meanstd.json",
        trainsize_to_mean_std_scores,
    )
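# One plausible implementation of the grouping helper used above (the real
# groupandsort_by_first is project code; this sketch assumes it buckets the
# (train_size, score) pairs by train_size and sorts the keys):
def groupandsort_by_first_sketch(pairs):
    grouped = defaultdict(list)
    for key, value in pairs:
        grouped[key].append(value)
    return {key: grouped[key] for key in sorted(grouped)}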
import torch

# import paths below are from NeMo 1.x and may differ in other versions
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.models import EncDecCTCModel


def batch_inference(args: argparse.Namespace):
    torch.set_grad_enabled(False)  # pure inference, no backprop needed
    if args.asr_model.endswith(".nemo"):
        print(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        print(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    manifest = prepare_manifest(args.corpora_dir, args.limit)
    asr_model.setup_test_data(
        test_data_config={
            "sample_rate": 16000,
            "manifest_filepath": manifest,
            "labels": asr_model.decoder.vocabulary,
            "batch_size": args.batch_size,
            "normalize_transcripts": args.normalize_text,
        }
    )

    refs_hyps = list(tqdm(generate_ref_hyps(asr_model, args.search, args.arpa)))
    references, hypotheses = [list(k) for k in zip(*refs_hyps)]

    os.makedirs(args.results_dir, exist_ok=True)
    data_io.write_lines(f"{args.results_dir}/refs.txt.gz", references)
    data_io.write_lines(f"{args.results_dir}/hyps.txt.gz", hypotheses)

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    sys.stdout.flush()
    stats = {
        "wer": wer_value,
        "args": args.__dict__,
    }
    data_io.write_json(f"{args.results_dir}/stats.txt", stats)
    print(f"Got WER of {wer_value}")
    return stats
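# One plausible shape for the manifest helper used above (the real
# prepare_manifest is project code). NeMo consumes a JSON-lines manifest with
# "audio_filepath", "duration" and "text" per utterance; `soundfile` and the
# `read_reference_transcript` helper are assumptions for this sketch.
import glob
import json

import soundfile


def prepare_manifest_sketch(corpora_dir: str, limit: int) -> str:
    manifest = "/tmp/manifest.jsonl"
    wavs = sorted(glob.glob(f"{corpora_dir}/**/*.wav", recursive=True))[:limit]
    with open(manifest, "w") as f:
        for wav in wavs:
            entry = {
                "audio_filepath": wav,
                "duration": soundfile.info(wav).duration,
                "text": read_reference_transcript(wav),  # hypothetical helper
            }
            f.write(json.dumps(entry) + "\n")
    return manifest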
def scrape_proceso_tables(search_ids: List):
    base_url = "https://www.corteconstitucional.gov.co/secretaria/"
    data_path = f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
    os.makedirs(data_path, exist_ok=True)
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path, headless=True)

    ids_files = ((eid, f"{data_path}/{eid}.json") for eid in search_ids)
    to_be_scraped = [
        (eid, file) for eid, file in ids_files if not os.path.isfile(file)
    ]
    print(f"already got {len(search_ids) - len(to_be_scraped)}")

    for search_id, file in tqdm(to_be_scraped):
        try:
            fire_search(base_url, search_id, wd)
            datum = dump_proceso_table(wd)
            datum["id"] = search_id
            data_io.write_json(file, datum)
        except Exception:  # don't let one broken search kill the whole run
            data_io.write_lines(f"{data_path}/could_not_scrape.txt", [search_id])
            print(f"could not scrape {search_id}")
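# One plausible implementation of the driver factory used above (the real
# build_chrome_driver is project code; this sketch assumes plain selenium):
from selenium import webdriver


def build_chrome_driver_sketch(download_path: str, headless: bool = True):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
    # route file downloads triggered by the page into download_path
    options.add_experimental_option(
        "prefs", {"download.default_directory": download_path}
    )
    return webdriver.Chrome(options=options)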
def run_evaluation(eval_loader, model):
    model.eval()
    pred_scores = []
    dev_data = []
    for mini_batch in eval_loader:
        dev_data.extend(mini_batch)
        e1, r = convert_tuples_to_tensors(mini_batch)
        scores = model.forward(e1.to(device), r.to(device)).cpu()
        pred_scores.append(scores)
    dev_scores = torch.cat(pred_scores)
    return hits_and_ranks(dev_data, dev_scores, data.dataset2trees)


pbar = tqdm(range(100))
model.to(device)
for epoch in pbar:
    model.train()
    epoch_loss = numpy.mean(
        [train_one_batch(model, optimizer, raw_batch) for raw_batch in train_loader]
    )
    if epoch % 10 == 0:  # evaluate and checkpoint the embeddings every 10 epochs
        mrr = run_evaluation(eval_loader, model)["mrr"]
        named_params = {n: v for n, v in model.named_parameters()}
        data_io.write_json("ent2id.json", data.ent2id)
        torch.save(
            named_params["entity_embeddings.weight"].data, "entity_embeddings.pt"
        )
    pbar.set_description(
        "Epoch: {}; mean-loss: {:.4f}; MRR: {:.3f}".format(epoch + 1, epoch_loss, mrr)
    )

"""
Epoch: 100; mean-loss: 0.0891; MRR: 0.947: 100%|██████████| 100/100 [02:10<00:00, 1.30s/it]
"""
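# For reference, the "mrr" reported by hits_and_ranks is presumably the mean
# reciprocal rank over the 1-based ranks of the correct entities:
def mean_reciprocal_rank(ranks):
    return sum(1.0 / r for r in ranks) / len(ranks)


# e.g. mean_reciprocal_rank([1, 2, 10]) == (1 + 0.5 + 0.1) / 3 ≈ 0.533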
def __exit__(self, exc_type, exc_val, exc_tb):
    # persist the read-state on leaving the `with` block, even on exceptions
    data_io.write_json(self.state_file, self.state)
    pprint(self.state)
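# Hypothetical usage of the context manager: state is loaded on construction
# (see the ResumableLineReader sketch above) and flushed on exit, so an
# interrupted run can resume where it stopped.
with ResumableLineReader("state.json", write_interval=1000) as reader:
    for line in reader.read_lines_from_files("/some/data/dir", limit=100_000):
        process(line)  # `process` is a placeholder for downstream work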
def dump_to_disk_process_subtitles(n_clicks, video_file, texts, titles, model_name):
    print(f"video_file:{video_file}")
    assert all(isinstance(s, str) for s in texts)
    if n_clicks > 0 and video_file is not None:
        data = {
            title: TranslatedTranscript(title, k, text)
            for k, (title, text) in enumerate(zip(titles, texts))
        }
        data_io.write_json(
            build_json_name(video_file, model_name),
            {name: asdict(v) for name, v in data.items()},
        )
        named_blocks = segment_transcript_to_subtitle_blocks(
            get_letters_csv(video_file, model_name), list(data.values())
        )
        subtitles = dbc.Row(
            [
                dash_table.DataTable(
                    columns=[
                        {"id": cn, "name": cn} for cn in ["start-time"] + titles
                    ],
                    data=[
                        {
                            **{
                                name: "".join(l.letter for l in b[name])
                                for name in titles
                            },
                            "start-time": str(
                                timedelta(
                                    milliseconds=round(
                                        1000
                                        * b[titles[0]][0].index
                                        / TARGET_SAMPLE_RATE
                                    )
                                )
                            ),
                        }
                        for b in named_blocks
                    ],
                    style_table={
                        "height": 200 * len(titles),
                        "overflowY": "scroll",
                        "width": "100%",
                        "font-size": 9,
                    },
                    style_cell={
                        # "overflow": "hidden",
                        # "textOverflow": "ellipsis",
                        # "maxWidth": 0,
                        "textAlign": "left",
                        "height": "auto",
                    },
                ),
            ],
            style={"width": "100%"},
        )
        return (
            "content-of-this-string-does-not-matter",
            [subtitles],
            json.dumps(
                [asdict(SubtitleBlock.from_dict_letters(dl)) for dl in named_blocks]
            ),
        )
    else:
        print("DEBUG: prevented update in dump_to_disk_process_subtitles")
        raise PreventUpdate
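# Hedged sketch of how this callback could be wired up in Dash (all component
# ids are made up; only the argument/return arity matches the function above):
from dash.dependencies import ALL, Input, Output, State

app.callback(
    [
        Output("dummy-signal", "children"),
        Output("subtitles-container", "children"),
        Output("subtitle-blocks-store", "children"),
    ],
    [Input("process-button", "n_clicks")],
    [
        State("video-file-dropdown", "value"),
        State({"type": "transcript-text", "index": ALL}, "value"),
        State({"type": "transcript-title", "index": ALL}, "value"),
        State("model-name-dropdown", "value"),
    ],
)(dump_to_disk_process_subtitles)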
data_supplier, splits = build_data_supplier_splits_trainset_only(
    raw_data_supplier, num_folds, 0.1
)
start = time()
task = SpacyCrfScorer(
    params=Params(c1=0.5, c2=0.0, max_it=2), data_supplier=data_supplier
)
num_workers = 0  # min(multiprocessing.cpu_count() - 1, num_folds)
m_scores_std_scores = calc_mean_std_scores(task, splits, n_jobs=num_workers)
print(
    "spacy+crfsuite-tagger %d folds %d workers took: %0.2f seconds"
    % (num_folds, num_workers, time() - start)
)
pprint(m_scores_std_scores)
data_io.write_json("spacy-crf-scores.json", m_scores_std_scores)

"""
#############################################################################
on x1-carbon scierc-data
spacy+crfsuite-tagger 3 folds-PARALLEL took: 74.86 seconds
{'m_scores': {'dev': {'f1-macro': 0.8822625032484681,
                      'f1-micro': 0.9528343173272004,
                      'f1-spanwise': 0.8470436086284675},
              'test': {'f1-macro': 0.5742946309433821,
                       'f1-micro': 0.832899550463387,
                       'f1-spanwise': 0.5345123493111902},
              'train': {'f1-macro': 0.8844589822247658,
                        'f1-micro': 0.9522832740014087,
                        'f1-spanwise': 0.842115934181045}},