def download_edictos(
    data_dir=f"{os.environ['HOME']}/data/corteconstitucional/edictos",
):
    """
    needs to be run several times; sometimes it claims that it cannot find downloaded pdfs
    :param data_dir:
    :return:
    """
    url = "https://www.corteconstitucional.gov.co/secretaria/edictos/"
    download_dir = f"{data_dir}/downloads"
    os.makedirs(download_dir, exist_ok=True)
    wd = build_chrome_driver(download_dir, headless=True)
    hrefs = get_hrefs(url, wd)

    old_file = f"{data_dir}/documents.jsonl"
    found_existing_documents = os.path.isfile(old_file)
    if found_existing_documents:
        new_file = old_file.split(".jsonl")[0] + "_updated.jsonl"
        old_docs = list(data_io.read_jsonl(old_file))
    else:
        old_docs = []
        new_file = old_file

    try:
        data_io.write_jsonl(
            new_file, generate_raw_docs(old_docs, hrefs, wd, download_dir)
        )
    except Exception:
        traceback.print_exc()
        print("failed to write documents")
    finally:
        if found_existing_documents:
            shutil.move(new_file, old_file)
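# Hedged sketch, not part of the original module: the docstring above says the
# scraper "needs to be run several times" before all PDFs are found, so a caller
# could simply repeat a few passes; the pass count of 3 is an assumption.
def download_edictos_repeatedly(num_passes: int = 3):
    for _ in range(num_passes):
        download_edictos()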
def parse_edictos(save=True) -> List:
    data = list(tqdm(generate_edictos()))
    # data = [Edicto(**d) for d in data_io.read_jsonl("edictos.jsonl")]
    unique_data = list(set(data))
    print(len(data))
    print(f"unique: {len(unique_data)}")
    if save:
        data_io.write_jsonl("edictos.jsonl", (asdict(d) for d in data))
    return unique_data
def read_or_process_data(path, limit: int = None):
    tmp_train_data_file = "/tmp/train_data.jsonl.gz"
    tmp_meta_file = "/tmp/meta_data.jsonl"
    if not os.path.exists(tmp_train_data_file):
        train_df, features = read_csvs_build_features(path)
        types = {
            name: typ
            for name, typ in train_df.dtypes.to_dict().items()
            if name in features
        }
        numerical_features = [
            name for name, typ in types.items() if typ == float or typ == int
        ]
        # print(numerical_features)
        categorical_features = [name for name, typ in types.items() if typ == str]
        # print(categorical_features)
        input_dim = len(numerical_features) + len(categorical_features)

        def dataframe_to_dicts(df):
            data = [row[1].to_dict() for row in df.iterrows()]
            for d in data:
                del d["Date"]
            return data

        train_data_dicts = dataframe_to_dicts(train_df)
        y_train_list = train_df["target"].tolist()
        for d, t in zip(train_data_dicts, y_train_list):
            d["target"] = t

        data_io.write_jsonl(tmp_train_data_file, train_data_dicts)
        data_io.write_jsonl(
            tmp_meta_file,
            [
                {
                    "numerical_features": numerical_features,
                    "categorical_features": categorical_features,
                }
            ],
        )
    else:
        print("loading already processed data")
        train_data_dicts = list(data_io.read_jsonl(tmp_train_data_file, limit=limit))
        y_train_list = [d["target"] for d in train_data_dicts]
        meta = list(data_io.read_jsonl(tmp_meta_file))[0]
        numerical_features = meta["numerical_features"]
        categorical_features = meta["categorical_features"]

    # features = categorical_features + numerical_features
    return train_data_dicts, categorical_features, numerical_features
def populate_es_parallel_bulk(
    es, files, es_index_name, es_type, limit=None, num_processes=4, chunk_size=500
):
    dicts_g = (d for file in files for d in read_jsonl(file, limit=limit))
    actions_g = (build_es_action(d, es_index_name, es_type) for d in dicts_g)
    results_g = helpers.parallel_bulk(
        es,
        actions_g,
        thread_count=num_processes,
        queue_size=num_processes,
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
    )
    failed_g = (
        pop_exception(d)
        for ok, d in tqdm(results_g)
        if not ok and d.get("create", {}).get("status", 200) != 409
    )
    data_io.write_jsonl("failed.jsonl", failed_g)
def extract_from_edicto(source, string, edicto_num: int, edicto_year):
    edicto_date = parse_edicto_date(string)
    if edicto_date is None:
        data_io.write_jsonl(
            DEBUG_EDICTO_DATE, [{"source": source, "text": string}], mode="ab"
        )
        return []

    spans = [get_sentencia_span(m) for m in sentencia_pattern.finditer(string)]
    edictos = []
    for k, (start, end, sentencia) in enumerate(spans):
        next_start, _, _ = (
            spans[k + 1] if k + 1 < len(spans) else (len(string), None, None)
        )
        _, previous_end, _ = spans[k - 1] if k > 0 else (None, 0, None)
        behind_sentencia = string[end:next_start]
        expedientes = extract_expedientes(behind_sentencia)
        if len(expedientes) > 0:
            before_sentencia = string[previous_end:start]
            data_io.write_lines(
                DEBUG_BEFORE_SENTENCIA,
                [before_sentencia.replace("\n", "€")],
                mode="ab",
            )
            date = extract_date(before_sentencia)
            if date is not None:
                edictos.append(
                    Edicto(
                        sentencia,
                        date,
                        edicto_date,
                        edicto_year,
                        expedientes,
                        source,
                        edicto_num,
                    )
                )

    if len(edictos) != 1:
        data_io.write_jsonl(
            DEBUG_NO_EDICTO, [{"source": source, "string": string}], "ab"
        )
    return edictos
def prepare_manifest(corpora_dir="/content/corpora", limit=None):
    manifest = "manifest.jsonl"
    manifests = list(Path(corpora_dir).rglob("manifest.jsonl.gz"))
    limit = round(limit / len(manifests)) if limit is not None else None

    def get_file_name(f):
        if "/" in f:
            o = f.split("/")[-1]
        else:
            o = f
        return o

    g = (
        {
            "audio_filepath": f"{str(f).replace(f.name, '')}/mp3/{get_file_name(d['audio_file'])}",  # TODO(tilo): just a hack for TEDLIUM!
            "duration": d["duration"],
            "text": d["text"],
        }
        for f in manifests
        for d in data_io.read_jsonl(str(f), limit=limit)
    )
    data_io.write_jsonl(manifest, g)
    return manifest
def consumer(file):
    print(
        "%s is doing %s; limit: %d"
        % (multiprocessing.current_process(), file, limit)
    )
    dicts_g = (d for d in data_io.read_jsonl(file, limit=limit))
    actions_g = (
        build_es_action(d, es_index_name, es_type, op_type="index") for d in dicts_g
    )
    results_g = helpers.streaming_bulk(
        es_client,
        actions_g,
        chunk_size=chunk_size,
        yield_ok=True,
        raise_on_error=False,
        raise_on_exception=False,
    )
    failed_g = (pop_exception(d) for ok, d in results_g if not ok)
    data_io.write_jsonl(
        "%s_failed.jsonl" % multiprocessing.current_process(), failed_g
    )
def populate_es_streaming_bulk(
    es_client: Elasticsearch,
    dicts: Iterable[Dict],
    es_index_name: str,
    es_type: str,
    chunk_size: int = 500,
):
    def pop_exception(d):
        d["index"].pop("exception")
        return d

    es_actions_g = (
        build_es_action(d, index_name=es_index_name, es_type=es_type) for d in dicts
    )
    results_g = helpers.streaming_bulk(
        es_client,
        es_actions_g,
        chunk_size=chunk_size,
        yield_ok=True,
        raise_on_error=True,
    )
    failed_g = (pop_exception(d) for ok, d in tqdm(results_g) if not ok)
    data_io.write_jsonl("failed.jsonl", failed_g)
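# Hedged usage sketch, not part of the original module: host, index name, and
# input file are placeholders; build_es_client and data_io.read_jsonl are assumed
# to behave as in the other snippets in this collection.
es = build_es_client(host="localhost")
docs = data_io.read_jsonl("documents.jsonl.gz")  # any iterable of dicts works
populate_es_streaming_bulk(es, docs, es_index_name="docs", es_type="_doc")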
def scrape_date_range(date_from: date, date_to: date, wd: WebDriver, data_path):
    year_from = date_from.year
    month_from = date_from.month
    day_from = date_from.day
    year_to = date_to.year
    month_to = date_to.month
    day_to = date_to.day
    date_range = f"{year_from}-{month_from}-{day_from}_{year_to}-{month_to}-{day_to}"
    for page in itertools.count(start=0):
        bucket_path = f"{data_path}/{date_range}_{page}"
        url = f"{base_url}/?de=0&se=0&ac=0&ca=0&rs={year_from}%2F{fmt(month_from)}%2F{fmt(day_from)}&re={year_to}%2F{fmt(month_to)}%2F{fmt(day_to)}&st=0&pg={page}"
        bucket_downloads = f"{bucket_path}/downloads"
        os.makedirs(bucket_downloads, exist_ok=True)
        docs_file = f"{bucket_path}/docs.jsonl.gz"
        if os.path.isfile(docs_file):
            continue
        wd.get(url)
        source = wd.page_source
        if "No hay resultados para mostrar en este momento" in source:
            break
        docs = []
        for hit in get_hits(source):
            docs.append(build_doc(wd, hit))
            yield True
        data_io.write_jsonl(docs_file, docs)
        for file_name in os.listdir(download_path):
            shutil.move(os.path.join(download_path, file_name), bucket_downloads)
"lte": "%s" } } } ] } }, "size": 20, "_source": { "excludes": "content" } } """ % (d["file_number"], d["type"], d['date'], d['date']) if __name__ == "__main__": file = home + "/data/cases.json.gz" es_client = build_es_client(host="guntherhamachi") TYPE = "decision" INDEX = "juris" def not_found_generator(): for d in tqdm(data_io.read_jsonl(file)): body = build_body(d) r = es_client.search(index=INDEX, body=body, size=3) if r['hits']['total']['value'] < 1: yield d data_io.write_jsonl('failed_to_find.jsonl', not_found_generator())
def process(d):
    # d = {k: d[k] for k in ['date', 'aktenzeichen', 'zitiervorschlag', 'entscheidungsdatum', 'content']}
    return d


if __name__ == "__main__":
    es_client = build_es_client()
    TYPE = "decision"
    INDEX = "juris"
    # fmt: off
    fields = [
        "Orientierungssatz", "Gründe", "Tenor", "Leitsatz",
        "Sonstiger Orientierungssatz", "Abweichende Meinung",
        "Entscheidungsgründe", "Tatbestand", "Sonstiger Kurztext",
        "Sonstiger Langtext",
    ]
    # fmt: on
    body = {"query": {"multi_match": {"query": "185 StGB", "fields": fields}}}
    hits_g = elastic_scan(
        es_client,
        index=INDEX,
        query=body,
        batch_size=100,
    )
    docs = (process(d["_source"]) for d in tqdm(hits_g))
    # docs = (process(d) for d in data_io.read_jsonl('BverfG_juris.jsonl.gz'))
    data_io.write_jsonl("decisions_185_StGB.jsonl.gz", docs)
        for f in FIELDS
    }
    schema = Schema(
        aktenzeichen=ID(stored=True),
        **fields,
    )
    file = "BverfG.jsonl.gz"
    data = (
        {
            process_field_name(k): v
            for k, v in d.items()
            if k in FIELDS + ["aktenzeichen"]
        }
        for d in data_io.read_jsonl(file)
    )
    return schema, data


if __name__ == "__main__":
    INDEX_DIR = "bverfg_index"
    if not os.path.isdir(INDEX_DIR):
        schema, data = build_schema_and_corpus()
        build_index(data, schema, index_dir=INDEX_DIR)
        print("done building corpus")

    ix = index.open_dir(INDEX_DIR)
    with ix.searcher() as searcher:
        qp = MultifieldParser(FIELDS, schema=ix.schema)
        q = qp.parse("185 StGB")
        results = searcher.search(q, limit=None)
        data_io.write_jsonl("result.jsonl", (r.fields() for r in results))
""" # First line is the title split = re.split(r'\|', content[0].rstrip(), maxsplit=2) doc_id = int(split[0]) stable_id = get_stable_id(doc_id) doc_text = split[2] # Second line is the abstract # Assume these are newline-separated; is this true? # Note: some articles do not have abstracts, however they still have this line doc_text += ' ' + re.split(r'\|', content[1].rstrip(), maxsplit=2)[2] annos = parse_annotations(content) return {'PMID':doc_id, 'stable_id':stable_id, 'text':doc_text, 'annos':annos} if __name__ == '__main__': # file_path = os.environ["HOME"]+'/code/NLP/IE/pubtator/download/bioconcepts2pubtatorcentral.offset.sample' file_path = os.environ["HOME"]+'/pubtator/download/bioconcepts2pubtatorcentral.offset.gz' g = (pubtator_parser(content) for content in doc_generator(file_path,limit=100)) data_io.write_jsonl('./parsed.jsonl',g)
tokenize = lambda s: s.split(" ")
order = 3
start = time()
aligned_ngrams = (
    db.from_sequence(refs_hyps, npartitions=4 * 4)
    .map(lambda rh: calc_aligned_ngram_tuples(tokenize(rh[0]), tokenize(rh[1]), order))
    .flatten()
    .map(lambda rh: (" ".join(rh[0]), " ".join(rh[1])))
)


def error_rate(ref, hyp_counts: Counter):
    overall_num_errors = sum(v for k, v in hyp_counts.items() if ref != k)
    num_correct = hyp_counts[ref]
    return overall_num_errors / (1 + num_correct)


result = aligned_ngrams.foldby(
    lambda rh: rh[0],
    lambda total, x: total + Counter([x[1]]),
    initial=Counter(),
    combine=lambda x, y: x + y,
).topk(1000, lambda kc: error_rate(*kc))

counts = result.map(lambda kc: (kc[0], dict(kc[1].most_common(5)))).compute()
data_io.write_jsonl("ngram_counts.jsonl", counts)
# pprint(result.compute())  # topk(10, key=lambda )
# aligned_ngrams.filter(lambda rh: rh[0] != rh[1]).map(json.dumps).to_textfiles(
#     "processed/erroneous_ngrams_*.jsonl.gz"
# )
print(f"took: {time()-start} seconds")
def multi_eval(algos, LOGS_DIR, num_eval=5, num_workers=12):
    """
    evaluating 12 jobs with 1 worker took: 415.78 seconds
    evaluating 12 jobs with 3 workers took: 154.78 seconds
    evaluating 12 jobs with 6 workers took: 91.88 seconds
    evaluating 12 jobs with 12 workers took: 70.68 seconds

    on gunther one gets a CUDA out-of-memory error with num_workers > 12
    """
    task = PlatoScoreTask(LOGS_DIR=LOGS_DIR)
    jobs = [
        Experiment(
            job_id=get_id(),
            name=build_name(algo, error_sim, two_slots),
            config=build_config(algo, error_sim=error_sim, two_slots=two_slots),
            train_dialogues=td,
            eval_dialogues=1000,
            num_warmup_dialogues=warmupd,
        )
        for _ in range(num_eval)
        for error_sim in [False, True]
        for two_slots in [False, True]
        for td in [40000]
        for warmupd in [4000]
        for algo in algos
    ]

    start = time()
    outfile = LOGS_DIR + "/results.jsonl"
    mode = "wb"
    if os.path.isdir(LOGS_DIR):
        results = list(data_io.read_jsonl(outfile))
        done_ids = [e["job_id"] for e in results]
        jobs = [e for e in jobs if e.job_id not in done_ids]
        print("only got %d jobs to do" % len(jobs))
        print([e.job_id for e in jobs])
        mode = "ab"
    else:
        os.makedirs(LOGS_DIR)

    if num_workers > 0:
        num_workers = min(len(jobs), num_workers)
        with WorkerPool(processes=num_workers, task=task, daemons=False) as p:
            processed_jobs = p.process_unordered(jobs)
            data_io.write_jsonl(outfile, processed_jobs, mode=mode)
    else:
        with task as t:
            processed_jobs = [t(job) for job in jobs]
            data_io.write_jsonl(outfile, processed_jobs, mode=mode)

    scoring_runs = list(data_io.read_jsonl(outfile))
    plot_results(scoring_runs, LOGS_DIR)
    print(
        "evaluating %d jobs with %d workers took: %0.2f seconds"
        % (len(jobs), num_workers, time() - start)
    )
"train": train, "test": test }.items() } data = [[token for token, tag in datum] for datum in corpus] idx = select_fun(tagger, data) return predictions, idx if __name__ == "__main__": import os data_supplier = partial(read_conll03_en, path=os.environ["HOME"] + "/data/IE/seqtag_data") dataset = data_supplier() task = ActiveLearnSpacyCrfSeqTagScoreTask(params=Params(c1=0.5, c2=0.0, max_it=100), data_supplier=data_supplier) num_folds = 5 select_funs = [select_by_max_entropy, select_random] jobs = [Job(f) for _ in range(num_folds) for f in select_funs] num_workers = min(multiprocessing.cpu_count() - 1, len(jobs)) start = time() scores = calc_scores(task, jobs, num_workers) duration = time() - start print("%d jobs with %d workers took: %0.2f seconds" % (len(jobs), num_workers, duration)) data_io.write_jsonl("scores.jsonl", scores)
from util import data_io
from util.util_methods import merge_dicts

from corteconstitucional.parse_edictos import Edicto
from corteconstitucional.parse_proceso_tables import parse_table


def merge_edictos_proceso_tables(
    edictos: List,
    data_path=f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables",
) -> List:
    raw_data = list(
        data_io.read_json(str(file)) for file in tqdm(Path(data_path).glob("*.json"))
    )
    print("parse tables")
    table_data = (parse_table(d) for d in raw_data)
    exp2table = {t.expediente: t for t in tqdm(table_data)}
    g = (
        merge_dicts(
            [
                asdict(e),
                {"tables": [asdict(exp2table[exp]) for exp in e.expedientes]},
            ]
        )
        for e in edictos
    )
    merged_data = list(g)
    return merged_data


if __name__ == "__main__":
    edictos = [Edicto(**d) for d in data_io.read_jsonl("edictos.jsonl")]
    merged_data = merge_edictos_proceso_tables(edictos)
    data_io.write_jsonl("/tmp/merged_edictos2tables.jsonl", merged_data)
from password_generator import PasswordGenerator
from util import data_io

if __name__ == '__main__':
    pwo = PasswordGenerator()
    pwo.minlen = 9
    pwo.maxlen = 9

    def build_user(eid, user_name):
        return {'name': user_name, 'password': pwo.generate(), 'id': eid}

    data_io.write_jsonl(
        'annotators.jsonl',
        (
            build_user(k + 2, user_name)
            for k, user_name in enumerate(['Salar', 'Vinicius', 'Tarcisio', 'Tilo'])
        ),
    )
        if limit is not None and counter > limit:
            break
        yield hit


def process(d):
    d = {
        k: d[k]
        for k in ['date', 'aktenzeichen', 'zitiervorschlag', 'entscheidungsdatum', 'content']
    }
    return d


if __name__ == '__main__':
    es_client = build_es_client(host="gunther")
    TYPE = "decision"
    INDEX = "juris"
    body = {"query": {"match_phrase": {"zitiervorschlag": "BVerfG"}}}
    hits_g = elastic_scan(
        es_client,
        index=INDEX,
        query=body,
        batch_size=100,
        # limit=100,
    )
    docs = (process(d['_source']) for d in tqdm(hits_g))
    # docs = (process(d) for d in data_io.read_jsonl('BverfG_juris.jsonl.gz'))
    data_io.write_jsonl('BverfG_juris_content.jsonl.gz', docs)
"holiday_type": holiday_type, } def build_date(th, year): day_s, month = th[0].text.split(" ") day = int(day_s) date_s = f"{month} {day} {year}" date = datetime.strptime(date_s, "%b %d %Y") date_formatted = date.strftime("%m/%d/%Y") return date_formatted def get_holidays(wd, year): url = f"https://www.timeanddate.com/holidays/colombia/{year}?hol=1" wd.get(url) soup = BeautifulSoup(wd.page_source, features="html.parser") table = soup.find("section", class_="table-data__table") return list(generate(table, year)) if __name__ == "__main__": wd = build_chrome_driver("/tmp/", headless=True) first_year = 2015 last_year = 2020 data_io.write_jsonl( "holidays.jsonl", (d for year in tqdm(range(first_year, last_year + 1)) for d in get_holidays(wd, year)), ) wd.close()
    count_rows,
)
from tqdm import tqdm
from util import data_io, util_methods

if __name__ == "__main__":
    # file = "sqlite:////home/tilo/tilo-tub/code/DrQA/data/wikipedia/docs.db"
    file = "sqlite:////home/tilo/code/DrQA/data/wikipedia/docs.db"
    base, engine = get_sqlalchemy_base_engine(file)
    tables = get_tables_by_reflection(base.metadata, engine)
    docs_table = tables["documents"]
    with engine.connect() as conn:
        num_rows = count_rows(engine, docs_table)
        num_batches = 8
        batch_size = num_rows // num_batches + 1
        it = iter(tqdm(get_rows(conn, select([docs_table]))))

        def row_gen():
            for k in range(batch_size):
                try:
                    d = next(it)
                except StopIteration:
                    # there is no next element in the iterator
                    break
                yield d

        for batch_idx in range(num_batches):
            data_io.write_jsonl(
                f"drqa_wikipedia_{batch_idx}.jsonl.gz", row_gen(), mode="ab"
            )