Example #1
def download_edictos(
    data_dir=f"{os.environ['HOME']}/data/corteconstitucional/edictos", ):
    """
    needs to be run several times, some times it claims that it cannot find downloaded pdfs,
    :param data_dir:
    :return:
    """
    url = "https://www.corteconstitucional.gov.co/secretaria/edictos/"
    download_dir = f"{data_dir}/downloads"
    os.makedirs(download_dir, exist_ok=True)

    wd = build_chrome_driver(download_dir, headless=True)
    hrefs = get_hrefs(url, wd)

    old_file = f"{data_dir}/documents.jsonl"
    found_existing_documents = os.path.isfile(old_file)
    if found_existing_documents:
        new_file = old_file.split(".jsonl")[0] + "_updated.jsonl"
        old_docs = list(data_io.read_jsonl(old_file))
    else:
        old_docs = []
        new_file = old_file
    try:
        data_io.write_jsonl(
            new_file, generate_raw_docs(old_docs, hrefs, wd, download_dir))
    except Exception as e:
        traceback.print_exc()
        print("shit happened")
    finally:
        if found_existing_documents:
            shutil.move(new_file, old_file)
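The build_chrome_driver helper is used throughout these examples but not shown; a minimal sketch of what it plausibly does, assuming plain Selenium with Chrome download preferences (the exact options are an assumption):

from selenium import webdriver

def build_chrome_driver(download_dir, headless=True, window_size=None):
    # configure Chrome to drop downloads into download_dir without prompting
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")
    if window_size is not None:
        width, height = window_size
        options.add_argument(f"--window-size={width},{height}")
    options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": download_dir,
            "download.prompt_for_download": False,
        },
    )
    return webdriver.Chrome(options=options)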
Example #2
def take_proceso_table_screenshots(diffs: Iterable):
    base_url = "https://www.corteconstitucional.gov.co/secretaria/"
    os.makedirs(data_path, exist_ok=True)
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path,
                             headless=True,
                             window_size=(1080, 1080))

    for d in tqdm(diffs):
        search_id = f"{d['tilo'][PROCESO]}-{d['tilo'][EXPEDIENTE]}"
        try:
            png_file = f"{data_path}/{search_id}.png"
            if not os.path.isfile(png_file):
                print(f"screen-shotting: {search_id}")
                fire_search(base_url, search_id, wd)
                prepare_for_screenshot(wd)
                wd.save_screenshot(png_file)
            yield d, png_file
        except BaseException as e:
            print(f"screenshot failed for {search_id}: {e}")
            yield d, None
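prepare_for_screenshot is not shown either; a minimal sketch, assuming it simply grows the window to the full page height so save_screenshot captures the whole table instead of the initial 1080x1080 viewport:

def prepare_for_screenshot(wd):
    # assumed behaviour: resize the window to the document height so the
    # screenshot is not clipped to the visible viewport
    height = wd.execute_script("return document.body.scrollHeight")
    wd.set_window_size(1080, max(1080, height))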
Example #3
def scrape_proceso_tables(search_ids: List):
    base_url = "https://www.corteconstitucional.gov.co/secretaria/"
    data_path = f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
    os.makedirs(data_path, exist_ok=True)
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path, headless=True)

    ids_files = ((eid, f"{data_path}/{eid}.json") for eid in search_ids)
    to_be_scraped = [(eid, file) for eid, file in ids_files
                     if not os.path.isfile(file)]
    print(f"already got {len(search_ids)-len(to_be_scraped)}")

    for search_id, file in tqdm(to_be_scraped):
        try:
            fire_search(base_url, search_id, wd)
            datum = dump_proceso_table(wd)
            datum["id"] = search_id
            data_io.write_json(file, datum)
        except BaseException as e:
            # record the id that could not be scraped
            data_io.write_lines(f"{data_path}/could_not_scrape.txt",
                                [search_id])
            print(f"scraping failed for {search_id}: {e}")
Example #4
        docs = []
        for hit in get_hits(source):
            docs.append(build_doc(wd, hit))
            yield True

        data_io.write_jsonl(docs_file, docs)

        for file_name in os.listdir(download_path):
            shutil.move(os.path.join(download_path, file_name),
                        bucket_downloads)


if __name__ == "__main__":

    base_url = "http://relatoria.consejodeestado.gov.co"
    data_path = f"{os.environ['HOME']}/data/relatoria_consejodeestado"
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path, headless=False)

    # first-of-month boundaries between 2010 and 2020, newest first
    dates = list(
        reversed(
            pandas.date_range("2010", "2020", freq="1M") + timedelta(days=1)))
    # one date pair per consecutive pair of boundaries
    jobs = [(d1.date(), (d2 - timedelta(days=1)).date())
            for d1, d2 in zip(dates[:-1], dates[1:])]
    os.listdir(data_path)
    # lazily run scrape_date_range over all date pairs
    g = (_ for s, e in jobs for _ in scrape_date_range(s, e, wd, data_path))

    for _ in tqdm(g):
        pass
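A quick standalone check of the date arithmetic used above: freq="1M" produces month-end timestamps, so adding one day turns them into first-of-month boundaries (the variable name is mine):

import pandas
from datetime import timedelta

boundaries = pandas.date_range("2010", "2020", freq="1M") + timedelta(days=1)
assert boundaries[0].date().isoformat() == "2010-02-01"
assert boundaries[-1].date().isoformat() == "2020-01-01"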
Example #5
                "holiday_type": holiday_type,
            }


def build_date(th, year):
    day_s, month = th[0].text.split(" ")
    day = int(day_s)
    date_s = f"{month} {day} {year}"
    date = datetime.strptime(date_s, "%b %d %Y")
    date_formatted = date.strftime("%m/%d/%Y")
    return date_formatted
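For reference, the strptime/strftime round trip in build_date behaves like this (%b matches English month abbreviations under the default C locale):

from datetime import datetime

# a cell reading "20 Jul" in the 2021 table becomes "Jul 20 2021" and then "07/20/2021"
assert datetime.strptime("Jul 20 2021", "%b %d %Y").strftime("%m/%d/%Y") == "07/20/2021"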


def get_holidays(wd, year):
    url = f"https://www.timeanddate.com/holidays/colombia/{year}?hol=1"
    wd.get(url)
    soup = BeautifulSoup(wd.page_source, features="html.parser")
    table = soup.find("section", class_="table-data__table")
    return list(generate(table, year))
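The generate(table, year) helper is only partially visible here (its tail appears at the top of this example); a hedged reconstruction, assuming the usual timeanddate.com column order of date, weekday, name and type:

def generate(table, year):
    # hypothetical sketch of the missing helper; the column layout is an assumption
    for row in table.find("tbody").find_all("tr"):
        ths = row.find_all("th")
        tds = row.find_all("td")
        if not ths or len(tds) < 3:
            continue
        yield {
            "date": build_date(ths, year),
            "name": tds[1].text.strip(),          # assumed: second cell is the holiday name
            "holiday_type": tds[2].text.strip(),  # key taken from the fragment above
        }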


if __name__ == "__main__":
    wd = build_chrome_driver("/tmp/", headless=True)
    first_year = 2015
    last_year = 2020
    data_io.write_jsonl(
        "holidays.jsonl",
        (d for year in tqdm(range(first_year, last_year + 1)) for d in get_holidays(wd, year)),
    )
    wd.close()