Example #1
import os

import requests
from bs4 import BeautifulSoup


def list_pdf_v3(configs,
                save_dir,
                debug=False,
                important=False,
                try_overwite=False,
                name_in_url=True,
                add_date=False,
                extract_name=False,
                no_overwrite=False):  # try_overwite is passed through to get_files
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    html_page = requests.get(configs.webpage).text
    soup = BeautifulSoup(html_page, "html.parser")

    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        pass
    extract_info(soup, configs, extract_name=extract_name)

    if not important:
        non_important = configs.non_important
        with open("url_name.txt", "r") as og_file, open("2url_name.txt",
                                                        "w") as new_file:
            for line in og_file:
                # Keep only lines containing none of the non-important keywords.
                if not any(keyword in line.lower()
                           for keyword in non_important):
                    new_file.write(line)
    else:
        try:
            important = configs.important
        except AttributeError:
            print("")
            print("Important is still named `non_important`")
            print("")
            important = configs.non_important

        with open("url_name.txt", "r") as og_file, open("2url_name.txt",
                                                        "w") as new_file:
            for line in og_file:
                # Keep only lines containing at least one important keyword.
                if any(keyword in line.lower() for keyword in important):
                    new_file.write(line)
                    print(line)

    if not debug:
        try:
            os.remove("url_name.txt")
        except FileNotFoundError:
            pass
        os.rename("2url_name.txt", "url_name.txt")

    get_files(
        save_dir,
        configs.sleep_time,
        debug=debug,
        try_overwite=try_overwite,
        name_in_url=name_in_url,
        add_date=add_date,
    )
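
A minimal usage sketch, assuming only the attribute names the function reads above (webpage, sleep_time, non_important); the URL and keyword values are made up:

# Minimal usage sketch (hypothetical values): list_pdf_v3 only needs an object
# exposing the attributes it reads above: webpage, sleep_time and non_important.
from types import SimpleNamespace

configs = SimpleNamespace(
    webpage="https://example.gov/meetings",  # hypothetical URL
    sleep_time=2,                            # seconds between downloads
    non_important=["agenda", "minutes"],     # keywords filtered out when important=False
)
list_pdf_v3(configs, "./pdfs", debug=True)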
Example #2
import os

import requests
from bs4 import BeautifulSoup


def list_pdf_v2(
    configs,
    save_dir,
    name_in_url=True,
    extract_name=False,
    add_date=False,
    try_overwite=False,
    no_overwrite=False,
):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    html_page = requests.get(configs.webpage).text
    soup = BeautifulSoup(html_page, "html.parser")

    url_name = []

    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        pass
    extract_info(soup, configs, extract_name=extract_name)
    get_files(save_dir, configs.sleep_time, name_in_url=name_in_url, add_date=add_date)
Example #3
import os

from bs4 import BeautifulSoup

# html_page, web_path, save_dir, sleep_time and get_files are assumed to be
# defined earlier in the script this snippet was taken from.
soup = BeautifulSoup(html_page, "html.parser")
# print(soup)

url_name = []


def extract_info(soup):
    for link in soup.find_all("a"):
        if link.get("href") is None:
            continue
        if not link["href"].startswith(web_path):
            continue
        print(link.get("href"))
        url = str(link["href"])
        name = url[url.rindex("/"):]
        # name = name[:name.rindex('.')]
        with open("url_name.txt", "a") as output:
            output.write(url + ", " + name.strip("/") + "\n")
            # Uncomment following line if domain is not in href, and comment out line above
            # output.write(domain + web_path + ", " + name.strip("/") + "\n")
    print("Done")


try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass
extract_info(soup)
get_files(save_dir, sleep_time)
# import etl.py
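
get_files itself is not shown in these examples; as a rough sketch of the contract, each line of url_name.txt written above holds a URL and a file name separated by ", ", which a downloader could consume like this (the function name and logic below are hypothetical, not the project's real get_files):

# Hypothetical sketch of what get_files might do with the "url, name" lines
# written by extract_info above; the real get_files is defined elsewhere.
import os
import time

import requests


def download_listed_files(save_dir, sleep_time):
    with open("url_name.txt") as listing:
        for line in listing:
            url, name = line.rstrip("\n").split(", ", 1)
            response = requests.get(url)
            with open(os.path.join(save_dir, name), "wb") as pdf:
                pdf.write(response.content)
            time.sleep(sleep_time)  # pause between downloads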
Example #4
import json
import os

# `parsed`, `save_dir` and `configs` are assumed to be defined earlier in the
# script this snippet was taken from.
dumped = json.dumps(parsed, indent=4, sort_keys=True)

with open("response.json", "w") as output:
    output.write(dumped)
output.close()

try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass

with open("response.json", "r") as output:
    data = json.load(output)
    with open("url_name.txt", "w+") as outfile:
        for i in range(len(data) + 1):
            media_dict = data["media"][i]
            outfile.write(
                str(media_dict["frontend_url"]) + ", " +
                str(media_dict["name"]) + "\n")

    outfile.close()

try:
    os.remove("response.json")
except FileNotFoundError:
    pass

get_files(save_dir, configs.sleep_time)

# import etl.py
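
For reference, the loop above assumes response.json has a top-level "media" list whose entries carry "frontend_url" and "name" keys; the shape below is reconstructed from those accesses and the values are invented:

# Illustrative shape of the JSON this snippet expects; only the "media",
# "frontend_url" and "name" keys come from the code above, the values are made up.
example_response = {
    "media": [
        {"frontend_url": "https://example.org/files/minutes.pdf", "name": "minutes.pdf"},
        {"frontend_url": "https://example.org/files/agenda.pdf", "name": "agenda.pdf"},
    ],
}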