Exemplo n.º 1
0
def _load_database(path):
    """Load the table of file names from *path*, choosing the reader by extension.

    Supported extensions are ``pickle``, ``xlsx`` and ``csv`` (the text after
    the last dot, compared case-sensitively).

    Raises:
        ValueError: if the extension is none of the supported ones.
    """
    infile_ext = path.split('.')[-1]
    if infile_ext == 'pickle':
        # NOTE(security): unpickling can execute arbitrary code; only point
        # this script at pickle files you trust.
        return pd.read_pickle(path)
    if infile_ext == 'xlsx':
        return pd.read_excel(path)
    if infile_ext == 'csv':
        return pd.read_csv(path)
    raise ValueError("Unknown file format: %s" % infile_ext)


def _ask(prompt, choices):
    """Prompt repeatedly until the lower-cased answer is one of *choices*."""
    response = None
    while response not in choices:
        response = input(prompt).lower()
    return response


def _prepare_output_directory(path):
    """Make sure *path* is a usable output directory, asking the user if needed.

    Returns:
        ``None`` when the script should carry on, otherwise the exit code
        ``main`` should return (0 when the user chose to stop, 1 when *path*
        exists but is not a directory).
    """
    if not os.path.exists(path):
        response = _ask(
            "The given output directory does not exist. Create it? [Y/N]  ",
            ('y', 'n'))
        if response == 'y':
            os.mkdir(path)
            return None
        return 0

    if not os.path.isdir(path):
        print("ERROR: The given output directory is not a directory.")
        return 1

    if len(os.listdir(path)) > 0:
        print(
            "The given output directory is not empty. What would you like to do?"
        )
        print("  1      empty it")
        print("  2      continue")
        print("  3      exit")
        response = _ask("[1/2/3]  ", ('1', '2', '3'))
        if response == '1':
            shutil.rmtree(path)
            os.mkdir(path)
        elif response == '3':
            return 0
    return None


def _run(command):
    """Run *command* (an argument list, no shell involved); return its exit status.

    A missing executable is reported as status 127, which is what a shell
    would have returned, instead of raising.
    """
    # Imported locally because the file's import block is not part of this edit.
    import subprocess
    try:
        return subprocess.run(command, check=False).returncode
    except FileNotFoundError:
        return 127


def _scrape(browser, df, output_directory, diagonal):
    """Search every ``df['file name']`` entry on the loaded page and download the previews.

    Each downloaded image is saved into *output_directory* with ``wget``; when
    *diagonal* is not ``None`` the saved file is then passed through
    ``convert -resample``.
    """
    # Wait for the first batch of thumbnails before starting to search.
    try:
        element_present = expected_conditions.presence_of_element_located(
            (By.CLASS_NAME, 'img-asset-thumbnail'))
        WebDriverWait(browser, 20).until(element_present)
    except TimeoutException:
        print("timed out")

    pg = ProgressBar(len(df['file name']))

    for _, row in df.iterrows():
        # Type the file name into the site's search box and submit it.
        searchbar = browser.find_element_by_id('search')
        searchbar.clear()
        searchbar.send_keys(row['file name'])
        searchbar.send_keys(Keys.RETURN)

        time.sleep(0.2)

        # Wait for the results to be present and clickable.
        try:
            # CursorNotWaiting is defined elsewhere; presumably it waits for
            # the page's busy cursor to clear -- confirm against its definition.
            done_loading = CursorNotWaiting(browser)
            WebDriverWait(browser, 20).until(done_loading)
            element_present = expected_conditions.presence_of_element_located(
                (By.CLASS_NAME, 'img-asset-thumbnail'))
            WebDriverWait(browser, 20).until(element_present)
            is_clickable = expected_conditions.element_to_be_clickable(
                (By.CLASS_NAME, 'img-asset-thumbnail'))
            WebDriverWait(browser, 20).until(is_clickable)
        except TimeoutException:
            print("\atimed out")

        time.sleep(2)

        # Retry the whole result list whenever the DOM changes under us.
        while True:
            try:
                items = browser.find_elements_by_class_name(
                    'img-asset-thumbnail')
                if len(items) > 1:
                    print(
                        "\n\aWARNING: multiple images were found for entry %s."
                        % row['file name'])
                for item in items:
                    try:
                        item.click()
                    except (ElementClickInterceptedException,
                            ElementNotInteractableException):
                        time.sleep(0.2)
                        continue

                    try:
                        element_present = expected_conditions.presence_of_element_located(
                            (By.ID, 'preview-img'))
                        WebDriverWait(browser, 20).until(element_present)
                    except TimeoutException:
                        print("\atimed out")

                    img = browser.find_element_by_id('preview-img')
                    src = img.get_attribute('src')
                    # The saved name is chosen by wget, so find it by diffing
                    # the directory listing before and after the download.
                    before = os.listdir(output_directory)
                    # Argument list instead of a shell string: URLs routinely
                    # contain '&' and paths may contain spaces.
                    code = _run([
                        "wget", "--content-disposition",
                        "--trust-server-names", "--quiet", "-P",
                        output_directory, str(src)
                    ])
                    if code != 0:
                        print("\afailed to download")
                        continue
                    after = os.listdir(output_directory)
                    new_files = set(after) - set(before)
                    if not new_files:
                        # wget reported success but nothing new appeared.
                        print("\adownload produced no new file")
                        continue
                    filename = os.path.join(output_directory, new_files.pop())
                    if diagonal is not None:
                        _run([
                            "convert", "-resample",
                            str(diagonal), filename, filename
                        ])
                    pg.advance()
                break
            except (StaleElementReferenceException, NoSuchElementException):
                time.sleep(0.2)
                continue

    pg.finish()


def main():
    """Command-line entry point: download the preview image of every database entry.

    Positional arguments are the database file (``.pickle``, ``.xlsx`` or
    ``.csv`` with a ``file name`` column) and, optionally, the URL to open.
    ``-o/--output-directory`` selects where images are saved (default: the
    current working directory) and ``-d/--diagonal`` is forwarded to
    ``convert -resample`` for each downloaded file.

    Returns:
        0 on success or when the user chooses to stop at a prompt, 1 when the
        output path exists but is not a directory.

    Raises:
        ValueError: if the database file has an unsupported extension.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("database",
                    action='store',
                    type=str,
                    help="Database for the script to get its file names from")
    ap.add_argument(
        "url",
        action='store',
        type=str,
        default="http://acervos.ims.com.br/#/search?filtersStateId=5",
        nargs='?',
        help="URL to access")
    ap.add_argument("-o",
                    "--output-directory",
                    action='store',
                    type=str,
                    default=os.getcwd(),
                    help="Output directory for the downloaded images")
    ap.add_argument(
        "-d",
        "--diagonal",
        action='store',
        type=int,
        help=
        "Diagonal size for the downloaded images in pixels (they will be resized)"
    )  # Use -d 50
    args = ap.parse_args()

    df = _load_database(args.database)

    exit_code = _prepare_output_directory(args.output_directory)
    if exit_code is not None:
        return exit_code

    browser = webdriver.Chrome()
    try:
        # Get the webpage and let the user apply the filter by hand.
        browser.get(args.url)
        input("Filter the page for only photographs, then hit <ENTER>.  ")
        _scrape(browser, df, args.output_directory, args.diagonal)
    finally:
        # Always shut the driver down so no browser process is left behind.
        browser.quit()

    return 0
Exemplo n.º 2
0
from scoop import futures

import time

from progressbar import ProgressBar


def f(i):
    """Stand-in task for the demo: pause for 20 ms and return ``None``.

    The argument ``i`` is accepted so the function can be mapped over a
    range, but it is not used.
    """
    delay_seconds = 0.02
    time.sleep(delay_seconds)


if __name__ == '__main__':
    # Map f over n inputs through scoop, then advance the bar once for each
    # result as it is consumed from the returned iterable.
    n = 100
    results = futures.map(f, range(n))
    bar = ProgressBar(n)
    for i, _ in enumerate(results):
        bar.advance(i)
    bar.close()