Пример #1
0
    def run(self, driver, persistor):
        """Open Google, type a query in the search box and persist a sample.

        Arguments:
            driver -- browser driver used to load the page and locate the
                search input (looked up by its 'Pesquisar' title attribute)
            persistor -- object whose ``save_one`` receives the document
        """
        driver.get("http://google.com")

        search_box = driver.find_element_by_xpath(
            "//input[@title='Pesquisar']")
        search_box.send_keys("Hello world")

        sample = Document(1, "The quick brown fox jumps over the lazy dog")
        persistor.save_one(sample)
Пример #2
0
def test_make_valid_path_duplicated_id():
    """A clashing document ID is redirected to the duplicates folder.

    A file named after the document ID is created beforehand so that
    ``make_valid_path`` hits its name-clash branch; the returned path must
    then start with the ``duplicates/document-`` prefix (the rest is a
    generated unique ID, so only the prefix can be compared).
    """
    import os

    document = Document("dummy", "Hello world")
    base_path = tmpFile("scrappy")
    clashing_file = tmpFile("scrappy/dummy")
    expected_prefix = tmpFile("scrappy/duplicates/document-")

    # The base directory must exist, otherwise the clashing file cannot be
    # created and the function under test would take its orphans fallback.
    os.makedirs(base_path, exist_ok=True)
    if not path.exists(clashing_file):
        open(clashing_file, 'a').close()

    # Compare against the whole prefix instead of a fixed-length slice, so
    # the check does not depend on how long the temp directory path is.
    assert make_valid_path(base_path, document).startswith(expected_prefix)
Пример #3
0
def test_persistor_saves_document():
    """A document saved through FileSystemPersistor ends up on disk.

    The persistor is started, given one document, then shut down and
    joined before the file is inspected (presumably it writes in the
    background, so the join makes sure the write has finished).
    """
    doc = Document(1, "Hello world")
    saved_path = tmpFile("1")

    persistor = FileSystemPersistor(gettempdir())
    persistor.start()
    persistor.save_one(doc)
    persistor.shutdown()
    persistor.join()

    try:
        assert path.exists(saved_path)
        # Close the handle deterministically instead of leaking it.
        with open(saved_path, "r") as saved_file:
            assert saved_file.read() == "Hello world"
    finally:
        # Remove the artifact even when an assertion fails, so a failed run
        # does not leave a stale file behind for the next one.
        if path.exists(saved_path):
            remove(saved_path)
Пример #4
0
    def run(self, driver, persistor):
        """Open every item of the listing table and persist its page source.

        The item count is read from the number in parentheses in the table
        caption. For each item the detail page is opened through the image
        in the sixth column, the document ID is extracted from the
        ``onclick`` attribute of a link on that page, the page source is
        saved under that ID, and the browser navigates back to the listing.

        Arguments:
            driver -- browser driver positioned on the listing page
            persistor -- object whose ``save_one`` receives each document
        """
        table_header = driver.find_element_by_xpath(
            "/html[1]/body[1]/div[2]/div[2]/form[2]/table[1]/caption[1]")
        n_items = int(
            re.match(r'.*\(([0-9]*)\).*', table_header.text).group(1))

        for i in range(n_items):
            # The icons are looked up again on every pass; presumably the
            # handles from a previous pass are no longer usable after the
            # click / back navigation.
            detail_icons = driver.find_elements_by_css_selector(
                'td:nth-child(6) img')
            detail_icons[i].click()

            button_text = driver.find_elements_by_css_selector(
                "#j_id_jsp_188201070_4 > table > tbody > tr:nth-child(40) > td > a"
            )[0].get_attribute("onclick")
            doc_id = re.match(r".*'id':'(\d*)'", button_text).group(1)

            persistor.save_one(Document(doc_id, driver.page_source))
            driver.back()

            # The loop index used to be concatenated right after the ID,
            # which made the logged ID unreadable; log the ID on its own.
            debug('Scrapped {} of {}. ID={}'.format(i + 1, n_items, doc_id))
Пример #5
0
def make_valid_path(base_path, document):
    """Creates a valid file path from a directory and a document

    Uses globally unique IDs on name clash and tmp folder on invalid dir.

    When ``base_path`` does not exist, the path is built inside the
    ``scrappy/orphans`` temp folder instead (created on demand). When a
    file named after the document ID already exists, a new document named
    ``document-<uuid4>`` is placed in a ``duplicates`` sub-folder of the
    directory in use.

    Arguments:
        base_path {str} -- Directory where document should be saved
        document {Document} -- Document to be saved

    Returns:
        str -- Path at which the document can be written
    """
    if not path.exists(base_path):
        orphans_dir = tmpFile("scrappy/orphans")
        # Report the directory actually used rather than a hard-coded one.
        debug("Directory at {} does not exist, falling back to {}".format(
            base_path, orphans_dir))

        if not path.exists(orphans_dir):
            makedirs(orphans_dir)

        return make_valid_path(orphans_dir, document)

    # IDs are not guaranteed to be strings (numeric IDs are common), and
    # path.join rejects non-string components.
    file_path = path.join(base_path, str(document.id))
    if not path.exists(file_path):
        return file_path

    duplicates_dir = path.join(base_path, "duplicates")
    debug("File at {} already exists, falling back to a uuid and "
          "saving to {}".format(file_path, duplicates_dir))

    unique_id = "document-{}".format(uuid4())
    renamed_document = Document(unique_id, document.data)
    # Presumably creates the directory when it is missing.
    ensure_dir(duplicates_dir)

    return make_valid_path(duplicates_dir, renamed_document)
Пример #6
0
def test_in_memory_persistor():
    """Saving one document leaves exactly that document in ``data``."""
    persistor = InMemoryPersistor()
    document = Document(1, "vinicius misael")

    persistor.save_one(document)

    assert len(persistor.data) == 1
    assert persistor.data[0] == document
Пример #7
0
def test_make_valid_path_base_path():
    """A missing base directory redirects the path to the orphans folder."""
    document = Document("unique-id", "Hello world")
    missing_dir = "/1/2/3/4/5"

    resolved = make_valid_path(missing_dir, document)
    expected = tmpFile("scrappy/orphans/unique-id")

    assert resolved == expected
Пример #8
0
def test_make_valid_path_valid_arguments():
    """An existing directory and an unused ID yield ``<dir>/<id>``.

    A fresh temporary directory is used as the base path so the test does
    not depend on a particular user's home directory being present: the
    directory is guaranteed to exist and to contain no clashing file.
    """
    import os
    import tempfile

    document = Document("unique-id", "Hello world")

    with tempfile.TemporaryDirectory() as base_path:
        expected = os.path.join(base_path, "unique-id")
        assert make_valid_path(base_path, document) == expected