def run(self, driver, persistor):
    """Fills an input on google.com and persists one sample document

    Arguments:
        driver -- Browser driver used to open the page and find the input
        persistor -- Object whose ``save_one`` receives the sample document
    """
    driver.get("http://google.com")

    # The target input is located through its title attribute.
    search_input = driver.find_element_by_xpath(
        "//input[@title='Pesquisar']")
    search_input.send_keys("Hello world")

    persistor.save_one(
        Document(1, "The quick brown fox jumps over the lazy dog"))
def test_make_valid_path_duplicated_id():
    """A document whose ID clashes with an existing file goes to duplicates

    Creates ``scrappy/dummy`` under the temp location, then checks that the
    path returned for a document with ID "dummy" starts with
    ``scrappy/duplicates/document-``.
    """
    # Local import so the fixture directory can be created by the test itself.
    from os import makedirs

    document = Document("dummy", "Hello world")
    base_path = tmpFile("scrappy")

    # Do not rely on an earlier run having left the directory behind;
    # without it the open() below fails before anything is tested.
    if not path.exists(base_path):
        makedirs(base_path)

    clashing_file = tmpFile("scrappy/dummy")
    if not path.exists(clashing_file):
        open(clashing_file, 'a').close()

    # Compare by prefix rather than a fixed-length slice: the prefix length
    # depends on where the temp location lives on this machine.
    expected_prefix = tmpFile("scrappy/duplicates/document-")
    assert make_valid_path(base_path, document).startswith(expected_prefix)
def test_persistor_saves_document():
    """FileSystemPersistor writes a saved document's data to ``<tmp>/<id>``

    Starts the persistor, saves one document, shuts it down and joins it,
    then checks that the file exists and holds the document's data.
    """
    doc = Document(1, "Hello world")
    saved_path = tmpFile("1")
    persistor = FileSystemPersistor(gettempdir())

    try:
        persistor.start()
        persistor.save_one(doc)
        persistor.shutdown()
        persistor.join()

        assert path.exists(saved_path)
        # Context manager so the handle is closed even if the assert fails.
        with open(saved_path, "r") as saved_file:
            assert saved_file.read() == "Hello world"
    finally:
        # Clean up on failure too, so a stale file cannot affect later runs.
        if path.exists(saved_path):
            remove(saved_path)
def run(self, driver, persistor):
    """Opens every item of the listing table and persists each page

    The number of items is read from the parenthesised number in the table
    caption. For each item the matching image in the sixth column is
    clicked, the document ID is extracted from the ``onclick`` attribute of
    a link on the opened page, the page source is saved as a ``Document``
    and the browser navigates back.

    Arguments:
        driver -- Browser driver already positioned on the listing page
        persistor -- Object whose ``save_one`` receives each document
    """
    # Compiled once here instead of being looked up on every iteration.
    count_pattern = re.compile(r'.*\(([0-9]*)\).*')
    id_pattern = re.compile(r".*'id':'(\d*)'")

    table_header = driver.find_element_by_xpath(
        "/html[1]/body[1]/div[2]/div[2]/form[2]/table[1]/caption[1]")
    n_items = int(count_pattern.match(table_header.text).groups()[0])

    for i in range(n_items):
        # The icons are looked up again on every pass, after driver.back()
        # (presumably earlier element handles do not survive navigation).
        driver.find_elements_by_css_selector(
            'td:nth-child(6) img')[i].click()

        button_text = driver.find_elements_by_css_selector(
            "#j_id_jsp_188201070_4 > table > tbody > tr:nth-child(40) > td > a"
        )[0].get_attribute("onclick")
        doc_id = id_pattern.match(button_text).groups()[0]

        persistor.save_one(Document(doc_id, driver.page_source))
        driver.back()

        # The loop index used to be appended right after the ID, which made
        # the logged ID unreadable; the progress counter already carries it.
        debug('Scrapped {} of {}. ID={}'.format(i + 1, n_items, doc_id))
def make_valid_path(base_path, document):
    """Creates a valid file path from a directory and a document

    Uses globally unique IDs on name clash and tmp folder on invalid dir.

    Arguments:
        base_path {str} -- Directory where document should be saved
        document {Document} -- Document to be saved

    Returns:
        str -- A path that did not exist at the time it was checked:
            ``base_path/<id>`` when possible, otherwise a path inside the
            orphans folder (missing ``base_path``) or inside a
            ``duplicates`` sub-folder (name clash).
    """
    if not path.exists(base_path):
        orphans_path = tmpFile("scrappy/orphans")
        # Report the real fallback location rather than a hard-coded one.
        debug("Directory at {} does not exist, falling back to {}".format(
            base_path, orphans_path))
        if not path.exists(orphans_path):
            makedirs(orphans_path)
        return make_valid_path(orphans_path, document)

    # str() lets documents with non-string IDs (e.g. integers) be saved too.
    file_path = path.join(base_path, str(document.id))

    if path.exists(file_path):
        duplicates_path = path.join(base_path, "duplicates")
        # The duplicate lives next to the clashing file, under
        # base_path/duplicates, so that is the location that gets logged.
        debug(
            "File at {} already exists, falling back to uuid and saving to {}"
            .format(file_path, duplicates_path))
        unique_id = "document-{}".format(str(uuid4()))
        renamed_document = Document(unique_id, document.data)
        ensure_dir(duplicates_path)
        # Recurse so the new name goes through the same checks.
        return make_valid_path(duplicates_path, renamed_document)

    return file_path
def test_in_memory_persistor():
    """Saving one document leaves exactly that document in ``data``"""
    persistor = InMemoryPersistor()
    document = Document(1, "vinicius misael")

    persistor.save_one(document)

    assert len(persistor.data) == 1
    assert persistor.data[0] == document
def test_make_valid_path_base_path():
    """A base directory that does not exist falls back to the orphans folder"""
    missing_dir = "/1/2/3/4/5"
    orphan = Document("unique-id", "Hello world")

    resolved = make_valid_path(missing_dir, orphan)

    assert resolved == tmpFile("scrappy/orphans/unique-id")
def test_make_valid_path_valid_arguments():
    """An existing directory and an unused ID yield ``<directory>/<id>``"""
    # Local imports: a fresh temporary directory replaces the hard-coded
    # home directory, which only exists on one particular machine.
    import os
    import shutil
    import tempfile

    document = Document("unique-id", "Hello world")
    base_path = tempfile.mkdtemp()
    try:
        assert make_valid_path(base_path, document) == os.path.join(
            base_path, "unique-id")
    finally:
        # Remove the directory whether or not the assertion held.
        shutil.rmtree(base_path)