def main():
    """Scrape episode titles and .mp3 links from a podbay.fm podcast page.

    Scrolls the page several times so lazy-loaded episodes render, then
    parses the resulting HTML with BeautifulSoup, printing and collecting
    every anchor title and every href ending in ".mp3".
    """
    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)
    url = 'https://podbay.fm/p/sach-noi-danh-cho-ban/'
    driver.get(url)
    # Send END to the body a few times so the page lazy-loads more episodes.
    el = driver.find_element_by_tag_name('body')
    for _ in range(4):
        el.send_keys("webdriver" + Keys.END)
        sleep(3)
    titles = []
    urls = []
    content = driver.page_source
    soup = BeautifulSoup(content, 'lxml')
    # 'jsx-1043497740' is a podbay-generated class on episode anchors; it is
    # brittle and may change whenever the site is redeployed.
    for a in soup.findAll('a', href=True, attrs={'class': 'jsx-1043497740'}):
        title = a.string
        if title is not None:  # anchors without a direct text child are skipped
            print(title)
            titles.append(title)
        link = a.get('href')
        if link.endswith(".mp3"):
            print(link)
            urls.append(link)
    driver.close()
    print(titles)
    print(urls)
def checkEmailBreached():
    """Check an email address against haveibeenpwned's unified search.

    Prompts for the address, loads the JSON endpoint in headless-less Edge
    (to get past the bot check), then prints every breach and any public
    pastes found.  A non-breached address is reported as such.
    """
    email = input(default_color + "Email Address> " + reset)
    print()
    url = f"https://haveibeenpwned.com/unifiedsearch/{email}"
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('ignore-certificate-errors')
    options.add_argument("--log-level=OFF")
    driver = Edge(options=options,
                  executable_path=r"WebDriver\msedgedriver.exe")
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        # FIX: previously the browser was only closed on the success path,
        # leaking a driver process for every non-breached lookup.
        driver.close()
    try:
        # The JSON body is wrapped in the browser's <pre> viewer markup;
        # strip that wrapper before parsing.
        dirty_response = page_source.split(
            '<html><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">')[1]
        raw_json = dirty_response.split("</pre></body></html>")[0]
        res = json.loads(raw_json)
        print(heading_color + "---------------------Check For Breached Email--------------" + reset)
        for i in range(len(res['Breaches'])):
            print(
                content_color + f"Name : {res['Breaches'][i]['Name']}\n"
                f"Title : {res['Breaches'][i]['Title']}\n"
                f"Domain : {res['Breaches'][i]['Domain']}\n"
                f"Breached On : {res['Breaches'][i]['BreachDate']}\n{reset}"
                f"{heading_color}--------------------------------------\n{reset}"
            )
        if res['Pastes'] is None:
            print(fg("red") + "[*] No Public Paste Found" + reset)
        else:
            print(heading_color + "[*] Public Paste Found\n" + reset)
            pastes = res["Pastes"]
            for i in range(len(pastes)):
                print(content_color + f"Source : {pastes[i]['Source']}\n"
                      f"Title : {pastes[i]['Title']}\n"
                      f"Date : {pastes[i]['Date']}\n"
                      f"EmailCount : {pastes[i]['EmailCount']}\n")
                if pastes[i]['Source'] == "Pastebin":
                    print(
                        f"Paste URL : https://pastebin.com/{pastes[i]['Id']}\n{reset}"
                        f"{heading_color}----------------------------------------------------{reset}"
                    )
    except (IndexError, KeyError, json.JSONDecodeError):
        # A non-breached address returns a 404 page without the <pre> wrapper,
        # so the [1] index (or a missing key) fails — that is the expected
        # "not breached" path, not an error.  (Was a bare `except:`.)
        print(
            fg("red") +
            f"[*] The provided Email {email} is not breached!" + reset)
def chinahpo(hpo):
    """Fetch the chinahpo.org search-result page for a single HPO term.

    Opens headless Edge through a random proxy IP with a random User-Agent,
    saves the rendered page to html2/hp_<id>.html and appends the term to
    finish.txt as a completion marker.

    :param hpo: HPO identifier in the form "HP:0000123".
    """
    ip = randomIP()
    # ip = "socks5://127.0.0.1:1080"
    print("使用IP " + ip)
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("headless")
    options.add_argument("--proxy-server={ip}".format(ip=ip))
    # Reduce the obvious automation fingerprints.
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
    driver = Edge(options=options, executable_path=msedge)
    try:
        # NOTE: execute_script only patches the page currently loaded; to hide
        # navigator.webdriver on every navigation this would need
        # Page.addScriptToEvaluateOnNewDocument via execute_cdp_cmd.
        script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        driver.execute_script(script)
        UA = randomUA()
        driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": UA})
        print(driver.execute_script("return navigator.userAgent;"))
        hpid = hpo.split(":")[1]
        url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
            hpid=hpid)
        try:
            driver.get(url)
            print("网址:", url)
        except Exception:
            print("get page error", hpo)
        time.sleep(2)
        with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
            f.write(str(driver.page_source))
    finally:
        # FIX: ensure the browser is released even if a CDP call or the
        # page dump raises; previously it leaked on any exception.
        driver.close()
    # FIX: use a context manager instead of a bare open()/close() pair.
    with open("finish.txt", "a") as fin:
        fin.write(hpo + "\n")
def judith(SITE):
    """Track a package on SITE using the global tracking CODE.

    Drives an Edge browser (driver binary at the global PATH) through the
    tracking form, stores a screenshot of the result in 'consulta.png', and
    fails with an AssertionError when the site reports no match.
    """
    browser = Edge(PATH)
    browser.get(SITE)
    # The tracking form field is located by its name attribute; ID or CSS
    # would work just as well.
    campo = browser.find_element_by_name("objetos")
    campo.clear()
    # Type the tracking code and submit with ENTER to reach the next page.
    campo.send_keys(CODE)
    campo.send_keys(Keys.RETURN)
    # TODO: full headless mode with terminal status output belongs here.
    # For now a screenshot of the tracking progress stands in for it.
    browser.save_screenshot('consulta.png')
    # An unknown code makes the site render this exact message.
    assert "Sem bagulhos encontrados, mano." not in browser.page_source
    # Close the program and the browser window.
    browser.close()
def verify_account(url):
    """Complete an email-verification flow for *url*.

    Opens the page in headless Edge, extracts the confirmation link from the
    styled anchor, then follows that link with cfscrape.  Progress is written
    to the GUI window's "Verified_Email" pane.
    """
    scraper = cfscrape.create_scraper()
    opts = EdgeOptions()
    opts.use_chromium = True
    for flag in ("disable-gpu", 'headless', 'ignore-certificate-errors'):
        opts.add_argument(flag)
    browser = Edge(options=opts,
                   executable_path=r"WebDriver\msedgedriver.exe")
    browser.get(url)
    # The confirmation anchor is identified by its inline Spotify-green style.
    anchor = browser.find_element_by_xpath(
        r"//a[@style='text-decoration: none; color: #1ed760']")
    link = anchor.get_attribute('href')
    browser.close()
    result = scraper.get(url=link).text
    if "all set" in result:
        window["Verified_Email"].print(f"[*] Verification Completed")
    else:
        window["Verified_Email"].print(f"[*] {result}")
# --- Bing search automation entry script -------------------------------------
# Reads CLI options, loads the per-host config for the Edge user-data
# directory, opens Edge and runs the configured number of Bing searches.

mswebdriverpath = parseargs.getMSWebDriverPath()
number_of_searches = parseargs.getNumSearches()
start_number = parseargs.getStartNum()

# open config.json and get user_data_dir
# The config file is per machine: config/config-<hostname>.json
config_path = os.path.join('config', 'config-' + socket.gethostname() + '.json')
with open(config_path) as json_data_file:
    config = json.load(json_data_file)
user_data_dir = config["user.data.dir.edge"]
print("user.data.dir.edge: " + user_data_dir)

# open edge and get going!
# NOTE(review): user_data_dir is read above but the capabilities below
# hard-code /tmp/temp_profile — confirm which profile dir is intended.
desired_cap = {
    "args": ["userDataDir=/tmp/temp_profile"],
    "userDataDir": "/tmp/temp_profile"
}
browser = Edge(executable_path=mswebdriverpath, capabilities=desired_cap)

# go to Bing
browser.get("http://www.bing.com")
searchText = "test"
SearchUtil.runSearches(browser, searchText, number_of_searches, start_number)
browser.close()
class QCourse:
    """Downloader for ke.qq.com (Tencent Classroom) course videos.

    Drives Edge to log in, replays saved cookies, sniffs the media (.ts) and
    decryption-key (get_dk) URLs from the browser's performance entries, and
    hands them to download_single(); document lessons fall back to
    download_zip_doc().
    """

    def __init__(self):
        # Initialise browser options (download dir, quiet logging, muted audio).
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")
        self.login_url = 'https://ke.qq.com/'
        # Passing `options` raised errors on macOS, so they were dropped here.
        # On Windows, use msedgedriver.exe from the path instead (comment out
        # the line below and re-enable the next one).
        self.driver = Edge(executable_path=os.path.join(
            BASE_DIR, 'msedgedriver'), capabilities={})
        # self.driver = Edge(executable_path='msedgedriver.exe', options=self.options)
        # self.driver = Edge(executable_path=os.path.join(BASE_DIR, 'msedgedriver'), capabilities=desired_cap, options=self.options)

    def login(self):
        """Open the QQ login popup, wait up to 5 minutes for a manual login,
        then persist the session cookies to cookies.json."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)
        # The 'ptlogin-mask' overlay disappears once the user has logged in.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))
        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def _get_video(self, video_url=None, path=None, index=None):
        """Download a single lesson at *video_url* into *path*.

        When *index* is given it is used as a zero-padded prefix on the
        filename.  Falls back to downloading an attachment if the page turns
        out to be a document lesson rather than a video.
        """
        if not video_url:
            print('请输入视频url!')
        # A single get() does not navigate (the page appears to call
        # preventDefault), so the URL is loaded twice.
        self.driver.get(video_url)
        self.driver.get(video_url)
        try:
            # Wait for the player clock to appear and start ticking.
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
            WebDriverWait(
                self.driver,
                60).until_not(lambda driver: driver.find_element_by_class_name(
                    'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')
            title = self.driver.title
            if index is not None:
                title = "{:02}_{}".format(index, title)
            # Sniff the stream (.ts) and decryption-key (get_dk) URLs from the
            # browser's performance entries.
            networks = self.driver.execute_script(
                'return window.performance.getEntries()')
            ts_url = key_url = ''
            for network in networks:
                if '.ts?start' in network.get('name'):
                    ts_url = network.get('name')
                elif 'get_dk' in network.get('name'):
                    key_url = network.get('name')
            download_single(ts_url, key_url, title, path)
        except TimeoutException:
            # Timed out: likely a downloadable document rather than a video —
            # look for a download button and fetch the attachment instead.
            title = self.driver.title
            try:
                down_btn = self.driver.find_element_by_class_name(
                    'download-btn')
                if down_btn.text == '下载资料':
                    url = down_btn.get_attribute('href')
                    download_zip_doc(url, title, path)
            except Exception:
                print('没有找到视频,也没有找到可下载的文件,可能是还未开课')

    def get_video(self, video_url=None, path=None, index=None):
        """Download one URL, or each URL in a list (skipping falsy entries)."""
        if isinstance(video_url, list):
            for url in video_url:
                if url:
                    self._get_video(url, path, index)
        else:
            self._get_video(video_url, path, index)

    def load_cookies(self):
        """Load cookies.json (logging in first if absent) into the driver,
        plus any extra cookies supplied by utils.get_cookies_dic_list()."""
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        self.driver.get(self.login_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        for cookie in utils.get_cookies_dic_list():
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': False,
                'name': cookie[0],
                'path': '/',
                'secure': False,
                'value': cookie[1]
            })
def chooseAccount():
    """Log in to Instagram with the saved account and download post images.

    Reads credentials from data.txt, asks for a post link and a photo count,
    clicks through the post's carousel collecting <img> sources, downloads
    each into a fresh ./image folder and opens it, then returns to the menu.
    """
    with open('data.txt') as json_file:
        data = json.load(json_file)
    print('account: ' + data['username'])
    userName = data['username']
    passWord = data['password']
    print("link:")
    link = input()
    print("number of photos: ")
    amount = int(input())

    # auto login (headless Edge)
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('headless')
    driver = Edge('msedgedriver', options=options)
    driver.get(link)
    time.sleep(2)
    userForm = driver.find_element_by_css_selector("input[name='username']")
    passForm = driver.find_element_by_css_selector("input[name='password']")
    userForm.send_keys(userName)
    passForm.send_keys(passWord)
    driver.find_element_by_css_selector("button[type='submit']").click()
    time.sleep(3)
    # Dismiss the "save login info" dialog (class names are obfuscated and
    # brittle).
    driver.execute_script("document.querySelector('.sqdOP.yWX7d.y3zKF').click()")

    # Collect image links from the carousel.
    time.sleep(2)
    if amount > 1:
        spriteBtn = driver.find_element_by_css_selector(
            ".coreSpriteRightChevron")
    list_link = []

    def get_url1():
        # First slide: take the first visible image.
        list_element = driver.find_elements_by_css_selector(
            "img[style='object-fit: cover;']")
        for image in list_element[:1]:
            list_link.append(image.get_attribute("src"))

    def get_url2():
        # Later slides: the first visible image was already collected.
        list_element = driver.find_elements_by_css_selector(
            "img[style='object-fit: cover;']")
        list_element.pop(0)
        for image in list_element[:1]:
            list_link.append(image.get_attribute("src"))

    for x in range(0, amount + 1):
        if len(list_link) > 0:
            get_url2()
        else:
            get_url1()
        if len(list_link) == amount:
            break
        elif spriteBtn:
            spriteBtn.click()
        else:
            break
        time.sleep(0.5)

    # Recreate a clean ./image folder.
    if os.path.isdir("./image"):
        rmtree("./image")
    folderPath = os.path.join(os.getcwd(), 'image')  # was "\image" (bad escape)
    os.mkdir(folderPath)

    # clear screen and show a quick progress bar
    clear = lambda: os.system('cls')
    clear()
    for i in tqdm(range(100)):
        pass
    print("\nnumber of photos:", len(list_link))

    pos = 0
    for href in list_link:
        print(pos + 1, "DONE")
        imagePathResult = "./image/image_" + str(pos) + ".png"
        try:
            downloadFile(href)
            copy("./image/image.png", imagePathResult)
        except Exception:
            # FIX: was `print("error at %s" %pos+1)` which raised TypeError
            # (str + int) because % binds tighter than +.
            print("error at %s" % (pos + 1))
        pos += 1
    os.remove("./image/image.png")
    resultPath = os.path.join(os.getcwd(), 'image')
    os.startfile(resultPath)
    driver.close()
    chooseMenu()


# Module entry: pick the menu variant depending on whether the saved-account
# file (`path`, defined earlier in this file) already exists.
if os.path.isfile(path):
    key = 2
else:
    key = 1
menu(key)
class QCourse:
    """Downloader for ke.qq.com (Tencent Classroom) course videos — Windows
    variant that drives msedgedriver.exe with the full option set."""

    def __init__(self):
        # Initialise browser options (download dir, quiet logging, muted audio).
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")
        self.login_url = 'https://ke.qq.com/'
        self.driver = Edge(executable_path='msedgedriver.exe',
                           options=self.options)

    def login(self):
        """Open the QQ login popup, wait up to 5 minutes for a manual login,
        then persist the session cookies to cookies.json."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)
        # The 'ptlogin-mask' overlay disappears once the user has logged in.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))
        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def get_video(self, video_url=None, path=None):
        """Replay cached cookies, open the lesson, wait for playback to start,
        sniff the .ts stream and get_dk key URLs from the performance entries,
        and download via download_single()."""
        if not video_url:
            print('请输入视频url!')
        # os.chdir(BASE_DIR)
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        # Load once so add_cookie has a matching domain, then reload with the
        # cookies applied.
        self.driver.get(video_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        self.driver.get(video_url)
        # Wait for the player clock to appear and start ticking.
        WebDriverWait(self.driver, 300).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
        WebDriverWait(
            self.driver,
            300).until_not(lambda driver: driver.find_element_by_class_name(
                'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')
        networks = self.driver.execute_script(
            'return window.performance.getEntries()')
        ts_url = key_url = ''
        for network in networks:
            if '.ts?start' in network.get('name'):
                ts_url = network.get('name')
            elif 'get_dk' in network.get('name'):
                key_url = network.get('name')
        title = self.driver.title
        # catalog = self.driver.execute_script('return document.getElementsByClassName("task-item task-info active")'
        #                                      '[0].parentNode.firstElementChild.innerText')
        # os.chdir(os.path.join(os.getcwd(), catalog))
        download_single(ts_url, key_url, title, path)
def chinahpo(hpo_queue):
    """Worker: drain *hpo_queue* of HPO terms and save each term's
    chinahpo.org search page to html2/hp_<id>.html.

    Each fetch goes through a random proxy IP with matching geolocation and
    timezone overrides and a random User-Agent to reduce the automation
    fingerprint.  Completed terms are appended to finish.txt; proxies whose
    pages have a plausible size are logged to ip_check_better.txt.
    """
    while hpo_queue.empty() is not True:
        hpo = hpo_queue.get()
        # When not using an IP pool, wait a random 5-10 s between requests.
        s = random.randint(5, 10)
        print(hpo, "等待 " + str(s) + "秒")
        time.sleep(s)
        ip = randomIP()
        # ip = "socks5://127.0.0.1:1080"
        hpo_ip = hpo + "\t" + ip
        print(hpo_ip)
        options = EdgeOptions()
        options.use_chromium = True
        options.add_argument("headless")
        # options.add_argument("disable-gpu")
        options.add_argument("--proxy-server=http://{ip}".format(ip=ip))
        # Hide the usual automation fingerprints.
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches",
                                        ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        # Align the browser's reported geolocation/timezone with the proxy IP.
        geo = get_timezone_geolocation(ip)
        print(geo)
        geo_json = {"latitude": geo[1], "longitude": geo[2], "accuracy": 1}
        timezone = {"timezoneId": geo[0]}
        # Disable WebRTC side channels that would leak the real IP.
        preferences = {
            "webrtc.ip_handling_policy": "disable_non_proxied_udp",
            "webrtc.multiple_routes_enabled": False,
            "webrtc.nonproxied_udp_enabled": False
        }
        options.add_experimental_option("prefs", preferences)
        msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
        driver = Edge(options=options, executable_path=msedge)
        # NOTE(review): execute_script only patches the current page; hiding
        # navigator.webdriver on every navigation would need
        # Page.addScriptToEvaluateOnNewDocument.
        script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        driver.execute_script(script)
        UA = UserAgent().random
        # UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36"
        driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": UA})
        driver.execute_cdp_cmd("Emulation.setGeolocationOverride", geo_json)
        driver.execute_cdp_cmd("Emulation.setTimezoneOverride", timezone)
        print(driver.execute_script("return navigator.userAgent;"))
        hpid = hpo.split(":")[1]
        url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
            hpid=hpid)
        try:
            driver.get(url)
            strtemp = url
            print("网址:", strtemp)
        except Exception:
            print("get page error", hpo)
        time.sleep(2)
        with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
            f.write(str(driver.page_source))
        driver.close()
        fin = open("finish.txt", "a")
        fin.write(hpo + "\n")
        fin.close()
        # Pages in this byte-size window look like real result pages, so
        # remember the proxy that produced them for reuse.
        size = getDocSize("html2/hp_" + hpid + ".html")
        if 9000 <= size <= 15000:
            checkIP = open("ip_check_better.txt", "a")
            checkIP.write(hpo_ip + "\n")
            checkIP.close()
def main():
    """For each person listed in the CSV given as argv[4], log in to Twitter,
    search for the person's verified account, scrape recent tweets, and run
    sentiment / toxicity / Big-Five models to build a report per person.
    """
    args = sys.argv
    # CSV rows: name,profession,nationality,job
    names, profession, nationality, job = [], [], [], []
    # FIX: the input file was opened but never closed; use a context manager.
    with open(args[4], "r") as f:
        for line in f.readlines():
            array = line.split(",")
            names.append(array[0])
            profession.append(array[1])
            nationality.append(array[2])
            job.append(array[3].replace("\n", ""))
    for name in names:
        print("Query:", name, ".\nProcessing...")
        user = '******'
        search_term = f'{name} filter:verified'
        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        driver.get('https://www.twitter.com/login')
        driver.maximize_window()
        sleep(2)
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys(user)
        password = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        # SECURITY NOTE: hard-coded credential in source; move to an
        # environment variable or config file.
        password.send_keys('donkey123')
        password.send_keys(Keys.RETURN)
        sleep(1)
        search_input = driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.send_keys(search_term)
        search_input.send_keys(Keys.RETURN)
        sleep(1)
        driver.find_element_by_link_text('People').click()
        sleep(3)
        # Open the first (verified) account in the results.
        driver.find_element_by_xpath(
            '//div[@class="css-1dbjc4n r-j7yic r-qklmqi r-1adg3ll r-1ny4l3l"]'
        ).click()
        sleep(3)
        tweet_data = []
        start = 0
        end = 500
        # Scroll five screens, harvesting every visible tweet card each pass.
        for i in range(0, 5):
            sleep(1)
            cards = driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # FIX: removed the stray `cards[i]` probe — it duplicated work and
            # raised IndexError when fewer than i+1 cards were rendered.
            for card in cards:
                data = get_tweet_data(card)
                if data:
                    tweet_data.append(data)
            driver.execute_script(f'window.scrollTo({start},{end});')
            start += 500
            end += 500
        driver.close()
        tweets = set(tweet_data)  # de-duplicate tweets seen on multiple passes
        write_to_csv(name, tweets)
        df = pd.read_csv(f'{name}.csv')
        Twitter_sentiment = Twitter_sentiment_model(df)
        Twitter_toxic = Twitter_toxic_model(df)
        Big5 = Big5_model(df)
        create_report(name, tweets, Twitter_sentiment, Twitter_toxic, Big5)
class Sei:
    """Automation wrapper for the Brazilian SEI document-management web UI.

    Supports Chrome (chromedriver) or Edge (msedgedriver).  Window/frame
    bookkeeping: quick-search can open the result in a new window, so
    __windows_before/__windows_after track the handle count and go_to()
    switches (and later closes) the extra window.
    """

    # NOTE(review): declared as __area_inicial but start_driver()/close()
    # consistently use the misspelling __area_incial — it works, but the
    # class-level attribute below is effectively unused.
    __area_inicial = None
    __windows_before = 0
    __windows_after = 0

    def __init__(self, headless=False, executable_path='chromedriver'):
        """Create the browser; driver flavour is inferred from the path name."""
        if 'chromedriver' in executable_path:
            chrome_options = Options()
            chrome_options.add_argument('--enable-javascript')
            chrome_options.add_argument('--window-size=1440,900')
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--proxy-server='direct://'")
            chrome_options.add_argument("--proxy-bypass-list=*")
            chrome_options.add_argument("--start-maximized")
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--ignore-certificate-errors')
            if headless:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(executable_path=executable_path,
                                           options=chrome_options)
        elif 'msedgedriver' in executable_path:
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('enable-javascript')
            edge_options.add_argument('window-size=1440,900')
            edge_options.add_argument("disable-extensions")
            edge_options.add_argument("proxy-server='direct://'")
            edge_options.add_argument("proxy-bypass-list=*")
            edge_options.add_argument("start-maximized")
            edge_options.add_argument('disable-dev-shm-usage')
            edge_options.add_argument('no-sandbox')
            edge_options.add_argument('ignore-certificate-errors')
            if headless:
                edge_options.add_argument('headless')
                edge_options.add_argument('disable-gpu')
            self.driver = Edge(executable_path=executable_path,
                               options=edge_options)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def start_driver(self, url, usuario=None, senha=None):
        """Open *url* and log in, prompting for missing credentials."""
        if usuario == None:
            usuario = input('Digite o usuário: ')
        if senha == None:
            senha = getpass('Digite a senha: ')
        self.driver.get(url)
        usuario_field = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtUsuario")))
        senha_field = self.driver.find_element_by_id('pwdSenha')
        botao_acessar = self.driver.find_element_by_id('sbmLogin')
        usuario_field.clear()
        usuario_field.send_keys(usuario)
        senha_field.clear()
        senha_field.send_keys(senha)
        botao_acessar.click()
        alerta = self.fechar_alerta()
        if alerta:
            raise Exception(alerta)  # invalid username or password
        # Remember the unit (área) active at login so close() can restore it.
        self.__area_incial = self.get_area()

    def go_to(self, numero_sei):
        """Quick-search *numero_sei*, following a new window if one opens."""
        # Close the leftover result window from a previous go_to(), if any.
        if self.__windows_after > self.__windows_before:
            self.driver.close()
            self.driver.switch_to.window(
                self.driver.window_handles[self.__windows_before - 1])
        self.driver.switch_to.default_content()
        pesquisa = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtPesquisaRapida")))
        pesquisa.clear()
        pesquisa.send_keys(str(numero_sei))
        formPesquisaRapida = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located(
                (By.ID, "frmProtocoloPesquisaRapida")))
        self.__windows_before = len(self.driver.window_handles)
        formPesquisaRapida.submit()
        self.__windows_after = len(self.driver.window_handles)
        if self.__windows_after > self.__windows_before:
            self.driver.switch_to.window(
                self.driver.window_handles[self.__windows_after - 1])

    def is_processo_aberto(self, area=None, processo=None):
        """Return (aberto, mensagem): whether the process is open, optionally
        restricted to *area*; (None, error message) when the info panel is
        unreachable."""
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        try:
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            informacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "divInformacao")))
            mensagem = informacao.text
            aberto = 'aberto' in mensagem
            if area:
                # Only count it open if the message mentions *area* itself.
                regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
                matches = search(regex, mensagem)
                if matches:
                    aberto = True
                else:
                    aberto = False
            self.driver.switch_to.default_content()
        except:
            # NOTE(review): bare except — any failure is reported the same way.
            aberto = None
            mensagem = 'Impossível abrir mensagem do processo'
        return aberto, mensagem

    def get_processo_anexador(self, processo=None):
        """Return the number of the process this one is attached to, or None."""
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        procAnex = None
        if 'Processo anexado ao processo' in informacao.text:
            processoAnexador = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//*[@id=\"divInformacao\"]/div/a")))
            procAnex = processoAnexador.text
        self.driver.switch_to.default_content()
        return procAnex

    def get_area(self):
        """Return the text of the currently selected unit (área)."""
        self.driver.switch_to.default_content()
        select = Select(self.driver.find_element_by_id('selInfraUnidades'))
        return select.all_selected_options[0].text

    def seleciona_area(self, area):
        """Switch the active unit to *area*; True if already active or
        switched successfully, False when the unit is not available."""
        self.driver.switch_to.default_content()
        select = Select(self.driver.find_element_by_id('selInfraUnidades'))
        all_selected_options = select.all_selected_options
        for option in all_selected_options:
            if area == option.text:
                return True
        select = Select(self.driver.find_element_by_id('selInfraUnidades'))
        options = select.options
        for option in options:
            if area == option.text:
                select.select_by_visible_text(area)
                # Selecting triggers a page reload; wait for the select to
                # reappear before returning.
                Select(
                    WebDriverWait(self.driver, 3).until(
                        EC.presence_of_element_located(
                            (By.ID, 'selInfraUnidades'))))
                return True
        return False

    def clicar_botao(self, botao):
        """Click the toolbar button whose image title contains *botao*;
        True when found and clicked, False otherwise."""
        self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        arvore = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divArvoreAcoes")))
        botoes = arvore.find_elements(By.XPATH, '//*[@id=\"divArvoreAcoes\"]/a')
        for b in botoes:
            img = b.find_element(By.XPATH, 'img')
            if botao in img.get_attribute('title'):
                b.click()
                try:
                    # Some buttons pop a confirmation alert; give it a second.
                    WebDriverWait(self.driver, 1).until(
                        EC.alert_is_present(),
                        'Timed out waiting for PA creation ' +
                        'confirmation popup to appear.')
                except:
                    try:
                        self.driver.switch_to.default_content()
                    except:
                        None
                return True
        return False

    def fechar_alerta(self):
        """Accept a pending alert if present; return its text, else None."""
        alerta = None
        try:
            WebDriverWait(self.driver, 3).until(
                EC.alert_is_present(),
                'Timed out waiting for PA creation ' +
                'confirmation popup to appear.')
            alert = self.driver.switch_to.alert
            alerta = alert.text
            alert.accept()
            self.driver.switch_to.default_content()
        except TimeoutException:
            None
        return alerta

    def is_sobrestado(self, area=None, processo=None):
        """Return (sobrestado, extra): whether the process is suspended, plus
        either the info message or (when *area* is given) whether the message
        mentions that unit."""
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        sobrestado = 'sobrestado' in informacao.text
        mensagem = informacao.text
        self.driver.switch_to.default_content()
        if area:
            regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
            matches = search(regex, informacao.text)
            return sobrestado, matches != None
        else:
            return sobrestado, mensagem

    def sobrestar_processo(self, motivo, processo=None):
        """Suspend the process with justification *motivo*; True on success."""
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        if self.clicar_botao('Sobrestar Processo'):
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            self.driver.find_element(By.ID, 'divOptSomenteSobrestar').click()
            motivoField = self.driver.find_element(By.ID, 'txaMotivo')
            motivoField.clear()
            motivoField.send_keys(motivo)
            self.driver.find_element(By.ID, 'sbmSalvar').click()
            self.driver.switch_to.default_content()
            return True
        return False

    def remover_sobrestamento(self, processo=None):
        """Lift the suspension of the process; True on success."""
        if processo:
            self.go_to(processo)
        if self.clicar_botao('Remover Sobrestamento do Processo'):
            self.fechar_alerta()
            return True
        return False

    def publicar(self, resumo_ementa, data_disponibilizacao, documento=None,
                 dou=False, secao=None, pagina=None):
        """Schedule publication of *documento* (summary + release date);
        optionally also in the DOU gazette with section/page. True on success."""
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        if self.clicar_botao('Agendar Publicação'):
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            resumo_ementa_text_field = self.driver.find_element(
                By.ID, 'txaResumo')
            resumo_ementa_text_field.clear()
            resumo_ementa_text_field.send_keys(resumo_ementa)
            disponibilizacao = self.driver.find_element(
                By.ID, 'txtDisponibilizacao')
            disponibilizacao.clear()
            disponibilizacao.send_keys(data_disponibilizacao)
            if dou:
                select = Select(self.driver.find_element_by_id('selVeiculoIO'))
                select.select_by_visible_text('DOU')
                select = Select(
                    WebDriverWait(self.driver, 3).until(
                        EC.presence_of_element_located((By.ID, "selSecaoIO"))))
                # NOTE(review): conditional-expression precedence bug — this
                # evaluates as ("option[value='" + secao) if secao else
                # ("3" + "']"), so the CSS selector is malformed either way.
                WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         "option[value='" + secao if secao else '3' + "']")))
                select.select_by_visible_text(secao if secao else '3')
                pagina_text_field = self.driver.find_element(
                    By.ID, 'txtPaginaIO')
                pagina_text_field.clear()
                pagina_text_field.send_keys(pagina if pagina else '')
                disponibilizacao = self.driver.find_element(By.ID, 'txtDataIO')
                disponibilizacao.clear()
                disponibilizacao.send_keys(data_disponibilizacao)
            self.driver.find_element_by_id('btnSalvar').click()
            self.driver.switch_to.default_content()
            return True
        return False

    def get_conteudo_documento(self, documento=None):
        """Return the document body's innerHTML with whitespace normalised."""
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            documento_conteudo = self.driver.find_element_by_xpath(
                '/html/body').get_attribute('innerHTML')
            documento_conteudo = sub(
                r'\\n', '', documento_conteudo)  # remove page breaks
            documento_conteudo = sub(r'\s\s+?', ' ',
                                     documento_conteudo)  # collapse runs of whitespace
            # NOTE(review): this pattern looks like it was once a non-breaking
            # space (&nbsp;) that got mangled into a plain space — verify.
            documento_conteudo = sub(r' ', ' ',
                                     documento_conteudo)  # collapse spaces
            documento_conteudo = documento_conteudo.strip(
            )  # drop any remaining leading/trailing breaks
            return documento_conteudo
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_id(self, id, documento=None):
        """Return the text of the element with *id* inside the document frame."""
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            # Only descend into frames when no extra result window is open.
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            return self.driver.find_element_by_id(id).text
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_id(self, id, documento=None):
        """Return the texts of all elements with *id* inside the document frame."""
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            elements = self.driver.find_elements_by_id(id)
            return [element.text for element in elements]
        except:
            raise Exception('Conteúdo do documento %s não encontrado.'
                            % documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_xpath(self, xpath, documento=None):
        """Return the text of the element matching *xpath* inside the document frame."""
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            return self.driver.find_element_by_xpath(xpath).text
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_xpath(self, xpath, documento=None):
        """Return the texts of all elements matching *xpath* inside the document frame."""
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            elements = self.driver.find_elements_by_xpath(xpath)
            return [element.text for element in elements]
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def close(self, voltar=True):
        """Optionally restore the login-time unit, then shut the browser down."""
        if voltar:
            self.seleciona_area(self.__area_incial)
        self.driver.close()
        self.driver.quit()
def main():
    """Smoke-test the select and numeric-input widgets of a locally served
    web dashboard by driving an Edge browser through them.

    Relies on module-level ``args`` (argparse result with ``port`` and
    ``load_delay_time``) and drops into an IPython ``embed()`` shell on
    error and on completion.
    """
    try:
        print("ウェブドライバーを立ち上げています・・・")
        # CLI arguments parsed elsewhere in this file.
        port = str(args.port[0])
        load_delay_time = args.load_delay_time[0]
        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        driver.maximize_window()
        # Only 4-digit port numbers are accepted.
        if len(port) != 4:
            print("入力した番号は4桁ではないです。4桁のポート番号を記入してください。")
            quit()
        print("ページを開いています・・・")
        driver.get(f"http://127.0.0.1:{port}")
        print(f"ページの読み込みのため{str(load_delay_time)}秒待機します・・・")
        # Countdown while the page finishes loading.
        for i in range(load_delay_time, 0, -1):
            time.sleep(1)
            print(f"終わるまで{i}秒")
        print("Interactive Pythonコンソールを立ち上げています・・・")
        soup = BeautifulSoup(driver.page_source, features="lxml")
        # Define web elements to be tested as dictionary where element ids are the keys.
        test_element_ids = {
            "dtFilter": {
                "tag": "select",
                "click_el_xpath": "/html/body/div/div[1]/div[2]/div/div/div[1]/div/div/div[1]/div[1]/div/div/div"
            },
            "maxAmount": {
                "tag": "input",
            },
            "maxSigma": {
                "tag": "input",
            },
            "pl": {
                "tag": "select",
                "click_el_xpath": "/html/body/div/div[1]/div[2]/div/div/div[1]/div/div/div[1]/div[5]/div/div/div"
            },
            "reason": {
                "tag": "select",
                "click_el_xpath": "/html/body/div/div[1]/div[2]/div/div/div[1]/div/div/div[1]/div[6]/div/div/div/div[1]"
            }
        }
        for test_el_id in test_element_ids:
            test_el = test_element_ids[test_el_id]
            if test_el["tag"] == "select":
                # Open the styled select widget, then enumerate its options
                # from the freshly re-parsed page source.
                el = driver.find_element_by_xpath(test_el["click_el_xpath"])
                el.click()
                soup = BeautifulSoup(driver.page_source, features="lxml")
                select_items = [
                    tag.text for tag in soup.find(
                        id=test_el_id).find_next_sibling().select("div.option")
                ]
                print(f"number of items in select box: {len(select_items)}")
                # Click through every option, reopening the widget each time.
                for select_item in select_items:
                    click_el = driver.find_element_by_css_selector(
                        f"[data-value='{select_item}']")
                    el.click()
                    click_el.click()
                    time.sleep(5)
            elif test_el["tag"] == "input":
                # Feed five random integers of varying magnitude into the field.
                # NOTE(review): 10000000 appears twice in the choices below —
                # possibly a typo for an additional magnitude step; verify.
                test_round = 1
                while test_round < 6:
                    test_input_number = int(random.random() * random.choice([
                        10, 100, 1000, 10000, 100000, 1000000, 10000000,
                        10000000, 100000000
                    ]))
                    el = driver.find_element_by_id(test_el_id)
                    el.clear()
                    el.click()
                    el.send_keys(test_input_number)
                    time.sleep(5)
                    test_round += 1
                # NOTE(review): reconstructed from a collapsed source — assuming
                # the field is reset to 0 once after the five rounds; verify
                # this nesting against the original file.
                el.clear()
                el.send_keys(0)
    except Exception as e:
        print(
            f"(EXCEPT) An error occurred: {str(e)} Attempting to enter debug mode at point of error."
        )
        embed()
    finally:
        print("プログラムが正常終了しました。ウェブドライバーを終了します。お疲れ様でした。")
        embed()
        driver.close()
class Settings:
    """Defines the prerequisites and starts the scraping process.

    The arguments represent the default values for each instance variable.
    Setting the user's LinkedIn credentials directly from the class variables
    is allowed; they are also accessible via an external module, or use the
    os module to read them from your system environment variables.

    Args:
        df_dir (str): Directory of the original file containing the dataframe.
        df_path (str): Relative path of the original dataframe file.
        df_file_name (str): Name for the dataframe output file.
        cookies_path (str): Path where your LinkedIn session cookies are stored.
        driver_path (str): Webdriver's path.

    Attributes:
        email (str): User's LinkedIn email or phone number.
        password (str): User's LinkedIn password.
        read_file (str): Existing file containing a pandas-readable dataframe.
    """

    # Injected from outside via start_scraper() or set directly on the class.
    email: Optional[str] = None
    password: Optional[str] = None
    read_file: Optional[str] = None
    driver_path: Optional[str] = None

    def __init__(self,
                 df_dir='src/data',
                 df_path='data/dataframe.csv',
                 df_file_name='new_dataframe.csv',
                 cookies_path='src/cookies/cookies.pkl',
                 driver_path='',
                 _driver=None,
                 _check=False):
        self.df_dir = df_dir
        # A class-level read_file overrides the default dataframe path.
        self.df_path = df_path if self.read_file is None else f'data/{self.read_file}'
        self.df_file_name = df_file_name
        self.cookies_path = cookies_path
        self.driver_path = driver_path
        self.driver = _driver
        self.check = _check
        if self.driver is None:
            # Selenium capabilities and other settings
            options = Options()
            # Options for microsoft edge (chromium)
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('log-level=3')
            edge_options.add_argument('lang=en')
            edge_options.add_argument('--start-maximized')
            # Main webdriver
            self.driver = Edge(executable_path=self.driver_path,
                               options=edge_options)
        self._check_connection()

    def _check_connection(self, tries=0, max_try=10, err=None):
        """Make initial network stability check.

        Retries up to ``max_try`` times; on success hands control to
        ``scraper_logic_handler``, otherwise closes the driver.
        """
        while not self.check:
            if tries == max_try:
                break
            try:
                requests.get('https://www.google.com/')
                self.driver.get('https://www.google.com/')
                # Wait for the page title to confirm the load completed.
                while 'Google' not in self.driver.title:
                    time.sleep(0.1)
                self.check = True
            except Exception as e:
                err = e
                tries += 1
        if not self.check:
            print(f'({__name__}) Tries: {max_try}', err)
            self.driver.close()
        else:
            self.scraper_logic_handler()

    def csv_to_df(self, row):
        """Convert bytes in memory buffer (i.e. csv file data) to a pandas dataframe.

        Args:
            row (bytes): Raw csv file contents.

        Returns:
            pandas.DataFrame: Parsed dataframe.
        """
        output = pd.read_csv(io.BytesIO(row))
        return output

    def edit_dataframe(self, row, df, scraped_info):
        """Edit the active dataframe with the scraped information.

        Fix: iterate with ``enumerate`` instead of ``scraped_info.index(info)``;
        ``list.index`` returns the FIRST occurrence, so duplicate scraped values
        previously wrote the wrong cell.

        Args:
            row: Dataframe row label to update.
            df (pandas.DataFrame): Dataframe being filled in.
            scraped_info (list): Values aligned with the header list below.

        Returns:
            pandas.DataFrame: The updated dataframe.
        """
        headers = [
            'URL', 'title', 'role', 'current company', 'location', 'website',
            'twitter', 'email', 'industry', 'company url', 'company size',
            'specialties'
        ]
        for info_index, (header, info) in enumerate(zip(headers, scraped_info)):
            # Position 4 ("location") is blanked when it contains digits,
            # which indicates a scrape artefact rather than a place name.
            if info_index == 4 and any(map(str.isdigit, str(info))):
                scraped_info[info_index] = 'None'
            df.at[row, header] = str(scraped_info[info_index])
        return df

    def define_search(self, row, df, site='site:linkedin.com', search=None):
        """Take a row from the dataframe and convert it to a search value.

        Fix: the original used a mutable default argument (``search=[]``),
        shared across calls; a ``None`` sentinel is used instead. A
        caller-supplied list is still cleared, preserving the old side effect.

        Returns:
            list: [name, company, role, 'site:... intitle:name'] search terms.
        """
        if search is None:
            search = []
        search.clear()
        search = df.iloc[row]
        name = search[:].values[0]
        # Company and role are split on whitespace into individual terms.
        search = (f'\t{search[:].values[1]}\t{search[:].values[2]}').split()
        search.insert(0, name)
        # Equivalent to the original insert(len(search) - 1 + 1, ...): append.
        search.append(site + f' intitle:{name}')
        return search

    def select_pandas_io(self, df_file):
        """Determine the df_file extension for which pandas I/O to use.

        Returns:
            pandas.DataFrame: Dataframe loaded through the matching reader.

        Raises:
            KeyError: when the extension has no registered reader.
        """
        pandas_io_dict = {'.csv': self.csv_to_df}
        _, ext = os.path.splitext(df_file)
        # Read the file as package data relative to the 'src' package.
        row = pkgutil.get_data('src', df_file)
        df = pandas_io_dict[ext](row)
        return df

    def repack_to_csv(self, df):
        """Write the dataframe to a file.

        Returns:
            bool: True on success, False when the write failed.
        """
        try:
            new_df_path = os.path.join(self.df_dir, self.df_file_name)
            _ = df.to_csv(new_df_path, index=False)
            return True
        except Exception as e:
            print(e)
            return False

    def scraper_logic_handler(self):
        """Main handler for the entire scraping process.

        Signs in, scrapes every row whose 'URL' cell is still 'None', writes
        the result back to csv and closes the driver.
        """
        username = self.email
        password = self.password
        cookies = self.cookies_path
        driver = self.driver
        ready = self.check
        scraped_info = None
        if ready:
            print('''\nGeneral tip: Do not minimize the webdriver while it is running. This will allow some elements to properly load.''')
            df = self.select_pandas_io(self.df_path)
            session = SignIn.sign_in(driver, username, password, cookies)
            # Block until the sign-in helper reports a session.
            while session is None:
                time.sleep(0.1)
            for row in df.index:
                # Only rows not yet scraped (URL placeholder) are processed.
                if df.at[row, 'URL'] == 'None':
                    search = self.define_search(row, df)
                    scraped_info = Scrape(driver, search).startstop()
                    df = self.edit_dataframe(row, df, scraped_info)
                else:
                    pass
            if self.repack_to_csv(df):
                driver.close()

    @classmethod
    def start_scraper(cls,
                      user_email=None,
                      user_password=None,
                      read_file='',
                      driver_path=''):
        """Validate inputs, configure the class and return a running instance.

        Returns:
            Settings | None: the instance, or None when validation fails.
        """
        if isinstance(user_email, str) and isinstance(user_password, str) and isinstance(
                read_file, str) and isinstance(driver_path, str):
            cls.email = user_email
            cls.password = user_password
            cls.read_file = read_file
            cls.driver_path = driver_path
            row = cls()
            return row
        else:
            print(
                f'({__name__}) Invalid DataType: {user_email, type(user_email)}, {user_password, type(user_password)}, {read_file, type(read_file)}, {driver_path, type(driver_path)}'
            )
else: idtweet = list(tweet) idtweet[2]=str(idtweet[2]) tweet_id = ''.join(idtweet) if tweet_id not in tweet_ids: tweet_ids.add(tweet_id) data.append(tweet) scroll_attempt = 0 while True: # check scroll position driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') sleep(2) curr_position = driver.execute_script("return window.pageYOffset;") if last_position == curr_position: scroll_attempt += 1 # end of scroll region if scroll_attempt >= 3: scrolling = False break else: sleep(2) # attempt another scroll else: last_position = curr_position break # close the web driver driver.close()
class ReservationEngine:
    """Automates booking Ikon Pass ski-day reservations through the web UI."""

    def __init__(self, email, password, headless=True):
        """Start an Edge webdriver session.

        Args:
            email: Ikon Pass account email.
            password: Ikon Pass account password.
            headless: run the browser without a visible window when True.
        """
        self.email = email
        self.password = password
        # State flags refreshed by find_date() and reserve().
        self.available = False
        self.booked = False
        self.reservations_left = False
        options = EdgeOptions()
        options.add_argument("--log-level=3")
        options.use_chromium = True
        if headless:
            options.add_argument("headless")
        self.driver = Edge(options=options)
        print("Starting web driver...")

    def remove_overlay(self):
        """Dismiss the cookie-consent overlay so it cannot intercept clicks."""
        #get rid of cc overlay
        buttons = self.driver.find_elements_by_css_selector("a.cc-btn")
        # Keep clicking while any consent button still has a nonzero height.
        while any(map(lambda x: x.size["height"] != 0, buttons)):
            for button in buttons:
                try:
                    button.click()
                except:
                    # Button may be stale or obscured; retry on the next pass.
                    pass
            buttons = self.driver.find_elements_by_css_selector("a.cc-btn")

    def login(self):
        """Sign in to the account and wait for the reservations page to load."""
        #login
        print("Logging in")
        self.driver.get(
            "https://account.ikonpass.com/en/login?redirect_uri=/en/myaccount/add-reservations/"
        )
        self.remove_overlay()
        email_box = self.driver.find_element_by_css_selector("input#email")
        email_box.send_keys(self.email)
        password_box = self.driver.find_element_by_css_selector(
            "input#sign-in-password")
        password_box.send_keys(self.password)
        submit = self.driver.find_element_by_css_selector("button.submit")
        submit.click()
        # The resort search box appearing signals a successful login.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input.react-autosuggest__input')))
        print("Logged in")

    def refresh(self):
        # Reload the current page (used to poll for newly released dates).
        self.driver.refresh()

    def find_date(self, date, resort):
        """Select a resort and date, updating the availability flags.

        Args:
            date: datetime.date/datetime of the desired ski day.
            resort: resort name typed into the autosuggest search box.
        """
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input.react-autosuggest__input')))
        self.remove_overlay()
        #select resort
        search = self.driver.find_element_by_css_selector(
            "input.react-autosuggest__input")
        search.send_keys(resort)
        button = self.driver.find_element_by_css_selector(
            "li#react-autowhatever-resort-picker-section-1-item-0")
        button.click()
        button = self.driver.find_element_by_xpath(
            "//span[contains(text(), 'Continue')]")
        button.click()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.DayPicker-wrapper')))
        self.remove_overlay()
        #select date
        datepicker = self.driver.find_element_by_css_selector(
            "div.DayPicker-wrapper")
        # Page forward month-by-month until the target month is displayed.
        month_selected = False
        while not month_selected:
            month_text = calendar.month_name[date.month]
            month = datepicker.find_elements_by_xpath(
                "//span[contains(text(), " + "'" + month_text + "')]")
            if len(month) > 0:
                month_selected = True
            else:
                button = datepicker.find_element_by_class_name(
                    "icon-chevron-right")
                button.click()
        day = datepicker.find_element_by_xpath("//div[@aria-label='" +
                                               date.strftime("%a %b %d %Y") +
                                               "']")
        day.click()
        # The day cell's CSS classes encode its state on the Ikon calendar.
        day_classes = day.get_attribute(name="class")
        self.available = "past" not in day_classes and "unavailable" not in day_classes
        self.booked = "confirmed" in day_classes
        div = self.driver.find_elements_by_xpath(
            "//div[contains(text(), 'Reservation Limit Reached')]")
        self.reservations_left = len(div) == 0
        print("Date Selected: " + date.strftime("%m/%d/%Y"))

    def reserve(self):
        """Confirm the reservation when the selected date allows it.

        Returns:
            bool: True when a booking exists (newly made or pre-existing).
        """
        #confirm reservation if available
        if self.available and not self.booked and self.reservations_left:
            self.remove_overlay()
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Save')]")
            button.click()
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Continue to Confirm')]")
            button.click()
            # Accept the terms checkbox before the final confirmation.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@type='checkbox']")))
            button = self.driver.find_element_by_xpath(
                "//input[@type='checkbox']")
            button.click()
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     "//span[contains(text(), 'Confirm Reservations')]")))
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Confirm Reservations')]")
            button.click()
            self.booked = True
            print("Booked")
        return self.booked

    def log_results(self, log_file_name):
        """Append a timestamped status line to ``log_file_name``."""
        #log
        with open(log_file_name, "a") as f:
            f.write(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
            f.write(": Available - %r, Booked - %r, Reservations Left- %r" %
                    (self.available, self.booked, self.reservations_left))
            f.write("\n")

    def close_driver(self):
        # Shut down the webdriver session.
        self.driver.close()
class CnblogsSpider(scrapy.Spider):
    """Scrapy spider that exports a cnblogs.com account's posts as Markdown.

    Walks the authenticated post-list API page by page, follows each post to
    its "MD" view and yields ``CnblogsPostItem`` objects carrying the title
    and raw Markdown content.
    """

    # spider name
    name = 'cnblogs'
    # allowed domains
    allowed_domains = ['i.cnblogs.com', 'www.cnblogs.com']
    # first list-API page; later pages are derived from the postsCount field
    start_urls = [
        'https://i.cnblogs.com/api/posts/list?p=1&cid=&tid=&t=1&cfg=0&search='
    ]
    page_size = 100
    # session cookie (alternative to the interactive browser login below)
    cn_blogs_cookie = {
        '.CNBlogsCookie': 'D1BE43FCE6861944C20286B08281F79D48AA2C47E3144A7E4E9429AE26B66C17071AFADFBDB7F45E7D85583CF6AA07CA0CCD6512B0DC01B7C5D5CB774D867B3E70A3FFC843EA90AF218C30B44D1979320533B0D6D9C4E6BBC5FBC337ED7E1663E832CC7A'
    }

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    # Create the browser driver when the spider opens.
    # Creating the driver in a middleware's __init__ suits projects with a
    # single spider; with several spider files, create the driver object in
    # each spider file instead.
    def spider_opened(self, spider):
        """Start an Edge webdriver and give the operator time to log in."""
        # Create as few driver objects as possible in scrapy:
        # 1. in the constructor; 2. in open_spider;
        # 3. never inside process_request().
        options = EdgeOptions()
        # Enabling the Chromium engine caused "chrome is not reachable" here
        # and required the binary to be named msedgedriver.exe instead of
        # MicrosoftWebDriver.exe, so it stays disabled.
        # options.use_chromium = True
        # Absolute path of the browser executable to use.
        options.binary_location = r"MicrosoftWebDriver.exe"
        # options.add_argument("--remote-debugging-port=59692")
        # Headless mode (needed on Linux hosts without a display):
        # options.headless = True
        # options.add_argument("--headless")
        # "--no-sandbox" lets the browser run under the root account:
        # options.add_argument('--no-sandbox')
        # options.add_argument('--disable-dev-shm-usage')
        # Recommended by the Chrome docs to work around a GPU bug:
        # options.add_argument("disable-gpu")
        # InPrivate mode:
        # options.add_argument("-inprivate")
        options.add_argument(
            "user-data-dir=C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data"
        )
        options.add_argument(
            "profile-directory=C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data\\Default"
        )
        # options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
        self.driver = Edge(options=options)
        # give time to login manually
        self.driver.get('https://i.cnblogs.com/posts?pageSize=100')
        time.sleep(30)
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        """Shut the webdriver down when the spider finishes."""
        self.driver.close()
        # Fix: the original logged 'Spider opened' from the close handler.
        spider.logger.info('Spider closed: %s' % spider.name)

    def start_requests(self):
        """Kick off the crawl from the list-API start URLs."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                # cookies=self.cn_blogs_cookie,
                callback=self.parse_list_api)

    def parse_list_api(self, response):
        """Read postsCount from the JSON API and request every list page."""
        post_list_json_obj = json.loads(response.xpath('//pre/text()').get())
        posts_count = math.ceil(post_list_json_obj['postsCount'] /
                                self.page_size)
        for i in range(posts_count):
            try:
                post_list_page_url = 'https://i.cnblogs.com/posts?pageSize=' \
                    + str(self.page_size) + '&page=' + str(i + 1)
                yield scrapy.Request(
                    post_list_page_url,
                    # cookies=self.cn_blogs_cookie,
                    callback=self.parse_list_page)
            except (IndexError, TypeError):
                continue

    def parse_list_page(self, response):
        """Follow every post link found on a list page."""
        post_list_items = response.xpath('//a[contains(@class,"entry")]')
        for postListItem in post_list_items:
            try:
                view_page_url = postListItem.xpath('@href').extract_first()
                if view_page_url:
                    # Protocol-relative links need an explicit scheme.
                    if view_page_url.index('//') == 0:
                        view_page_url = 'http:' + view_page_url
                    yield scrapy.Request(
                        view_page_url,
                        # cookies=self.cn_blogs_cookie,
                        callback=self.parse_view_page)
            except (IndexError, TypeError):
                continue

    def parse_view_page(self, response):
        """From a post's view page, follow its raw-Markdown ("MD") link."""
        md_page_url_node = response.xpath('//a[text()=\'MD\']')
        if md_page_url_node:
            md_page_url = md_page_url_node.xpath('@href').extract_first()
            post_title = response.xpath(
                '//a[@id="cb_post_title_url"]/span/text()').get()
            if md_page_url:
                yield scrapy.Request(
                    md_page_url,
                    callback=self.parse_md_page,
                    # cookies=self.cn_blogs_cookie,
                    meta={'item': post_title})

    def parse_md_page(self, response):
        """Yield a CnblogsPostItem from the raw-Markdown page."""
        post_title = response.meta['item']
        post_content = response.xpath('//pre').xpath('text()').extract_first()
        # Strip a single leading CRLF that the MD view prepends.
        if post_content[0:2] == '\r\n':
            post_content = post_content[2:]
        item = CnblogsPostItem()
        item['title'] = post_title
        item['content'] = post_content
        yield item