def get_page_childs(parent_url):
    virtual_browser = Grab()
    urls = []
    page = open_page(virtual_browser, parent_url)
    if page is False:
        return urls
    all_urls = page.select(SELECTOR)
    for url in all_urls:
        link = re.search(r'href=(\S+)', url.html())
        link = link.group(0).split('"')[1]
        if link.startswith('/'):
            link = initial_url + link
        urls.append({'link': link, 'parent': parent_url})
    return urls
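# Note: open_page() and SELECTOR above come from the surrounding project and are
# not defined in this snippet. A minimal sketch of what such a helper might look
# like, assuming it is expected to return the fetched document on success and
# False on any failure (get_page_status() further below relies on the same contract):
def open_page(virtual_browser, url):
    # Hypothetical helper: fetch the URL with the given Grab instance and
    # return the resulting document, or False if the request raised anything.
    try:
        return virtual_browser.go(url)
    except Exception:
        return False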
def test_task_url_and_grab_options(self):
    class TestSpider(Spider):
        def setup(self):
            self.done = False

        def task_page(self, grab, task):
            self.done = True

    bot = build_spider(TestSpider, )
    bot.setup_queue()
    g = Grab()
    g.setup(url=self.server.get_url())
    self.assertRaises(SpiderMisuseError, Task, 'page',
                      grab=g, url=self.server.get_url())
def main(tag):
    pageId = 0
    f = open(tag + '.txt', 'w')
    f.write(tag + ':\n')
    while True:
        g = Grab()
        g.setup(timeout=60, connect_timeout=60)
        pageId += 1
        g.go(p(tag, pageId))
        v1 = g.xpath_text('//title')
        v2 = unicode("Хабрахабр — страница не найдена (404)", 'utf-8')
        if v1 == v2:
            print 'Finished at page: ' + str(pageId) + '!'
            break
        for questionText in g.xpath_list('//a[@class="post_title"]'):
            f.write(smart_str(questionText.text_content().strip()) + '\n')
        print 'Page # ' + str(pageId) + ' parsed!'
def _do_fetch(self, title, artist):
    g = Grab()
    search_response = g.go("http://www.google.com/search?q=%s" % urllib.quote(
        "site:nashe.com.ua %s %s" % (artist, title))).body
    x1 = search_response.find("http://nashe.com.ua/song.htm?id=")
    if x1 == -1:
        return []
    x2 = search_response.find(r'"', x1)
    # unescape the HTML-encoded ampersands in the extracted URL
    link = search_response[x1:x2].replace("&amp;", "&")
    response = g.go(link).body
    x1 = response.find("'song2'>") + 8
    x2 = response.find("</div>", x1)
    lyrics = html2text.html2text(response[x1:x2].decode("cp1251")).replace(
        "\n\n", '\n')
    sr = [lyrics]
    sr = map(lambda x: u"%s \nSource: nashe.com.ua" % x, sr)
    return sr
def test_domria():
    ua = UserAgent()
    grab = Grab(timeout=30, connect_timeout=10,
                log_file='%s/vparser/tmp/pars/log.html' % os.path.split(PROJECT_PATH)[0])
    grab.setup(proxy='46.148.30.216:8080', proxy_type='http',
               proxy_userpwd=CREDENTIALS_box)  # , log_dir='vparser/tmp'
    # grab.go('http://kiev.ko.slando.ua/obyavlenie/predlagaetsya-v-arendu-posutochno-v-kieve-kvartira-odnokomnatnaya-po-ulits-ID75E19.html#a025724d26')
    grab.go(
        'http://dom.ria.com/ru/realty_prodaja_dom_harkov_olhovka_stepnaya_ulitsa-8253714.html'
    )
    # grab.go('http://kharkov.kha.slando.ua/nedvizhimost/arenda-kvartir/')
    print grab.doc.select(
        '//div[@class="item-param"]/strong[@class="phone"]').text()
def task_initial(self, grab, task):
    selector = '//div[@class="entry unvoted"]/ul/li[@class="first"]/a[contains(@class,"comments")]'
    for post in grab.doc.select(selector):
        post_link = grab.make_url_absolute(post.attr("href"))
        grab_custom = Grab()
        grab_custom.setup(
            user_agent="User-agent:Linux:Subreddits-Scraper:1.0 by /u/kadze_yukii",
            url=post_link)
        self.add_task(Task('post', grab=grab_custom))
    try:
        next_page = grab.make_url_absolute(
            grab.doc.select('//a[@rel="nofollow next"]').attr("href"))
        self.add_task(Task('initial', url=next_page))
    except:
        pass
def task_generator(self):
    grab = Grab()
    grab.load_proxylist(PROXY_PATH, 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    for link in VOCABULARY:
        url = link['url']
        pages = xrange(1, link['pages'])
        cat = link['cat']
        for page in pages:
            grab.change_proxy()
            grab.setup(
                url=url % page,
                proxy_userpwd=CREDENTIALS,
                hammer_mode=True,
                hammer_timeouts=((2, 5), (10, 15), (20, 30)),
                reuse_cookies=True
            )
            yield Task('link_on_page', grab=grab, cat=cat)
def set_redirect_for_mobile_devices(self, request):
    # Detect a mobile device here and, if necessary, switch the domain to the mobile one
    headers = dict()
    for m in request.META:
        if m.startswith('HTTP_') and not m == 'HTTP_HOST' and not m == 'HTTP_CONNECTION':
            headers[m.replace('HTTP_', '').lower().replace('_', '-')] = request.META[m]
    get_params = urllib.urlencode(headers)
    g = Grab()
    g.go('http://phd.yandex.net/detect/?%s' % get_params)
    if g.doc.select('//yandex-mobile-info').exists():
        if not self.redirect_to:
            self.redirect_to = 'm.%s/' % request.get_host()
        else:
            self.redirect_to = 'm.%s' % self.redirect_to
def _do_fetch(self, title, artist):
    g = Grab()
    search_response = g.go("http://www.google.com/search?q=%s" % urllib.quote(
        "site:textypesen.com.ua %s %s" % (artist, title))).body
    x1 = search_response.find("http://textypesen.com.ua/")
    if x1 == -1:
        return []
    x2 = search_response.find(r'"', x1)
    # unescape the HTML-encoded ampersands in the extracted URL
    link = search_response[x1:x2].replace("&amp;", "&")
    response = g.go(link).body
    x1 = response.find(">", response.find("align=right")) + 1
    x2 = response.find("<table", x1)
    lyrics = html2text.html2text(response[x1:x2].decode("utf8")).replace(
        "\n\n", '\n')
    sr = [lyrics]
    sr = map(lambda x: u"%s \nSource: textypesen.com.ua" % x, sr)
    return sr
def getProductsImages(url):
    from grab import Grab
    g = Grab(log_file='productsImages.html')
    g.go(url)
    products = g.doc.select('//div/div/div/div/div[@class="catalog w"]/div[@class="items fix"]/div[@class="item"]/h3/a')
    products_href = []
    products_images = []
    for elem in products:
        href = elem.attr('href')
        href = 'http://moscross.ru/' + href
        products_href.append(href)
    for elem in products_href:
        products_images.append(getProductImages(elem))
    return products_images
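# Note: getProductImages() (singular) is called above but not defined in this
# snippet. A hedged sketch of what it might do, assuming it opens one product
# page and collects image URLs; the XPath below is an illustrative guess, not
# taken from the original site markup:
def getProductImages(url):
    from grab import Grab
    g = Grab()
    g.go(url)
    # Collect the src attribute of every image found on the product page
    return [img.attr('src') for img in g.doc.select('//img[@src]')]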
def task_generator(self):
    logging.debug("*****execute******")
    with open('directories.csv', 'rb') as f:
        content = csv.reader(f)
        directories = list(content)
    # directories = ['google']
    total = len(directories)
    logging.debug("*****{}******".format(total))
    i = 100
    total = 102
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }
    test_url = 'https://www.google.com'
    while True:
        logging.debug("Index: {}".format(i))
        if i >= total:
            break
        g = Grab()
        g.clear_cookies()
        g.setup(**config)
        g.setup(headers=headers)
        logging.debug("CONFIG : {}".format(g.config))
        data = dict(slug=directories[i][0], )
        logging.info(data)
        while True:
            try:
                print "------------------------"
                g.go(test_url)
                print g.doc.body
                print "++++++++++++++++++++++++"
                break
            except Exception as e:
                print "************************"
                logging.debug(e)
                time.sleep(1)
        yield Task('init', grab=g, data=data)
        time.sleep(5)
        i += 1
def check_in_circle(self, url, token_id, loyalty_id):
    in_circle = False
    self.refresh_token(token_id)
    soc_token = SocToken.query.get(token_id)
    username = self.parse_username(url)
    request_url = "%s%s%s?key=%s" % (
        self.API_PATH, self.API_PARTS['people_list'],
        username, SocConfig.GOOGLE_KEY)
    taget_user = request_helper.make_request(request_url, True)
    if 'id' in taget_user and taget_user['id']:
        user_id = taget_user['id']
        g = Grab()
        g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})
        url_api = self.API_PATH + self.API_PARTS['peoples']
        while not in_circle:
            g.go(url_api)
            circle = json.loads(g.response.body)
            if 'items' not in circle:
                break
            if len(circle['items']) <= 0:
                break
            for friend in circle['items']:
                if 'id' in friend and friend['id'] == user_id:
                    in_circle = True
            if 'nextPageToken' not in circle:
                break
            if len(circle['nextPageToken']) <= 0:
                break
            url_api = "%s%s&pageToken=%s" % (
                self.API_PATH, self.API_PARTS['peoples'],
                circle['nextPageToken'])
    return in_circle
def download_fb2(links, dir, limit):
    for link in links:
        g = Grab()
        g.go(link)
        genre = g.doc.select('//*[@id="main"]/h1').text()
        # try:
        #     os.makedirs(genre)
        # except OSError:
        #     pass
        book_links = g.doc.select('//*[@id="main"]/form/ol/a')
        i = 0  # add limit counter
        for book_link in book_links:
            if i == limit:
                break
            link = 'http://flibusta.net%s/fb2' % book_link.attr('href')
            name = book_link.text()
            book_url = urlopen(link)
            book = book_url.read()
            try:
                book.decode('utf-8')
                print('Book %s is blocked' % name)
                continue
            except UnicodeDecodeError:
                pass
            completeName = os.path.join("{0}\\{1}\\{2}.zip".format(dir, genre, name))
            # if timeout error, do it again
            while True:
                try:
                    f = open(completeName, 'wb')
                    f.write(book)
                    f.close()
                    i += 1
                    print('downloaded: ' + name)
                except requests.exceptions.RequestException as e:
                    # This is the correct syntax
                    print(e)
                    continue
                break
        else:
            print('Finished %s' % genre)
def get_html(URL):
    # fetch the page
    g = Grab(
        url=URL,
        user_agent="Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
        timeout=8)
    try:
        response = g.request()
        time.sleep(2)
        # response = urllib.request.urlopen(URL, timeout=8)
        logging.info('got some kind of response')
        # return response.read()
        return response.unicode_body()
    except:
        logging.info('The server did not respond within 8 seconds, trying again!')
        return get_html(URL)
def get_html(URL):
    # fetch the page
    g = Grab(
        url=URL,
        user_agent="Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
        timeout=8)
    try:
        response = g.request()
        # response = urllib.request.urlopen(URL, timeout=8)
        logging.info('requested the page with the list of listings')
        # return response.read()
        time.sleep(2)
        return response.unicode_body()
    except:
        logging.warning('the server did not respond in time for the listings page, trying again!')
        return get_html(URL)
def scraiping_add():
    # Find the link to the photo
    pointer = Grab()
    pointer.setup(timeout=25, connect_timeout=25)
    pointer.go('http://www.photosight.ru/photos/6599649/?from=best')
    response_url_foto = pointer.doc.select('//img[@id = "big_photo"]/@src').text()
    response_text_foto = pointer.doc.select('//img[@id = "big_photo"]/@alt').text()
    # Download the photo to the PC
    urllib.request.urlretrieve(response_url_foto, 'image.png')
    # Upload the photo to the server
    pointer.go('https://ourfoto.herokuapp.com/add_foto/')
    pointer.doc.set_input('image', UploadFile('image.png'))
    pointer.doc.set_input('text', response_text_foto)
    pointer.doc.submit()
    # Delete the local copy of the photo
    os.remove('image.png')
def __init__(self, key, filename='', auto_run=True, grab_config=None,
             send_config=None, domain='antigate.com'):
    self.g = Grab()
    if grab_config:
        self.g.setup(**grab_config)
    self.key = key
    self.captcha_id = None
    self.captcha_key = None
    self.send_config = send_config
    self.domain = domain
    self.logger = getLogger(__name__)
    if auto_run and filename:
        self.run(filename)
def make_new_link(url):
    new_grab = Grab()
    new_link = ''
    if 'linkedin.com/in/' in url:
        return url
    if 'linkedin.com/profile/' in url:
        return url
    else:
        try:
            new_grab.go(url)
            head = str(new_grab.response.head).split('\\r\\')
            for x in head:
                if 'https' in x:
                    new_link = x.replace('https://ua.', 'https://www.').replace(" ", "")\
                        .replace('nLocation:', '').strip(" ")\
                        .replace('https://pl.', 'https://www.')
        except:
            new_link = url
        return new_link
def get_categories():
    grab = Grab()
    grab.setup(url='http://www.free-lance.ru/freelancers/')
    print u'Requesting the page'
    grab.request()
    print u'Extracting categories'
    categories = grab.xpath_list('//ul[@id="accordion"]/li[not(@class)]')
    for category in categories:
        subcategories = category.xpath('./ul[@class="element"]/li/span/a')
        subcategories = map(lambda a: a.text_content().encode('utf-8'),
                            subcategories)
        yield (category.xpath('./a')[0].text_content().encode('utf-8'),
               subcategories)
    print u'Done'
def test_setup_proxylist(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in self.extra_servers.values())
        with open(proxy_file, 'w') as out:
            out.write(content)
        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        bot.add_task(
            Task('baz', grab=Grab(url='http://yandex.ru', debug=True)))
        bot.run()
        serv = [
            x['server'] for x in self.extra_servers.values()
            if x['server'].request['done']
        ][0]
        self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
def test_task_url_and_grab_options(self):
    class TestSpider(Spider):
        def setup(self):
            # pylint: disable=attribute-defined-outside-init
            self.done = False

        def task_page(self, dummy_grab, dummy_task):
            # pylint: disable=attribute-defined-outside-init
            self.done = True

    bot = build_spider(TestSpider, )
    bot.setup_queue()
    grab = Grab()
    grab.setup(url=self.server.get_url())
    self.assertRaises(SpiderMisuseError, Task, 'page',
                      grab=grab, url=self.server.get_url())
def task_initial(self, grab, task):
    # Append '\n' to every <br> tag so line breaks survive text extraction
    raw_br_list = grab.xpath_list('//br')
    for item in raw_br_list:
        item.text = "\n"
    raw_text = grab.xpath_text('//*')
    ip_port_list = re.findall('[0-9]+[.][0-9]+[.][0-9]+[.][0-9]+[:][0-9]+',
                              raw_text)
    for ip in ip_port_list:
        grab = Grab()
        grab.setup(url='http://www.icanhazip.com')
        grab.setup(proxy=ip, proxy_type='http', connect_timeout=10, timeout=15)
        info = {'server': ip, 'type': 'http'}
        yield Task('proxy_check', grab=grab, info=info)
def handle(self, *args, **options):
    g = Grab(log_dir="/tmp")
    site = "http://20k.com.ua"
    results = ''

    # class FirmHotline(models.Model):
    # class ScanHotline(models.Model):
    # class OneHotline(models.Model):
    # class ConcurentHotline(models.Model):
    def make_url(url):
        items = url.split('/')
        del items[-2]
        nurl = '/'.join(items)
        g.go(site + nurl)
        if g.response.code == 200:
            return nurl
        else:
            return make_url(nurl)
        # assert False, nurl

    f = open(PROJECT_ROOT + '/static/error.csv', 'r')
    # assert False, f.split('/n')
    for line in f:
        url = line.split(',')[0]
        g.go(url)
        if g.response.code == 404:
            if '.jpg' not in url:
                l = url.split('http://20k.com.ua')[1]
                sl = l.split('?')[0]
                if 'index.php' in l:
                    n = '/'
                else:
                    n = make_url(sl)
                results += l + ',' + n + '\n'
                self.stdout.write(l + ',' + n)
    f = open(PROJECT_ROOT + '/static/results.csv', 'w')
    f.write(results)
    f.close()
def get_page_status(page):
    url = page['link']
    for bad_url in exclude_urls:
        if bad_url in url:
            return False
    virtual_browser = Grab()
    check = open_page(virtual_browser, url)
    if check is not False and "200 OK" not in check.status:
        is_visible = check_with_selenium(unicode(page.get('parent')),
                                         unicode(url))
        if is_visible:
            write_result(
                unicode("{0} {1} parent page: {2}").format(
                    unicode(check.status), unicode(url),
                    unicode(page.get('parent'))))
        return False
    return True
def __init__(self, master=None):
    tk.Frame.__init__(self, master)
    self.grid()
    self.g = Grab()
    logging.basicConfig(level=logging.DEBUG)
    if self.get_api_key():
        self.api_key = self.get_api_key()
    else:
        self.api_key = 'No key'
    self.createWidgets()
    if not os.path.isfile('weatherapp.db'):
        self.db_connect()
    self.country_list = []
    self.country_selected = ''
    self.chosen_city = ''
    self.chosen_city_id = ''
    self.temp = 0
    self.date = date.today().isoformat()
    self.weather_id = ''
def test_charset_html5(self):
    grab = Grab()
    grab.setup_document(b"<meta charset='windows-1251'>")
    self.assertEqual('windows-1251', grab.doc.charset)
    grab.setup_document(b'<meta charset="windows-1252">')
    self.assertEqual('windows-1252', grab.doc.charset)
    grab.setup_document(b'<meta charset=latin-1>')
    self.assertEqual('latin-1', grab.doc.charset)
    grab.setup_document(b"<meta charset = 'windows-1251' >")
    self.assertEqual('windows-1251', grab.doc.charset)
    grab.setup_document(b'<meta charset = "windows-1252" >')
    self.assertEqual('windows-1252', grab.doc.charset)
    grab.setup_document(b'<meta charset = latin-1 >')
    self.assertEqual('latin-1', grab.doc.charset)
def check_plus(self, url, token_id, loyalty_id):
    plused = False
    self.refresh_token(token_id)
    action = PaymentLoyalty.query.get(loyalty_id)
    target = json.loads(action.data)
    soc_token = SocToken.query.get(token_id)
    g = Grab()
    # g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})
    url_api = "%s%s%s%s&key=%s" % (
        self.API_PATH, self.API_PARTS['activities'], target['id'],
        self.API_PARTS['plusoners'], SocConfig.GOOGLE_KEY)
    while not plused:
        g.go(url_api)
        plusoners = json.loads(g.response.body)
        if 'items' not in plusoners:
            break
        if len(plusoners['items']) <= 0:
            break
        for person in plusoners['items']:
            if 'id' in person and person['id'] == soc_token.soc_id:
                plused = True
        # stop when there is no next page to fetch
        if 'nextPageToken' not in plusoners:
            break
        if len(plusoners['nextPageToken']) <= 0:
            break
        url_api = "%s%s%s%s&pageToken=%s&key=%s" % (
            self.API_PATH, self.API_PARTS['activities'], target['id'],
            self.API_PARTS['plusoners'], plusoners['nextPageToken'],
            SocConfig.GOOGLE_KEY)
    return plused
def handler(self, collection, obj, set_field, base_dir, task_args=None,
            grab_args=None, callback=None):
    from database import db
    for image in obj.get(set_field, []):
        path = hashed_path(image['url'], base_dir=base_dir)
        if os.path.exists(path):
            if path != image['path']:
                db[collection].update(
                    {
                        '_id': obj['_id'],
                        ('%s.url' % set_field): image['url']
                    },
                    {'$set': {('%s.$.path' % set_field): path}})
        else:
            kwargs = {}
            if task_args:
                kwargs = deepcopy(task_args)
            g = Grab()
            g.setup(url=image['url'])
            if grab_args:
                g.setup(**grab_args)
            g.setup(referer=build_image_hosting_referer(image['url']))
            yield Task(callback=callback or image_set_handler,
                       grab=g,
                       collection=collection,
                       path=path,
                       obj=obj,
                       image=image,
                       set_field=set_field,
                       disable_cache=True,
                       backup=g.dump_config(),
                       **kwargs)
def task_jsonresponse(self, grab, task):
    try:
        response = json.loads(
            str(grab.doc.body).replace("b'", "").replace("'", "").replace('\\', ''))
    except:
        response = None
        self.add_task(
            Task('jsonresponse', url=task.url, delay=1, region=task.region))
        print('----- Response does not look like JSON -----', task.url)
    if response:
        if task.url == self.starturl:
            response = response[0]['children']
        for resp in response:
            temp_id = resp['id']
            temp_intid = resp['a_attr']['intid']
            temp_levelid = resp['a_attr']['levelid']
            if temp_levelid == '8':
                g = Grab(url=self.urlpattern_page + temp_intid + '?do=result',
                         document_charset='windows-1251')
                yield Task('pageresponse', grab=g, urlid=temp_id,
                           region=task.region)
            elif temp_levelid == '11':
                pass
            else:
                yield Task('jsonresponse', url=self.urlpattern_json + temp_id,
                           region=task.region)
                print(self.counter, 'Queued link for processing', temp_id,
                      'level', temp_levelid)
                self.counter += 1
    else:
        print('----- The JSON appears to be empty -----', task.url)
def scan(starting_url):
    g = Grab()
    urls_queue = collections.deque()
    urls_queue.append(starting_url)
    found_urls = set()
    found_urls.add(starting_url)
    visited_urls = set()
    # cn = 1
    while len(urls_queue):
        url = urls_queue.popleft()
        try:
            g.go(url)
            if (g.response.code < 400
                    and g.response.headers['Content-Type'].find('text/html') != -1
                    and g.response.url.startswith(starting_url)):
                # print(str(cn) + '. ' + url)
                # cn += 1
                print(url)
                links = g.doc.select('//a[@href]')
                for link in links:
                    href = prep(link.attr('href'))
                    if href == False:
                        continue
                    if href.startswith('http') == False:
                        href = starting_url + '/' + href
                    elif (href.startswith('http')
                          and href.startswith(starting_url) == False):
                        continue
                    if href not in found_urls:
                        found_urls.add(href)
                    else:
                        continue
                    if url not in visited_urls:
                        urls_queue.append(href)
                        visited_urls.add(url)
                    elif url in found_urls:
                        found_urls.remove(url)
        except:
            pass
        time.sleep(0.2)
    return found_urls
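# Note: prep() is used above but not defined in this snippet. A minimal sketch,
# assuming it is meant to normalize an href and return False for links that
# should not be crawled (empty values, anchors, javascript:, mailto:):
def prep(href):
    # Hypothetical normalizer: reject non-crawlable links and strip fragments.
    href = href.strip()
    if not href or href.startswith(('#', 'javascript:', 'mailto:')):
        return False
    return href.split('#')[0]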