class Pep8(object):
    starting = 'http://pep8online.com'
    form_action = 'http://pep8online.com/checkresult'

    def __init__(self):
        self.g = Grab()
        self.redis = StrictRedis(**REDIS_CFG)

    def pep8(self, tested_file='file.py', ignore_codes=None, ignore_type=''):
        return self.get_and_filter(self.query_file(tested_file),
                                   ignore_codes, ignore_type)

    def pep8s(self, tested_string, ignore_codes=None, ignore_type=''):
        if ignore_codes is None:
            ignore_codes = []
        return self.get_and_filter(self.query_string(tested_string),
                                   ignore_codes, ignore_type)

    def get_and_filter(self, result, ignore_codes=None, ignore_type=''):
        if ignore_codes is None:
            # in fact it is never modified, but to be sure
            ignore_codes = []
        return filter(
            lambda problem: problem[0][:1] not in ignore_type
            and problem[0] not in ignore_codes,
            result)

    def query_file(self, tested_file):
        with open(tested_file) as f:
            content = f.read()
        return self.query_string(content)

    def query_string(self, tested_string):
        redis_key = md5.md5(tested_string).hexdigest()
        msg = 'Served from '
        starting = time.time()
        redis_value = self.redis.get(redis_key)
        if redis_value:
            msg += 'redis'
            problems = ujson.loads(redis_value)
        else:
            msg += 'pep8online.com'
            self.g.go(self.starting)  # except grab.error.GrabTimeoutError
            gff = self.g.doc.form_fields()
            gff['code'] = tested_string
            self.g.go(self.form_action, post=gff)
            problems = self.parse_response()
            self.redis.set(redis_key, ujson.dumps(problems))
        log_it = self.redis.get('config_log')
        if log_it is None or log_it == '1':
            msg += ', %.3f seconds.' % (time.time() - starting)
            print(msg)
        return problems

    def parse_response(self):
        problems = []
        for problem in self.g.doc.tree.cssselect('.tr-result'):
            tds = problem.findall('td')
            problems.append(
                (tds[0].text_content().strip(),
                 tds[1].text.strip(),
                 tds[2].text.strip(),
                 tds[3].text.strip()))
        return problems
def test_useragent_simple(self):
    g = Grab(transport=GRAB_TRANSPORT)
    # Simple case: setup user agent manually
    g.setup(user_agent='foo')
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers']['user-agent'], 'foo')
def test_cookiefile(self):
    g = Grab(transport=GRAB_TRANSPORT)
    # Empty file should not raise Exception
    open(TMP_FILE, 'w').write('')
    g.setup(cookiefile=TMP_FILE)
    g.go(SERVER.BASE_URL)

    cookies = {'spam': 'ham'}
    json.dump(cookies, open(TMP_FILE, 'w'))

    # One cookie is sent in the server response
    # Another cookie is passed via the `cookiefile` option
    SERVER.RESPONSE['cookies'] = {'godzilla': 'monkey'}
    g.setup(cookiefile=TMP_FILE)
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'spam=ham')

    # This is the correct result of combining the two cookies
    MERGED_COOKIES = {'godzilla': 'monkey', 'spam': 'ham'}

    # g.config should contain the merged cookies
    self.assertEqual(set(MERGED_COOKIES.items()),
                     set(g.config['cookies'].items()))

    # The `cookiefile` file should contain the merged cookies
    self.assertEqual(set(MERGED_COOKIES.items()),
                     set(json.load(open(TMP_FILE)).items()))
def get_source_page(self, search_text):
    """Get the search results page for the given query."""
    grab = Grab()
    grab.go(u"https://ya.ru/")
    grab.doc.set_input(u"text", search_text)
    grab.doc.submit()
    return grab.response.body
def get_data(url):
    '''
    Get data (prices and offer hrefs) from Yandex Realty
    for the given client parameters.
    '''
    # print(url)
    price_list = []
    href_list = []
    g = Grab()
    g.go(url)
    # search for the html class that holds the price
    data_list = g.xpath_list('//*[@class="serp-item__price"]')
    total = 0
    for p in data_list:
        price = price_format(p.text_content())
        total += price
        price_list.append(price)
    # search for the html class that holds the href
    data_list = g.xpath_list(
        '//*[@class="link link_redir_yes stat__click i-bem"]')
    for h in data_list:
        href_list.append(h.get('href'))
    if len(price_list) != 0:
        aver_price = total / len(price_list)
        return aver_price, href_list
    else:
        return 0, []
def month_trips():
    # todo: how/where to get this cookie? need to build good userSearchConfiguration row
    from grab import Grab
    g = Grab()
    url = 'https://booking.pobeda.aero/AjaxMonthLowFareAvailaibility.aspx'
    headers = {
        'Cookie': 'ASP.NET_SessionId=21ct5f2osvv5y2bv1ixw3n1f; \
CultureCode=%7b%22Value%22%3a%22ru-RU%22%7d; \
skysales=!Nnf0/LJnRyPnqRB26j6ok5cv2bHz0cdeINHGmEqHIJBCFRJfwAK5n253Fan5AJd7uagOX1WR3QCitE8=; \
PassengersInfoCookie=%7b%22Value%22%3a%22%22%7d; \
dtSa=-; \
userSearchConfiguration=%7B%22From%22%3A%22VKO%22%2C%22InboundDate%22%3A%222017-12-11%22%2C%22To%22%3A%22SGC%22%2C%22OutboundDate%22%3A%222017-12-08%22%2C%22MinADT%22%3A0%2C%22MinCHD%22%3A0%2C%22MinINFT%22%3A0%2C%22SelectedADT%22%3A%221%22%2C%22SelectedCHD%22%3A%220%22%2C%22SelectedINFT%22%3A%220%22%2C%22MaxPax%22%3A0%2C%22TripType%22%3A%22RoundTrip%22%2C%22LinkBooking%22%3Anull%2C%22MinDepartureDate%22%3Anull%2C%22MaxDepartureDate%22%3Anull%2C%22MinArrivalDate%22%3Anull%2C%22MaxArrivalDate%22%3Anull%2C%22Culture%22%3A%22ru%22%2C%22CurrencyCode%22%3A%22RUB%22%2C%22Success%22%3Atrue%2C%22AnyFieldWithData%22%3Afalse%7D; \
dtPC=3$496961232_348h-vCRDIHUPXBDJPBHJGJILFKIAEOODOHLHSOQ; \
dtCookie=3$2B3EAF72CD66D7CD233F16EA2C693C8A|RUM+Default+Application|1; \
rxVisitor=1511890404724BSGLKFH5U7NNCUC0MQG38QMSEJ0S3PV1; \
rxvt=1512498768913|1512496961238; \
dtLatC=2; \
sessionControl=%7B%22ownership%22%3A%7B%22sessionOwnerId%22%3A%22213d9100-e07d-85d5-8342-7bab90906705%22%2C%22sessionOwnerPage%22%3A%22https%3A%2F%2Fbooking.pobeda.aero%2FScheduleSelect.aspx%22%2C%22lastUpdated%22%3A1512497216699%7D%7D',
        'Origin': 'https://booking.pobeda.aero',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Referer': 'https://booking.pobeda.aero/ScheduleSelect.aspx',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive'
    }
    data = 'indexTrip=1&dateSelected=2017-12-08'
    g.go(url, headers=headers)  # , get=data)
    g.doc.cookies.items()
def f(self):
    g = Grab(log_file='out.log')
    g.go(BASE_URL)
    c = CCmbParser()
    c.soup = g.doc.body
    g.doc.set_input('search_type', 'ADDRESS')
    c.soup = g.doc.body
    a = c.get_subjects()
    self.fillComboBox(self.cmbSubject, a)
    g.doc.set_input('subject_id', '130000000000')
    g.doc.set_input('region_id', '145286000000')
    g.doc.set_input('settlement_id', '145298578000')
    # g.doc.set_input('subject_id', '130000000000')
    # g.doc.set_input('124000000000', 'checked="true"')
    c.soup = g.doc.body
    b = c.get_regions()
    c1 = c.get_street_type()
    d = c.get_city_types()
    e = c.get_cities()
    # Search for the word "Новости" (News) on the page
    # print(u"Does this page contain the word \"Новости\"? %s" % u'Да' if g.doc.text_search(u'Новости') else u'Нет')
    # print the page title
    print(u"Page title: '%s'" % g.doc.select('//title').text())
    g.doc.set_input('search_type', 'ADDRESS')
    f = g.doc.submit()
    print 'zi'
    pass
def download():
    url = input('url:')
    print('ddd')
    pool = ThreadPool(4)
    main_page = Grab()
    main_page.go(url)
    title = main_page.doc('//*[@id="novel_color"]/p').text()
    if os.path.isdir('temp'):
        shutil.rmtree('temp')
    os.mkdir('temp')
    path = 'temp/'
    main_page.doc.save(path + 'main.html')
    urls_xpath = main_page.doc('//*[@id="novel_color"]/div/dl/dd/a')
    i = 1
    urls = []
    dict = {}
    for url in urls_xpath:
        urls.append('http://novel18.syosetu.com' + url.attr('href'))
        key = url.attr('href').split('/')[2]
        dict[key] = url.text().replace('/', ' ')
    print(dict)

    def pages(page_url):
        num = page_url.split('/')[4]
        f = urllib.request.urlopen(page_url)
        data = f.read()
        with open(path + '%04u.%s.html' % (int(num), dict[num]), "wb") as code:
            code.write(data)
        print(num, dict[num])

    pool.map(pages, urls)
def match_history(first_team, second_team, n_last_match=5):
    """Return a table of the last n_last_match matches played
    between first_team and second_team.
    """
    URI = "http://www.sportzone.ru/sport/search.html?com1={0}&com2={1}&sport=1&rows={2}&search=%CF%EE%E8%F1%EA".format(
        urllib.quote_plus(first_team.encode('cp1251')),
        urllib.quote_plus(second_team.encode('cp1251')),
        n_last_match)
    g = Grab(log_file="/tmp/graber.log")
    g.go(URI)
    if g.search(u'Ранее не встречались'):
        return []
    else:
        try:
            all_table = g.xpath('/html/body/table[4]/tr/td[2]/table')
        except IndexError:
            return []
        answer = []
        for y in all_table:
            a_temp = []
            for x in y:
                if x.text is None:
                    temp = x.xpath('a')
                    if len(temp) > 0 and temp[0].text is not None:
                        a_temp.append(temp[0].text)
                else:
                    a_temp.append(x.text)
            answer.append(a_temp)
        return answer
def parse_find(html):
    g = Grab()
    g.setup(timeout=30, connect_timeout=20)
    g.go(html)
    soup = BeautifulSoup((g.response.body), 'lxml')
    information = soup.find_all('div', {'class': 'externalRating-item'})
    reload_name = []
    for i in list(information):
        reload_name.append((str(i.text)))
    list_reit = str(reload_name).split(',')
    reit = {}
    reit['IMDb'] = list_reit[1][1:5]
    if reit['IMDb'] == ("']"):
        # no rating found on the page
        reit['IMDb'] = ' Отсутствует'
    name = str(soup.find('h1', {'class': 'view__title'}).text).replace('\n', '').strip().replace('\xa0', ' ')
    actors = str(soup.find_all('ul', {'class': 'items'})[0].text).replace(' \n\n', ', ')
    producers = str(soup.find_all('ul', {'class': 'items'})[1].text).replace('\n\n', '')
    year = soup.find_all('div', {'class': 'infoi__content'})[1].text
    year = str(year).replace('\n', '').replace(' ', '')
    genre = soup.find_all('div', {'class': 'infoi__content'})[3].text
    genre = str(genre).replace(' ', '').replace('\n', ' ')
    inf_ab_film = {}
    inf_ab_film['name'] = name
    inf_ab_film['genre'] = genre
    inf_ab_film['age'] = year
    act = str(producers) + str(actors).replace('\n', '')
    inf_ab_film['actors'] = act
    inf_ab_film['reit'] = str(reit['IMDb'])[1:]
    return inf_ab_film
def loadPage(url, i):
    g = Grab()
    g.setup(log_dir='grab')
    g.setup(timeout=150, connect_timeout=100)
    # g.setup(proxy='186.170.31.134:8080', proxy_type='http')
    # qs = urlencode({'samo_action':'PRICES',
    #                 'TOWNFROMINC':'101',
    #                 'STATEINC':'33',
    #                 'TOURTYPE':'0',
    #                 'TOURINC':'963',
    #                 'CHECKIN_BEG':'20160721',
    #                 'NIGHTS_FROM':'2',
    #                 'CHECKIN_END':'20160722',
    #                 'NIGHTS_TILL':'10',
    #                 'ADULT':'2',
    #                 'CURRENCY':'1',
    #                 'CHILD':'0',
    #                 'TOWNTO_ANY':'1',
    #                 'TOWNTO':'',
    #                 'STARS_ANY':'1',
    #                 'STARS':'',
    #                 'hotelsearch':'0',
    #                 'HOTELS_ANY':'1',
    #                 'HOTELS':'',
    #                 'MEAL':'',
    #                 'FREIGHT':'0',
    #                 'FILTER':'0',
    #                 'HOTELTYPES':'',
    #                 'PACKET':'1',
    #                 'PRICEPAGE':i})
    # g.go(url + qs)
    g.go(url)
    body = g.response.body
    response = body[body.find('<table'):len(body)]
    return response
def download_manga(link, path, zipping_type):
    g = Grab()
    g.go(link, log_file="manga.html")
    for item in g.doc.select('//div[@class="expandable chapters-link"]//table//a/@href'):
        chapter = link[:link.rfind('/')] + item._node + '?mature=1'
        download_chapter(chapter, path, zip=zipping_type)
def main():
    print('\n-- Parsing the Opera and Ballet Theatre playbill - ' + str(datetime.datetime.now()))
    opera = Grab(document_charset='utf-8', timeout=20, connect_timeout=20)
    opera.go('http://komiopera.ru/index.php?option=com_content&view=article&id=95&Itemid=134')
    # opera.response.body = clean_text(opera.response.body, 'normal')
    dates = opera.doc.select('//table//table//tr/td[1]/div/b')
    titles = opera.doc.select('//table//table//tr/td[2]/div/b')
    contents1 = opera.doc.select('//table//table//tr/td[2]/div/i')
    contents2 = opera.doc.select('//table//table//tr/td[3]/div/b')
    times = opera.doc.select('//table//table//tr/td[3]/div')
    date_for_db = data_change(dates[0].text(), 'komiopera')
    exist_date_event = last_date_event('komiopera', date_for_db)
    for date, title, content1, content2, time in zip(dates, titles, contents1, contents2, times):
        if exist_date_event.count(data_change(date.text(), 'komiopera')):
            print(data_change(date.text(), 'komiopera') + ' already exists')
        else:
            event = {
                'name': title.text().strip(),
                'date': data_change(date.text(), 'komiopera'),
                'time': time.text()[-5:],
                'type_event': 'teatr',
                'type_film': '',
                'price': 0,
                'source_id': 6,  # Komi opera
                'description': content1.text().strip() + ', ' + content2.text().strip(),
                'poster': ''
            }
            write_event_to_db(event)
def parse():
    g = Grab()
    base_url = 'https://www.buzzfeed.com'
    appendix_1 = '/?p='
    topics = ['world', 'politics', 'business', 'lgbt', 'tech', 'science',
              'music', 'animals', 'travel', 'style', 'sports']
    data = {}
    for topic in topics:
        articles_list = []
        for page in range(1, 10):
            time.sleep(0.2)
            g.go(base_url + '/' + topic + appendix_1 + str(page))
            urls = getPageUrls(g.response.body)
            for url in urls:
                g.go(base_url + url)
                article = getArticle(g.response.body)
                if len(article) > 1:
                    articles_list.append(article)
        data.update({topic: articles_list})
    data_size = 0
    for topic in data.keys():
        data_size += len(data[topic])
    print "{} articles in {} topics".format(data_size, len(data))
def proxy_validation(proxy: str, ptype: str, web_site: str = 'https://whoer.net/') -> bool:
    """
    Check whether a proxy is alive.

    :param proxy: IP:PORT of the proxy
    :param ptype: proxy type
    :param web_site: site that will be loaded to test the connection
    :return:
    """
    if proxy is None:
        return True
    try:
        from grab import Grab
        from grab import GrabError
    except ImportError:
        print('Proxy validation requires Grab!')
        return True
    g = Grab()
    g.setup(proxy=proxy, proxy_type=ptype, connect_timeout=5, timeout=60)
    try:
        print('Starting check proxy url={!r}'.format(web_site))
        g.go(web_site)
    except GrabError:
        print('Proxy {!r} is dead'.format(proxy))
        return False
    else:
        print('Proxy {!r} is live'.format(proxy))
        return True
def add_advert():
    print("Add new advertisement.")
    g = Grab(log_file="2.html")
    g.load_cookies('cookies.txt')
    g.go("http://m.avito.ru/add")
    # login_test()

    from selenium.webdriver import Firefox
    from selenium.webdriver.common.keys import Keys
    import selenium
    from PIL import Image

    browser = Firefox()
    driver = selenium.webdriver.Firefox()
    browser.get('http://m.avito.ru/profile/login')
    driver.implicitly_wait(10)
    elem = driver.find_element_by_css_selector(".control-self.control-self-email")
    elem.send_keys("*****@*****.**")
    """
    driver.find_element_by_name("password")
    element.send_keys("ivveqaem")
    driver.find_element_by_class_name("control-self control-self-submit button button-solid button-blue button-large")
    driver.find_element_by_partial_link_text("Войти")
    element.send_keys(Keys.ENTER)
    """
    # browser.get('http://m.avito.ru/add')
    browser.save_screenshot('current_page')
    current_page_img = Image.open('current_page')
    w, h = current_page_img.size
    captcha_img = current_page_img  # .crop((575, 505, w-155, h-1820))
    captcha_img.save('captcha.jpg', 'jpeg')
def scan_it_remote(request):
    if request.user.is_admin:
        siteremote = "http://hotline.gowius.com/addtask/"
        site = '20k.com.ua'
        id = int(request.POST.get('id', 0))
        category = Category.objects.filter(pk=id)
        if category:
            cat = category.get()
            items = ColorProduct.objects.filter(product__category=cat)
            result = []
            # scan = ScanHotline(category=cat, items=items)
            # scan.save()
            for item in items:
                if item.href and item.price > 0 and item.hrefok:
                    result.append({'id': item.id, 'url': item.href})
            g = Grab()
            g.setup(post={'url': site, 'items': json.dumps(result)})
            g.go(siteremote)
    return HttpResponse('ok')
def get_mag(link):
    idmag = link.split('/')
    # assert False, idmag
    firm = FirmHotline.objects.filter(itemid=int(idmag[2]))
    if firm:
        self.stdout.write("Firm exists:" + firm[0].name)
        return firm[0]
    else:
        fg = Grab(log_dir="/tmp", timeout=30)
        fg.go('http://hotline.ua' + link)
        body = fg.response.unicode_body()
        pyquery = pq(body)
        name = pyquery('ul.shop-title > li > h1').text()
        try:
            link = pyquery('ul.info-shops > li > p > a')[0].attrib['href']
        except:
            link = ""
        firm = FirmHotline(itemid=int(idmag[2]), name=name, url=link)
        firm.save()
        self.stdout.write("New Firm:" + firm.name)
        return firm
def get_weather(message):
    """Return the temperature for a 'city-date' message."""
    cities = {'dnepr': 'dnipropetrovsk-5077',
              'kyiv': 'kyiv-4944',
              'lviv': 'lviv-4949'}
    g = Grab()
    try:
        (city, date) = message.split('-')
    except ValueError:
        return 'Wrong data'
    try:
        g.go('https://www.gismeteo.ua/weather-{city}/'.format(
            city=cities[city]))
    except KeyError:
        return 'Wrong city'
    select_now = g.doc.select('//div[@id="weather"]'
                              '//div[contains(@class, "temp")]'
                              '//dd[contains(@class, "c")]').text()
    select_tommorow = ''.join([
        g.doc.select('//div[@id="tab_wdaily2"]'
                     '//div[contains(@class, "temp")]'
                     '//span[contains(@class, "c")]').text(),
        ' - ',
        g.doc.select('//div[@id="tab_wdaily2"]'
                     '/div[contains(@class, "temp")]'
                     '/em/span[contains(@class, "c")]').text()])
    temperature = select_tommorow if date == '2' else select_now
    return temperature.encode('utf-8')
def main():
    added = 0
    limit = 100
    count = 1
    stack = [base_url]
    doc_links = open("Links.txt", "w")
    for site in stack:
        stack_link = site
        print(str(added) + " - " + stack_link)
        g = Grab(log_file='html.html')
        g.go(stack_link)
        time.sleep(0.1)
        doc_file = open("docs/%i.txt" % added, "w")
        doc_file.write(g.doc.select("//*").text())
        doc_file.close()
        doc_links.write(str(added) + " - " + stack_link + '\n')
        added += 1
        if count == limit:
            continue
        for link in g.doc.select('//a'):
            try:
                href = base_url + link.attr("href")
            except DataNotFound:
                continue
            if count == limit:
                break
            if stack.count(href) == 0 and is_link_valid(href):
                stack.append(href)
                count += 1
    doc_links.close()
def param():
    from grab import Grab
    KOL = 1  # number of script iterations
    for m in range(KOL):
        g = Grab()
        g.go('http://192.168.100.6:10002/login/WebVision/ses_Fithness/')
        g.set_input('user', 'root')
        g.set_input('pass', 'GfhjkmKf;f123')
        g.submit()
        g.go(
            'http://192.168.100.6:10002/WebVision/ses_Fithness4/pg_so/pg_6/pg_mn/pg_1?com=attrsBr&tm'
        )
        name1, name2, full, mass1 = [], [], [], []
        for elem in g.doc.select('//w[*]/el[@id="text"]'):
            name1.append(elem.text())  # chiller: parameter names
        for elem in g.doc.select('//w[*]/el[@id="arg0val"]'):
            mass1.append(elem.text())  # chiller: values
        global full2
        name1 = [x for x in name1 if x != '%1']  # drop unneeded values
        name1 = (name1[2], name1[3], name1[8], name1[9], name1[29], name1[49],
                 name1[32], name1[33], name1[4], name1[31], name1[60]
                 )  # collect the names
        mass1 = (mass1[0], mass1[1], mass1[4], mass1[5], mass1[9], mass1[30],
                 mass1[10], mass1[11], mass1[3], mass1[12], mass1[39]
                 )  # collect the values
        full = [a + " = " + str(b) for a, b in zip(name1, mass1)]  # zip the two lists into one
        return full
def get_links(page, grab_=None):
    if grab_ is None:
        grab_ = Grab()
    grab_.go(page)
    return [
        'http://tagbrand.com%s' % link.attr('href')
        for link in grab_.doc.select('//dl[*]/dd/p[1]/a')]
class UltimateRewardsGrabber:
    def __init__(self):
        self.g = Grab()

    def grab(self):
        self.g.go(BASE_URL)
        divs = self.g.doc.select('//div[contains(@class, "mn_srchListSection")]')
        for div in divs:
            try:
                merchants = div.text().split('/$')
                for merchant in merchants:
                    merchant = merchant.split('Details ')[1]
                    title = ' '.join(merchant.split(' ')[:-2])
                    cost = merchant.split(' ')[-2]
                    print title, ' - ', cost
            except IndexError:
                pass
            merchant = models.Item(title=title, cost=cost)
            db.session.add(merchant)
        db.session.commit()

    def save(self):
        pass
def transya():
    # Translation into Russian. Thanks, Yandex Translate! :)
    g_translate_img_alt = Grab()
    g_translate_img_title = Grab()
    g_translate_description = Grab()
    count = 0
    max_index = len(Topics.href)
    for count in range(9):
        # Translate the image description
        resp_front_img_alt_ru = json.loads(g_translate_img_alt.go(
            'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20170514T220842Z.5b2c14ecd7990670.3ccb355751262f1359f3c3ff0b9b7d5447ce39a1',
            post={"text": Topics.curi_tr_get_topic_front_img_alt()[count],
                  'lang': 'en-ru',
                  'format': 'plain'}).unicode_body(ignore_errors=True, fix_special_entities=True))
        Topics.front_img_alt_ru.append(str(resp_front_img_alt_ru))
        # Translate the post title
        resp_title_ru = json.loads(g_translate_img_title.go(
            'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20170514T220842Z.5b2c14ecd7990670.3ccb355751262f1359f3c3ff0b9b7d5447ce39a1',
            post={"text": Topics.curi_tr_get_topic_title()[count],
                  'lang': 'en-ru',
                  'format': 'plain'}).unicode_body(ignore_errors=True, fix_special_entities=True))
        Topics.title_ru.append(str(resp_title_ru["text"]))
        # Translate the extended post description
        resp_descriptions_ru = json.loads(g_translate_img_alt.go(
            'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20170514T220842Z.5b2c14ecd7990670.3ccb355751262f1359f3c3ff0b9b7d5447ce39a1',
            post={"text": Topics.get_descriptions()[count],
                  'lang': 'en-ru',
                  'format': 'plain'}).unicode_body(ignore_errors=True, fix_special_entities=True))
        Topics.descriptions_ru.append(str(resp_descriptions_ru["text"]))
        count = count + 1
def SaveImageYandex(text, imageCount, path, w='800', h='600'):
    global prefix
    prefix += 1
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True',
             log_dir='log', headers={'Accept-Language': 'ru,en;q=0.8'})
    query = urllib.urlencode({'text': text.encode('utf-8'), 'iw': w, 'ih': h})
    url = 'http://images.yandex.ru/yandsearch?isize=gt&itype=jpg&' + query
    g.go(url)
    image_number = 0
    f2 = open('out.txt', 'a')
    filename = str(prefix) + '-' + StringForFilename(text) + '.jpg'
    f2.write(filename + '\n')
    f2.close()
    while image_number < imageCount:
        image_number += 1
        tmp = g.doc.select('//html/body/div[2]/div/div[2]/div[2]/div[1]/div[contains(@class, "b-images-item")][' + str(image_number) + ']').attr('onclick')
        match = re.search(r'"fullscreen":\{"url":"(.*?)"', tmp)
        if match:
            image_URL = match.group(1)
            print str(image_number) + '. ' + image_URL
            ext = GetFileExtFromURL(image_URL)
            filename = str(prefix) + '-' + StringForFilename(text) + '-' + str(image_number) + '.jpg'
            try:
                patht = os.path.join(path, filename)
                print patht
                urllib.urlretrieve(image_URL, patht)
            except:
                pass
        else:
            print "Can't find image for this query " + str(image_number)
def just_print(mark):
    g = Grab()
    g.go(mark.url)
    body = g.doc.tree
    title = body.xpath('//*/head/title/text()')
    description = body.xpath('//*/meta[@name="description"]/@content')
    if title == []:
        title = u'Странно, но заголовок отсутствует'
    else:
        title = title[0]
    if description == []:
        description = body.xpath('//*/meta[@property="og:description"]/@content')
        if description == []:
            description = u'Описание отсутствует'
        else:
            description = description[0][0:200]
    else:
        description = description[0][0:200]
    p = re.compile("(.*\.\w{2,3})/")
    res = p.findall(mark.url)[0]
    favicon = res + '/favicon.ico'
    print('message from task')
    mark.title = title
    mark.description = description
    mark.favicon = favicon
    mark.save()
def prepare_and_create_grab(url):
    cache_name = split_url_by_volume_and_chapter(url)
    dir_name = cache_name[0]
    file_name = cache_name[1] + '.html'
    file_path = os.path.join(generate_info_ranobe.DIR_RANOBE, 'cache', dir_name, file_name)
    data = None
    if not os.path.exists(os.path.dirname(file_path)):
        os.makedirs(os.path.dirname(file_path))
    if not os.path.exists(file_path):
        g = Grab()
        g.go(url)
        with open(file_path, mode='w', encoding='utf8') as f:
            text = g.response.body
            f.write(text)
            if not data:
                data = text
    if not data:
        with open(file_path, encoding='utf8') as f:
            data = f.read()
    return Grab(data)
def get_mag(link):
    idmag = link.split('/')
    firm = FirmHotline.objects.filter(itemid=int(idmag[2]))
    if firm:
        print("Firm exists:" + firm[0].name)
        return firm[0]
    else:
        fg = Grab(log_dir="/tmp", timeout=300)
        fg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
                          source_type='text_file', proxy_type='http', auto_change=True)
        fg.go('http://hotline.ua' + link)
        body = fg.response.body
        pyquery = pq(body)
        name = pyquery('ul.shop-title > li > h1').text()
        try:
            link = pyquery('ul.info-shops > li > p > a')[0].attrib['href']
        except:
            link = ""
        firm = FirmHotline(itemid=int(idmag[2]), name=name, url=link)
        firm.save()
        print("New Firm:" + firm.name)
        return firm
def rozparse(self):
    g = Grab()
    names = []
    prices = []
    count = 1
    paginator = []
    res = {}
    g.go('http://rozetka.com.ua/stabilizers/c144719/')
    for i in g.doc.select('//ul[@name="paginator"]/li[@class="paginator-catalog-l-i"]/a'):
        paginator.append(i.text())
    while count < (int(paginator[-1]) + 1):
        g.go('http://rozetka.com.ua/stabilizers/c144719/page=' + str(count) + '/')
        for title in g.doc.select('//div[@class="g-i-tile-i-box-desc"]/div[@class="g-i-tile-i-title clearfix"]'):
            names.append(title.text())
        for i in g.doc.select('//div[@class="g-i-tile-i-box-desc"]'):
            prices.append(json.loads(parse.unquote(i.text().split('"')[1]))['price'])
        print(count)
        count += 1
    for key, value in zip(names, prices):
        res[key] = value
    print(res['Электромир Volter СНПТО 18пт'])
    return print(res)
def test_body_maxsize(self):
    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(body_maxsize=100)
    SERVER.RESPONSE['get'] = 'x' * 1024 * 1024
    g.go(SERVER.BASE_URL)
    # Should be less than 50kb
    self.assertTrue(len(g.response.body) < 50000)
def get_course_gold():
    url = "https://pwcats.info/servers/scorpio"
    g = Grab()
    g.go(
        url,
        user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/62.0.3202.94 '
                   'YaBrowser/17.11.1.990 Yowser/2.5 Safari/537.36')
    pay_list = g.doc.select(
        '/html/body/div[1]/div/div/div[2]/aside/table[1]/tbody/tr[*]/td[1]/text()'
    ).node_list()
    sale_list = g.doc.select(
        '/html/body/div[1]/div/div/div[2]/aside/table[1]/tbody/tr[*]/td[2]/text()'
    ).node_list()
    # print(pay_list[0])
    # print(pay_list[0].replace(' ', ''))
    for i in range(0, len(pay_list)):
        string = pay_list[i].replace(' ', '')
        string = string.replace('\n', '')
        index = string.find('(')
        pay_list[i] = string[0:index]
    for i in range(0, len(sale_list)):
        str_sale = sale_list[i].replace(' ', '')
        str_sale = str_sale.replace('\n', '')
        index = str_sale.find('(')
        sale_list[i] = str_sale[0:index]
    return "Продают по " + min(pay_list) + '\nCкупают по ' + max(sale_list)
def login_test():
    g = Grab(log_file="1.html")
    g.go("http://m.avito.ru/profile")
    g.doc.set_input("login", "login")
    g.doc.set_input("password", "password")
    g.doc.submit()
    g.cookies.save_to_file('cookies.txt')
def start():
    CSVFile(header=['Artist', 'Album', 'Genre', 'Style', 'Year', 'Rating'])
    page = 1
    page_not_found = None
    while page_not_found is None:
        try:
            print('Page', page)
            pitchfork_page = Grab()
            pitchfork_page.go(PITC_URL + str(page))
            soup = Soup(pitchfork_page.doc.select('//div[@id="main"]/ul[@class="object-grid "]').html(), 'lxml')
            albums_on_page = []
            for link in soup.find_all('a', href=True):
                albums_on_page.append('http://pitchfork.com' + link['href'])
            pool = ThreadPool(THREADS)
            pool.map(pitchfork, albums_on_page)
            page += 1
            # if page > 1:
            #     page_not_found = True
        except IndexError as error:
            print(error)
            page_not_found = True
def parse(last_page=1):
    i = 1
    print('Last page is {0}'.format(last_page))
    for x in range(1, last_page + 1):
        main_domain = 'http://4pda.ru/page/{0}/'.format(x)
        g = Grab()
        g.go(main_domain)
        nodes = g.doc.select('//article[@class="post"]').node_list()
        if nodes:
            try:
                f = open('4pda.csv', 'x')
                writer = csv.writer(f)
                writer.writerow(['№', 'Заголовок', 'Дата публикации', 'Ссылка'])
            except FileExistsError:
                f = open('4pda.csv', 'a')
                writer = csv.writer(f)
            finally:
                for n, node in enumerate(nodes):
                    header = node.xpath('//div[@class="description"]//h1//span')
                    links = node.xpath('//div[@class="description"]//h1//a')
                    dates = node.xpath('//div//div//div//em')
                    writer.writerow([
                        i,
                        header[n].text,
                        dates[n].text,
                        links[n].attrib['href']
                    ])
                    i += 1
                f.close()
            print(x)
        else:
            return 'Posts not found'
    return 'Job done.'
def getModelLink(modelName):
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True',
             log_dir='log', headers={'Accept-Language': 'ru,en;q=0.8'})
    url = 'http://market.yandex.ru/'
    g.go(url)
    try:
        paginatorHTML = g.doc.select(popt['pagination']).html()
        pagesLinks = GetAllLinksFromString(paginatorHTML, url)
    except:
        pagesLinks = []
    pagesLinks.append(url)
    pagesLinks = list(set(pagesLinks))
    pagesCount = len(pagesLinks)
    newPagesCount = 1
    while pagesCount != newPagesCount:
        lastPage = len(pagesLinks) - 1
        url = pagesLinks[lastPage]
        g.go(url)
        try:
            paginatorHTML = g.doc.select(popt['pagination']).html()
            newlinks = GetAllLinksFromString(paginatorHTML, url)
        except:
            newlinks = []
        for newlink in newlinks:
            pagesLinks.append(newlink)
        pagesLinks = list(set(pagesLinks))
        newPagesCount = len(pagesLinks)
    return pagesLinks
def get_every_day():
    global caption
    global date_post
    url = "https://pp.userapi.com/"
    g = Grab()
    g.go(
        "https://vk.com/skorpw",
        user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 '
                   'YaBrowser/17.11.1.990 Yowser/2.5 Safari/537.36')
    # list = g.doc.body.decode('cp1251')
    try:
        image = g.doc.select(
            './/*[@id="public_wall"]/*[@id="page_wall_posts"]/div/div/div[2]/div[1]/div[1]/div[1]/div[2]/a[@aria-label]/@onclick'
        )[0].text()
        caption = 'Ежа'
        date_time = datetime.datetime.now()
        date_post = date_time.date()
        json_string = get_indexes(image)
        res = json.loads(json_string)
        result = res['temp']['y']
        url_image = result
        # url_image = result[0]
        # url_image = "http://www.kartinki.me/pic/201506/1920x1200/kartinki.me-21699.jpg"
        return url_image
    except IndexError:
        return None
def process_news_list_page(self, url):
    """Process a page containing links to news articles."""
    print('Trying open url: {}'.format(url))
    g = Grab(proxy='127.0.0.1:{}'.format(self.socks_port),
             proxy_type='socks5', timeout=90, connect_timeout=30)
    try:
        g.go(url)
    except GrabNetworkError as e:
        # Connection error
        print('Connection error: {}'.format(e))
        self.change_identity()  # Switch to a new identity
        return url
    # Just in case, check the response code
    if g.response.code != 200:
        print('Error code: {}'.format(g.response.code))
        self.change_identity()
        return url
    # Check whether we ran into a captcha
    captcha = g.doc.select(self.captcha_xpath).text_list()
    if captcha:
        print('Captcha found: {}, setting new identity'.format(captcha[0]))
        self.change_identity()  # Switch to a new identity
        return url
    # Look for links to news articles
    news_links = g.doc.select(self.links_xpath).text_list()
    for news_link in news_links:
        founded_url = news_link.replace(
            re.findall('.+//[^/]+', news_link)[0],
            '')  # Because the URL must be passed without the domain for downloading
        pattern = self.config.get('detailed_page_re')
        match = re.search(
            pattern, founded_url
        )  # Extra check that the extracted URL matches the regex
        if match:
            print('Found link: {}, from {}'.format(founded_url, self.url))
            # Put it into the download queue
            retrieve_information.apply_async(args=(self.config, founded_url),
                                             kwargs={
                                                 'related_from_url': None,
                                                 'region': self.region_name
                                             },
                                             priority=251)
    # Look for a link to the next page
    next_page = g.doc.select(self.next_page_xpath).text_list()
    if next_page:
        if self.requests_delay is not None:
            sleep(self.requests_delay)  # To avoid being too pushy
        # Move on to the next page
        return '{}{}'.format(self.base_url, next_page[0])
    # No error occurred and no link to the next page was found
    return False
def parse_data_from_url(report_id):
    logging = get_task_logger('parse')
    locale.setlocale(
        locale.LC_TIME, 'ru_RU.UTF-8'
    )  # Haven't checked yet how Django itself handles the locale; will test it later
    report = Report.objects.get(id=report_id)
    report.pages_amount = get_pages_amount(report_id)
    report.title = get_title(report_id)
    weekly = json.loads(report.weekly)
    hourly = json.loads(report.hourly)
    for page in range(1, report.pages_amount + 1):
        url = report.assemble_url() + '?page={page}'.format(page=page)
        g = Grab(log_file='page_out.html')
        try:
            g.go(url)
        except GrabTimeoutError:
            continue
        page_urls = g.doc.select(
            '//*[@id="offers_table"]//*[@data-cy="listing-ad-title"]/@href'
        ).text_list()
        for page_url in page_urls:
            dt = get_page_datetime(page_url)
            if dt:
                hourly[str(dt.hour)] += 1
                weekly[dt.strftime('%A')] += 1
    report.hourly = json.dumps(hourly)
    report.weekly = json.dumps(weekly)
    report.save()
    report.send_mail()
def check_following(self, url, token_id, loyalty_id):
    follow = False
    self.refresh_token(token_id)
    soc_token = SocToken.query.get(token_id)
    action = PaymentLoyalty.query.get(loyalty_id)
    target = json.loads(action.data)
    g = Grab()
    g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})
    url_api = self.API_PATH + self.API_PARTS['subscriptions']
    while not follow:
        g.go(url_api)
        subscriptions = json.loads(g.response.body)
        if 'items' not in subscriptions:
            break
        if len(subscriptions['items']) <= 0:
            break
        for subscribe in subscriptions['items']:
            if 'snippet' in subscribe and 'channelId' in subscribe['snippet'] \
                    and subscribe['snippet']['channelId'] == target['channelId']:
                follow = True
        if 'nextPageToken' not in subscriptions:
            break
        if len(subscriptions['nextPageToken']) <= 0:
            break
        url_api = "%s%s&pageToken=%s" % (
            self.API_PATH,
            self.API_PARTS['subscriptions'],
            subscriptions['nextPageToken'])
    return follow
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
def parse_famous(year, month, day):
    '''
    Parse famous people from famousbirthdays.com by month and day.
    The year parameter is ignored for now.
    '''
    months = get_months()
    url = 'http://www.famousbirthdays.com/%s%d.html' % (months[month], day)
    g = Grab()
    g.setup()
    g.go(url)
    elements = g.doc.select('//ul[@class="top-celebrity-col4 col1"]/li')
    list = []
    for element in elements:
        src = element.node.getchildren()[1].getchildren()[0].getchildren()[0].get('src')
        age = element.node.getchildren()[2].getchildren()[0].text_content().split(' ')[-1]
        name = element.node.getchildren()[2].getchildren()[0].getchildren()[0].text_content()
        description = element.node.getchildren()[2].getchildren()[1].text_content()
        list.append({'src': src, 'name': name, 'age': age, 'description': description})
    return list
def test_empty_useragent_pycurl(self):
    g = Grab(transport=GRAB_TRANSPORT)
    # An empty string disables the default pycurl user-agent
    g.setup(user_agent='')
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers'].get('user-agent', ''), '')
def clean_url(self):
    url = self.cleaned_data['url']
    url_regex = 'https?:\/\/diesel.elcat.kg\/index.php\?showtopic=([\d]+).*'
    compiled_url_regex = re.compile(url_regex)
    urls = compiled_url_regex.findall(url)
    if urls:
        real_url = 'http://diesel.elcat.kg/index.php?showtopic=' + urls[0]
        g = Grab()
        g.setup(connect_timeout=30, timeout=60)
        try:
            g.go(url=real_url)
        except:
            raise forms.ValidationError(u'Что то пошло не так.')
        html = g.response.body
        mes_regex = '<div class="([\w]+)" id=\\\'([\w]+)-([\d]+)\\\'>'
        compiled_mes_regex = re.compile(mes_regex)
        topics = compiled_mes_regex.findall(html)
        if not topics:
            raise forms.ValidationError(u'Топик не существует или удален.')
        return real_url
    else:
        raise forms.ValidationError(u'Не коректный url!')
def test_nobody(self):
    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(nobody=True)
    SERVER.RESPONSE['get'] = 'foo'
    g.go(SERVER.BASE_URL)
    self.assertEqual('', g.response.body)
    self.assertTrue(len(g.response.head) > 0)
def run(self):
    global rining
    global success
    global missedCount
    if rining:
        g = Grab()
        g.setup(hammer_mode=True, hammer_timeouts=((10, 15), (20, 30), (60, 80)))
        # g.load_proxylist('proxy.lst', 'text_file', proxy_type='http', auto_init=False, auto_change=True)
        try:
            g.go(uri)
        except Exception:
            print "\n[!] No valid proxy or network error ...\n"
            rining = 0
            sys.exit(1)
        for i in range(len(words)):
            if rining and not success:
                sleep(1)
                nextword = getword()
                value = last + nextword
                try:
                    self.bot(value, nextword, g)
                except Exception:
                    print "\n[!] Network error ...\n"
                    rining = 0
                    sys.exit(1)
        rining = 0
def loadPage(url, adult, child, country, i):
    print 'run Grab'
    g = Grab()
    g.setup(log_dir='tcc_tayland_2_1')
    g.setup(timeout=250, connect_timeout=200)
    g.setup(proxy='220.101.93.3:3128', proxy_type='http')
    qs = urlencode({'samo_action': 'PRICES',
                    'TOWNFROMINC': '101',
                    'STATEINC': country_op,
                    'TOURTYPE': '0',
                    'TOURINC': '0',
                    'CHECKIN_BEG': '20160731',
                    'NIGHTS_FROM': '2',
                    'CHECKIN_END': '20160831',
                    'NIGHTS_TILL': '10',
                    'ADULT': adult,
                    'CURRENCY': '2',
                    'CHILD': child,
                    'TOWNTO_ANY': '1',
                    'TOWNTO': '',
                    'STARS_ANY': '1',
                    'STARS': '',
                    'hotelsearch': '0',
                    'HOTELS_ANY': '1',
                    'HOTELS': '',
                    'MEAL': '',
                    'FREIGHT': '0',
                    'FILTER': '0',
                    'HOTELTYPES': '',
                    'PACKET': '1',
                    'PRICEPAGE': i})
    print (url + qs)
    g.go(url + qs)
    body = g.response.body
    return body
def test_cookiefile(self):
    g = Grab(transport=GRAB_TRANSPORT)
    # Empty file should not raise Exception
    open(TMP_FILE, 'w').write('')
    g.setup(cookiefile=TMP_FILE)
    g.go(SERVER.BASE_URL)

    cookies = [{'name': 'spam', 'value': 'ham'}]
    json.dump(cookies, open(TMP_FILE, 'w'))

    # One cookie is sent in the server response
    # Another cookie is passed via the `cookiefile` option
    SERVER.RESPONSE['cookies'] = {'godzilla': 'monkey'}
    g.setup(cookiefile=TMP_FILE)
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['cookies']['spam'].value, 'ham')

    # This is the correct result of combining the two cookies
    MERGED_COOKIES = [('godzilla', 'monkey'), ('spam', 'ham')]

    # g.cookies should contain the merged cookies
    self.assertEqual(set(MERGED_COOKIES), set(g.cookies.items()))

    # The `cookiefile` file should contain the merged cookies
    self.assertEqual(
        set(MERGED_COOKIES),
        set((x['name'], x['value']) for x in json.load(open(TMP_FILE))))
def test_khest():
    ua = UserAgent()
    grab = Grab(timeout=30,
                log_file='%s/vparser/tmp/pars/log.html' % os.path.split(PROJECT_PATH)[0])
    grab.setup(proxy='46.148.30.250:8080', proxy_type='http',
               proxy_userpwd=CREDENTIALS)  # , log_dir='vparser/tmp'
    grab.go(
        'http://kharkov.kha.slando.ua/obyavlenie/sdam-gostinku-tsentr-vse-udobstva-ID75tep.html#13fed9ae6e;promoted'
    )
    # grab.go('http://kharkov.kha.slando.ua/nedvizhimost/arenda-kvartir/')
    # ff = grab.doc.select('//div[@class="pricelabel tcenter"]')
    # print ff.text()
    # for f in get_adv_on_page(grab):
    #     print f
    # g.setup(cookies={u'domain': u'secure.e-konsulat.gov.pl', u'name':
    #     u'MSZ', u'value': u'64e8734b-986c-4cd4-be44-b2c112ec49c8', u'expiry':
    #     '1362046140', u'path': u'/', u'secure': 'False'})
    # print get_adv_photo(grab)
    # for dd in grab.doc.select('//div[@class="pding5_10"]'):
    #     if dd.text().split(':')[0] == u'Количество комнат':
    #         print dd.text().split(':')[1]
    phones = get_phone(grab)
    # print len(phones)
    slando = Slandos()
    # for phone in phones:
    #     print phone
    moder_phone(phones, slando)
def test_load_dump(self):
    g = Grab(transport=GRAB_TRANSPORT)
    cookies = {'foo': 'bar', 'spam': 'ham'}
    g.setup(cookies=cookies)
    g.go(SERVER.BASE_URL)
    g.dump_cookies(TMP_FILE)
    self.assertEqual(set(cookies.items()),
                     set((x['name'], x['value']) for x in json.load(open(TMP_FILE))))

    # Test non-ascii
    g = Grab(transport=GRAB_TRANSPORT)
    cookies = {'foo': 'bar', 'spam': u'бегемот'}
    g.setup(cookies=cookies)
    g.go(SERVER.BASE_URL)
    g.dump_cookies(TMP_FILE)
    self.assertEqual(set(cookies.items()),
                     set((x['name'], x['value']) for x in json.load(open(TMP_FILE))))

    # Test load cookies
    g = Grab(transport=GRAB_TRANSPORT)
    cookies = [{'name': 'foo', 'value': 'bar'},
               {'name': 'spam', 'value': u'бегемот'}]
    json.dump(cookies, open(TMP_FILE, 'w'))
    g.load_cookies(TMP_FILE)
    self.assertEqual(set(g.cookies.items()),
                     set((x['name'], x['value']) for x in cookies))
def translate(word, key, lan1='en', lan2='ru', alt=True, syn=True):
    """Prints the number of counts, word, translation, and example
    from lan1 to lan2 according to Translate.Google."""

    # First, write down a translation in some auxiliary txt file
    # and load it in json format
    g = Grab(log_file='dict.txt')
    link = 'http://translate.google.ru/translate_a/t?client=x&text='\
        + word + '&sl=' + lan1 + '&tl=' + lan2
    g.go(link)
    data = json.load(open('dict.txt'))

    # Then, let's try to get all the necessary elements in json
    translation, noun, alternatives, synonims = 0, 0, 0, 0
    try:
        translation = data[u'sentences'][0][u'trans']
        noun = data[u'dict'][0][u'pos']
        alternatives = data['dict'][0]['terms']
        synonims = data['dict'][0]['entry'][0]['reverse_translation']
    except:
        pass

    # German nouns should begin with capital letter
    if lan1 == 'de' and noun == u'имя существительное':
        word = word.title()

    # Finally, print out counts, word, translation with alternatives
    # and synonims, if applicable. Encoding is added up to allow
    # printing in cmd if you have a russian version of Windows
    if translation:
        print('[' + str(key) + ']', word, ': ', translation)
        if alt and alternatives:
            [print(i, end=', ') for i in alternatives]
            print('\r')
        if syn and synonims:
            [print(i.encode('cp866', errors='replace'), end=', ') for i in synonims]
            print('\n')
def getproduct(href, item, scan, oneposition):
    pg = Grab(log_dir="/tmp", timeout=30)
    # proxy = Proxy.objects.filter(active=True).order_by('?')[0]
    # # proxyaddrlist = proxy.name.split(':')[0:1]
    # # proxyuserlist = proxy.name.split(':')[2:3]
    # proxyaddr = proxy.name.split(':')[0] + ':' + proxy.name.split(':')[1]
    # proxyuser = proxy.name.split(':')[2] + ':' + proxy.name.split(':')[3]
    # print proxyaddr
    # # assert False, proxyuser
    # pg.setup(proxy=proxyaddr, proxy_userpwd=proxyuser, proxy_type="http")
    pg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
                      source_type='text_file', proxy_type='http', auto_change=True)
    # print pg.config['proxy']
    # pg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt", source_type='text_file', proxy_type='http', auto_change=True)
    try:
        purl = "http://hotline.ua" + href
        pg.go(purl)
        # pass
    except Exception, e:
        print "Error: " + purl
        return