def parse():
    # Scrape BuzzFeed topic listing pages and collect the articles found on them
    g = Grab()
    base_url = 'https://www.buzzfeed.com'
    appendix_1 = '/?p='
    topics = ['world', 'politics', 'business', 'lgbt', 'tech', 'science',
              'music', 'animals', 'travel', 'style', 'sports']
    data = {}
    for topic in topics:
        articles_list = []
        for page in range(1, 10):
            time.sleep(0.2)
            g.go(base_url + '/' + topic + appendix_1 + str(page))
            urls = getPageUrls(g.response.body)
            for url in urls:
                g.go(base_url + url)
                article = getArticle(g.response.body)
                if len(article) > 1:
                    articles_list.append(article)
        data.update({topic: articles_list})
    data_size = 0
    for topic in data.keys():
        data_size += len(data[topic])
    print "{} articles in {} topics".format(data_size, len(data))
def login_test():
    g = Grab(log_file="1.html")
    g.go("http://m.avito.ru/profile")
    g.doc.set_input("login", "login")
    g.doc.set_input("password", "password")
    g.doc.submit()
    g.cookies.save_to_file('cookies.txt')
def get_source_page(self, search_text):
    """Getting a source page by given search parameter"""
    grab = Grab()
    grab.go(u"https://ya.ru/")
    grab.doc.set_input(u"text", search_text)
    grab.doc.submit()
    return grab.response.body
def test_useragent_simple(self):
    g = Grab(transport=GRAB_TRANSPORT)
    # Simple case: setup user agent manually
    g.setup(user_agent='foo')
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers']['user-agent'], 'foo')
def forge_request(self, **kwargs):
    g = Grab()
    data = {
        'user': self.user,
        'pass': self.password,
    }
    url = "%ssms.cgi" % SMS_TWO_PREFIX
    if 'url' in kwargs:
        url = '%s%s' % (SMS_TWO_PREFIX, kwargs['url'])
        del kwargs['url']
    if 'frm' in kwargs:
        data.update({'from': kwargs['frm']})
        del kwargs['frm']
    data.update(kwargs)
    post = urllib.urlencode(data)
    # request = urllib2.Request(url, post)
    # grab implementation
    g.setup(post=data)
    try:
        self._response = g.go(url)
    except GrabError, e:
        self.http_error = {
            'code': e[0],
            'content': e[1]
        }
def get_links(page, grab_=None):
    if grab_ is None:
        grab_ = Grab()
    grab_.go(page)
    return [
        'http://tagbrand.com%s' % link.attr('href')
        for link in grab_.doc.select('//dl[*]/dd/p[1]/a')]
def parse_famous(year, month, day):
    '''Parse famous people from famousbirthdays.com by month and day; year is ignored for now'''
    months = get_months()
    url = 'http://www.famousbirthdays.com/%s%d.html' % (months[month], day)
    g = Grab()
    g.setup()
    g.go(url)
    elements = g.doc.select('//ul[@class="top-celebrity-col4 col1"]/li')
    result = []
    for element in elements:
        src = element.node.getchildren()[1].getchildren()[0].getchildren()[0].get('src')
        age = element.node.getchildren()[2].getchildren()[0].text_content().split(' ')[-1]
        name = element.node.getchildren()[2].getchildren()[0].getchildren()[0].text_content()
        description = element.node.getchildren()[2].getchildren()[1].text_content()
        result.append({'src': src, 'name': name, 'age': age, 'description': description})
    return result
def parse(last_page=1):
    i = 1
    print('Last page is {0}'.format(last_page))
    for x in range(1, last_page + 1):
        main_domain = 'http://4pda.ru/page/{0}/'.format(x)
        g = Grab()
        g.go(main_domain)
        nodes = g.doc.select('//article[@class="post"]').node_list()
        if nodes:
            try:
                f = open('4pda.csv', 'x')
                writer = csv.writer(f)
                writer.writerow(['№', 'Заголовок', 'Дата публикации', 'Ссылка'])
            except FileExistsError:
                f = open('4pda.csv', 'a')
                writer = csv.writer(f)
            finally:
                for n, node in enumerate(nodes):
                    header = node.xpath('//div[@class="description"]//h1//span')
                    links = node.xpath('//div[@class="description"]//h1//a')
                    dates = node.xpath('//div//div//div//em')
                    writer.writerow([
                        i,
                        header[n].text,
                        dates[n].text,
                        links[n].attrib['href']
                    ])
                    i += 1
                f.close()
            print(x)
        else:
            return 'Posts not found'
    return 'Job done.'
def getModelLink(modelName):
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True',
             log_dir='log', headers={'Accept-Language': 'ru,en;q=0.8'})
    url = 'http://market.yandex.ru/'
    g.go(url)
    try:
        paginatorHTML = g.doc.select(popt['pagination']).html()
        pagesLinks = GetAllLinksFromString(paginatorHTML, url)
    except:
        pagesLinks = []
    pagesLinks.append(url)
    pagesLinks = list(set(pagesLinks))
    pagesCount = len(pagesLinks)
    newPagesCount = 1
    while pagesCount != newPagesCount:
        lastPage = len(pagesLinks) - 1
        url = pagesLinks[lastPage]
        g.go(url)
        try:
            paginatorHTML = g.doc.select(popt['pagination']).html()
            newlinks = GetAllLinksFromString(paginatorHTML, url)
        except:
            newlinks = []
        for newlink in newlinks:
            pagesLinks.append(newlink)
        pagesLinks = list(set(pagesLinks))
        newPagesCount = len(pagesLinks)
    return pagesLinks
def check_following(self, url, token_id, loyalty_id):
    # Page through the subscriptions API until the target channelId is found
    follow = False
    self.refresh_token(token_id)
    soc_token = SocToken.query.get(token_id)
    action = PaymentLoyalty.query.get(loyalty_id)
    target = json.loads(action.data)
    g = Grab()
    g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})
    url_api = self.API_PATH + self.API_PARTS['subscriptions']
    while not follow:
        g.go(url_api)
        subscriptions = json.loads(g.response.body)
        if 'items' not in subscriptions:
            break
        if len(subscriptions['items']) <= 0:
            break
        for subscribe in subscriptions['items']:
            if 'snippet' in subscribe and 'channelId' in subscribe['snippet'] \
                    and subscribe['snippet']['channelId'] == target['channelId']:
                follow = True
        if 'nextPageToken' not in subscriptions:
            break
        if len(subscriptions['nextPageToken']) <= 0:
            break
        url_api = "%s%s&pageToken=%s" % (
            self.API_PATH,
            self.API_PARTS['subscriptions'],
            subscriptions['nextPageToken'])
    return follow
def start():
    CSVFile(header=['Artist', 'Album', 'Genre', 'Style', 'Year', 'Rating'])
    page = 1
    page_not_found = None
    while page_not_found is None:
        try:
            print('Page', page)
            pitchfork_page = Grab()
            pitchfork_page.go(PITC_URL + str(page))
            soup = Soup(pitchfork_page.doc.select(
                '//div[@id="main"]/ul[@class="object-grid "]').html(), 'lxml')
            albums_on_page = []
            for link in soup.find_all('a', href=True):
                albums_on_page.append('http://pitchfork.com' + link['href'])
            pool = ThreadPool(THREADS)
            pool.map(pitchfork, albums_on_page)
            page += 1
            # if page > 1:
            #     page_not_found = True
        except IndexError as error:
            print(error)
            page_not_found = True
def prepare_and_create_grab(url):
    cache_name = split_url_by_volume_and_chapter(url)
    dir_name = cache_name[0]
    file_name = cache_name[1] + '.html'
    file_path = os.path.join(generate_info_ranobe.DIR_RANOBE, 'cache', dir_name, file_name)
    data = None
    if not os.path.exists(os.path.dirname(file_path)):
        os.makedirs(os.path.dirname(file_path))
    if not os.path.exists(file_path):
        g = Grab()
        g.go(url)
        with open(file_path, mode='w', encoding='utf8') as f:
            text = g.response.body
            f.write(text)
        if not data:
            data = text
    if not data:
        with open(file_path, encoding='utf8') as f:
            data = f.read()
    return Grab(data)
def __init__(self, steam_account, login_steam, pass_steam, code_link):
    Grab.__init__(self)
    # self.base_page = BasePage(self)
    self.steam_account = steam_account
    self.login_steam = login_steam
    self.pass_steam = pass_steam
    self.code_link = code_link
    self.steam_id = None
    self.session_id = None
    cookiefile = '../cookies/' + login_steam + '.txt'
    self.setup(
        headers={
            'Accept': "text/javascript, text/html, application/xml, text/xml, */*",
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'X-Prototype-Version': '1.7',
            'X-Requested-With': 'XMLHttpRequest'
        },
        cookiefile=cookiefile,
        reuse_cookies=True,
        debug_post=True,
        log_file='../log_steam_account/log_' + str(self.login_steam) + '.html'
    )
def add_advert():
    print("Add new advertisement.")
    g = Grab(log_file="2.html")
    g.load_cookies('cookies.txt')
    g.go("http://m.avito.ru/add")
    # login_test()
    from selenium.webdriver import Firefox
    from selenium.webdriver.common.keys import Keys
    import selenium
    from PIL import Image
    browser = Firefox()
    driver = selenium.webdriver.Firefox()
    browser.get('http://m.avito.ru/profile/login')
    driver.implicitly_wait(10)
    elem = driver.find_element_by_css_selector(".control-self.control-self-email")
    elem.send_keys("*****@*****.**")
    """
    driver.find_element_by_name("password")
    element.send_keys("ivveqaem")
    driver.find_element_by_class_name("control-self control-self-submit button button-solid button-blue button-large")
    driver.find_element_by_partial_link_text("Войти")
    element.send_keys(Keys.ENTER)
    """
    # browser.get('http://m.avito.ru/add')
    browser.save_screenshot('current_page')
    current_page_img = Image.open('current_page')
    w, h = current_page_img.size
    captcha_img = current_page_img  # .crop((575, 505, w-155, h-1820))
    captcha_img.save('captcha.jpg', 'jpeg')
def get_data(url):
    '''Getting data (price and offer hrefs) from Yandex Realty with client parameters'''
    # print(url)
    price_list = []
    href_list = []
    g = Grab()
    g.go(url)
    # search html class with price
    data_list = g.xpath_list('//*[@class="serp-item__price"]')
    total = 0
    for p in data_list:
        price = price_format(p.text_content())
        total += price
        price_list.append(price)
    # search html class with href
    data_list = g.xpath_list('//*[@class="link link_redir_yes stat__click i-bem"]')
    for h in data_list:
        href_list.append(h.get('href'))
    if len(price_list) != 0:
        aver_price = total / len(price_list)
        return aver_price, href_list
    else:
        return 0, []
def assert_transport_pickle(self, transport, response):
    grab = Grab(transport=transport)
    grab2 = grab.clone()
    grab2_data = pickle.dumps(grab2, pickle.HIGHEST_PROTOCOL)
    grab3 = pickle.loads(grab2_data)
    grab3.go(self.server.get_url())
    self.assertEqual(grab3.doc.body, response)
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
def test_put(self):
    g = Grab()
    g.setup(post='abc', url=SERVER.BASE_URL, method='put', debug=True)
    SERVER.REQUEST['debug'] = True
    g.request()
    self.assertEqual(SERVER.REQUEST['method'], 'PUT')
    self.assertEqual(SERVER.REQUEST['headers']['content-length'], '3')
def get_phones_ad(self, ad_url):
    g = Grab()
    if self.proxy_enabled:
        g.setup(proxy=self.proxy_url, proxy_type=self.proxy_type)
    grab_go(g, ad_url)
    xpath = '//div[@class="noactual_adv"]'
    select = g.doc.select(xpath)
    if select.count() == 1:
        logger.info('Объявление удалено.')
        return []
    xpath = '//div[@class="productPage__phoneText js-productPagePhoneLabel"]'
    select = g.doc.select(xpath)
    if select.count() == 0:
        logger.warn('Не нашел кнопки "Показать". xpath="%s".', xpath)
        return []
    data_phone = select.attr('data-phone', None)
    if data_phone is None:
        logger.warn('Телефон не указан.')
        return []
    logger.info('Закодированный в base64 телефон получен: "%s".', data_phone)
    # Decode from base64, then convert the bytes to str
    phone = base64.b64decode(data_phone)
    phone = phone.decode()
    logger.info('Декодированный телефон: "%s".', phone)
    return [phone]
def task_initial(self, grab, task):
    table = grab.xpath('//table[@class="DataGrid"]')
    del table[0]  # Remove table header
    ip_port_list = []
    for tr in table:
        ip = ''
        port = ''
        type = ''
        if u'IPDecode' in tr[0].text_content():
            ip = decode_hex(tr[0].text_content().split('"')[1])
            port = tr[1].text
            type = tr[2].text
            anonymity = tr[3].text
            country = tr[4].text_content()
            ip_port = ip + ':' + port
            ip_port_list.append(ip_port)
    print ip_port_list
    for ip in ip_port_list:
        grab = Grab()
        grab.setup(url='http://www.icanhazip.com')
        grab.setup(proxy=ip, proxy_type='http', connect_timeout=10, timeout=15)
        info = {'server': ip, 'type': 'http'}
        yield Task('proxy_check', grab=grab, info=info)
def test_nobody(self):
    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(nobody=True)
    SERVER.RESPONSE['get'] = 'foo'
    g.go(SERVER.BASE_URL)
    self.assertEqual('', g.response.body)
    self.assertTrue(len(g.response.head) > 0)
class BaseParser(object):
    def __init__(self, rootpage):
        self.rootpage = rootpage
        self.grub = Grab()
        self.grub.setup(timeout=15, connect_timeout=10)

    def g(self):
        return self.grub

    @abstract
    def get_pagelinks(self):
        pass

    @abstract
    def get_company_list(self, pagelink):
        pass

    def parse(self):
        companies = []
        self.g().go(self.rootpage)
        for link in self.get_pagelinks():
            companies += self.get_company_list(link)
            print 'parse'
            time.sleep(uniform(300, 50) / 1000.0)
        return companies
def test_body_maxsize(self):
    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(body_maxsize=100)
    SERVER.RESPONSE['get'] = 'x' * 1024 * 1024
    g.go(SERVER.BASE_URL)
    # Should be less than 50kb
    self.assertTrue(len(g.response.body) < 50000)
def test_empty_useragent_pycurl(self):
    g = Grab(transport=GRAB_TRANSPORT)
    # Empty string disables the default pycurl user-agent
    g.setup(user_agent='')
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers'].get('user-agent', ''), '')
def get_item(self, content_type=None):
    grab = Grab(transport=GRAB_TRANSPORT)
    if content_type is not None:
        grab.setup(content_type=content_type)
    grab.fake_response(XML)
    player = Player(grab.tree)
    return player
class UltimateRewardsGrabber:
    def __init__(self):
        self.g = Grab()

    def grab(self):
        self.g.go(BASE_URL)
        divs = self.g.doc.select('//div[contains(@class, "mn_srchListSection")]')
        for div in divs:
            try:
                merchants = div.text().split('/$')
                for merchant in merchants:
                    merchant = merchant.split('Details ')[1]
                    title = ' '.join(merchant.split(' ')[:-2])
                    cost = merchant.split(' ')[-2]
                    print title, ' - ', cost
            except IndexError:
                pass
            merchant = models.Item(title=title, cost=cost)
            db.session.add(merchant)
            db.session.commit()

    def save(self):
        pass
def translate(word, key, lan1='en', lan2='ru', alt=True, syn=True):
    """Prints the number of counts, word, translation, and example
    from lan1 to lan2 according to Translate.Google."""
    # First, write down a translation in some auxiliary txt file
    # and load it in json format
    g = Grab(log_file='dict.txt')
    link = 'http://translate.google.ru/translate_a/t?client=x&text=' \
        + word + '&sl=' + lan1 + '&tl=' + lan2
    g.go(link)
    data = json.load(open('dict.txt'))
    # Then, let's try to get all the necessary elements in json
    translation, noun, alternatives, synonims = 0, 0, 0, 0
    try:
        translation = data[u'sentences'][0][u'trans']
        noun = data[u'dict'][0][u'pos']
        alternatives = data['dict'][0]['terms']
        synonims = data['dict'][0]['entry'][0]['reverse_translation']
    except:
        pass
    # German nouns should begin with a capital letter
    if lan1 == 'de' and noun == u'имя существительное':
        word = word.title()
    # Finally, print out counts, word, translation with alternatives
    # and synonims, if applicable. Encoding is added up to allow
    # printing in cmd if you have a russian version of Windows
    if translation:
        print('[' + str(key) + ']', word, ': ', translation)
        if alt and alternatives:
            [print(i, end=', ') for i in alternatives]
            print('\r')
        if syn and synonims:
            [print(i.encode('cp866', errors='replace'), end=', ') for i in synonims]
            print('\n')
def SaveImageYandex(text, imageCount, path, w='800', h='600'):
    global prefix
    prefix += 1
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True',
             log_dir='log', headers={'Accept-Language': 'ru,en;q=0.8'})
    query = urllib.urlencode({'text': text.encode('utf-8'), 'iw': w, 'ih': h})
    url = 'http://images.yandex.ru/yandsearch?isize=gt&itype=jpg&' + query
    g.go(url)
    image_number = 0
    f2 = open('out.txt', 'a')
    filename = str(prefix) + '-' + StringForFilename(text) + '.jpg'
    f2.write(filename + '\n')
    f2.close()
    while image_number < imageCount:
        image_number += 1
        tmp = g.doc.select('//html/body/div[2]/div/div[2]/div[2]/div[1]/div[contains(@class, "b-images-item")][' + str(image_number) + ']').attr('onclick')
        match = re.search(r'"fullscreen":\{"url":"(.*?)"', tmp)
        if match:
            image_URL = match.group(1)
            print str(image_number) + '. ' + image_URL
            ext = GetFileExtFromURL(image_URL)
            filename = str(prefix) + '-' + StringForFilename(text) + '-' + str(image_number) + '.jpg'
            try:
                patht = os.path.join(path, filename)
                print patht
                urllib.urlretrieve(image_URL, patht)
            except:
                pass
        else:
            print 'Cant find image for this query ' + str(image_number)
def test_put(self):
    g = Grab()
    g.setup(post="abc", url=SERVER.BASE_URL, method="put", debug=True)
    SERVER.REQUEST["debug"] = True
    g.request()
    self.assertEqual(SERVER.REQUEST["method"], "PUT")
    self.assertEqual(SERVER.REQUEST["headers"]["content-length"], "3")
def just_print(mark):
    g = Grab()
    g.go(mark.url)
    body = g.doc.tree
    title = body.xpath('//*/head/title/text()')
    description = body.xpath('//*/meta[@name="description"]/@content')
    if title == []:
        title = u'Странно, но заголовок отстутствует'
    else:
        title = title[0]
    if description == []:
        description = body.xpath('//*/meta[@property="og:description"]/@content')
        if description == []:
            description = u'Описание отсутствует'
        else:
            description = description[0][0:200]
    else:
        description = description[0][0:200]
    p = re.compile(r"(.*\.\w{2,3})/")
    res = p.findall(mark.url)[0]
    favicon = res + '/favicon.ico'
    print('message from task')
    mark.title = title
    mark.description = description
    mark.favicon = favicon
    mark.save()
def get(dev_eui=getattr(settings, 'DEV_EUI', None),
        token=getattr(settings, 'TOKEN', None), limit=100):
    g = Grab()
    resp = g.go(URL.format(dev_eui=dev_eui, token=token, limit=limit))
    return resp.json
def main(lookFor, jobTitle, company, tag):
    employerHeaderPageId = 1
    questionTextPageId = 0
    g = Grab()
    g.go(p(lookFor, jobTitle, company, tag, employerHeaderPageId))
    employerHeader = g.xpath('//h1').text_content()
    f = open('Glassdoor.com ' + employerHeader + '.txt', 'w')
    f.write(smart_str(employerHeader) + ':\n')
    while True:
        g = Grab()
        questionTextPageId += 1
        g.go(p(lookFor, jobTitle, company, tag, questionTextPageId))
        if int(g.xpath('//li[@class="currPage"]').text) <= (questionTextPageId - 1):
            print 'Finished at page: ' + g.xpath('//li[@class="currPage"]').text + '!'
            break
        for questionText in g.xpath_list('//p[@class="questionText"]'):
            f.write(smart_str(questionText.text_content().strip()) + '\n')
        print 'Page # ' + g.xpath('//li[@class="currPage"]').text + ' parsed!'
def test_session(self):
    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(reuse_cookies=True)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.cookies['foo'], 'bar')
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')
    g.go(SERVER.BASE_URL)
    self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')

    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(reuse_cookies=False)
    SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.cookies['foo'], 'baz')
    g.go(SERVER.BASE_URL)
    self.assertTrue('Cookie' not in SERVER.REQUEST['headers'])

    g = Grab(transport=GRAB_TRANSPORT)
    g.setup(reuse_cookies=True)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.cookies['foo'], 'bar')
    g.clear_cookies()
    g.go(SERVER.BASE_URL)
    self.assertTrue('Cookie' not in SERVER.REQUEST['headers'])
def build_grab(*args, **kwargs):
    """Builds the Grab instance with default options."""
    kwargs.setdefault('transport', GLOBAL['grab_transport'])
    return Grab(*args, **kwargs)
def __grab_data(self):
    g = Grab()
    url = self.url.format(start=self.params['start'], end=self.params['end'])
    resp = g.go(url)
    self.grab_output = xmltodict.parse(resp.body)
def test_xml_with_declaration(self):
    SERVER.RESPONSE['get'] = '<?xml version="1.0" encoding="UTF-8"?><root><foo>foo</foo></root>'
    g = Grab(transport=GRAB_TRANSPORT)
    g.go(SERVER.BASE_URL)
    self.assertTrue(g.xpath_one('//foo').text == 'foo')
        names.append(name)
        dobs.append(dob)
        races.append(race)
    elif img3 != "" and os.path.isfile('dataset/' + img3):
        genders.append(gender)
        imgs.append(img3)
        names.append(name)
        dobs.append(dob)
        races.append(race)

print len(names)
print "GENDERS: "
print set(genders)
print "RACES: "
print set(races)

g = Grab()
output_file = open(sys.argv[2], 'w')
for img, name, dob, race, gender in itertools.izip(imgs, names, dobs, races, genders):
    fields = name.split(' ')
    first = fields[0]
    numNames = len(fields)
    if len(fields[-1]) <= 3 and numNames > 2 and len(fields[numNames - 2]) > 3:
        last = fields[numNames - 2]
    else:
        last = fields[-1]
    lookup = 'http://webapps6.doc.state.nc.us/opi/offendersearch.do?method=list&searchLastName=' + last + '&searchFirstName=' + first + '&searchDOB=' + dob + '&searchDOBRange=0'
    # print lookup
    g.go(lookup)
    if g.doc.text_search(u'Nothing found'):
def main():
    print('\n-- Парсинг афиши Драмтеатра - ' + str(datetime.now()))
    month = {
        'января': '01', 'февраля': '02', 'марта': '03', 'апреля': '04',
        'мая': '05', 'июня': '06', 'июля': '07', 'августа': '08',
        'сентября': '09', 'октября': '10', 'ноября': '11', 'декабря': '12'
    }
    drama = Grab(timeout=20, connect_timeout=20)
    drama.go('http://quicktickets.ru/teatr-dramy-viktora-savina')
    titles = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/h3')
    descriptions = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/div[@class="d"]'
    )
    seanses = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/div[@class="row sessions sessions-near"]'
    )
    now_month = date.today().month
    now_year = date.today().year
    next_year = now_year + 1
    # determine the first date for the database query - check for already loaded dates
    start_date = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/div[@class="row sessions sessions-near"]//a'
    ).text()
    start_date = start_date.replace(',', '').split(' ')
    if now_month in (10, 11, 12) and int(month[start_date[1]]) in (1, 2):
        start_date = date(next_year, int(month[start_date[1]]), int(start_date[0]))
    else:
        start_date = date(now_year, int(month[start_date[1]]), int(start_date[0]))
    exist_date_event = last_date_event('dramakomi', start_date)
    # process the events
    for title, desc, seans in zip(titles, descriptions, seanses):
        for date_time in seans.select('.//a'):
            date_time = date_time.text().replace(',', '').split(' ')
            time = date_time[2]
            if now_month in (10, 11, 12) and int(month[date_time[1]]) in (1, 2):
                date_time = date(next_year, int(month[date_time[1]]), int(date_time[0]))
            else:
                date_time = date(now_year, int(month[date_time[1]]), int(date_time[0]))
            if exist_date_event.count(date_time.strftime("%Y-%m-%d")):
                print(date_time.strftime("%Y-%m-%d") + ' уже есть')
            else:
                event = {
                    'name': title.text(),
                    'date': date_time.strftime("%Y-%m-%d"),
                    'time': time,
                    'type_event': 'teatr',
                    'type_film': '',
                    'price': 0,
                    'source_id': 5,  # drama theater
                    'description': desc.text(),
                    'poster': ''
                }
                write_event_to_db(event)
__author__ = 'ipetrash'

"""The script returns the contents of a gitignore file for the given programming languages"""


if __name__ == '__main__':
    from grab import Grab

    g = Grab()
    lang = input("Input: ")
    g.go("https://www.gitignore.io/api/" + lang)
    print(g.response.body)
from grab import Grab

url = 'https://www.htc.com/tw/'
response = Grab().go(url)
def create_grab_instance(self):
    return Grab(**self.grab_config)
from grab import Grab
import sys

g = Grab()
g.go('http://demo.caffe.berkeleyvision.org/classify_url?imageurl=' + sys.argv[1])
i = 0
for elem in g.doc.select('//ul/li/h4/a'):
    print '%s' % (elem.text())
    i = i + 1
    if i >= 5:
        break
def init_grab(self):
    return Grab(log_dir='log', hammer_mode=True)
def __init__(self, username, password):
    self.g = Grab()
    self.login(username, password)
def task_generator(self):
    for query, tag in settings.QUERY_LIST:
        g = Grab()
        g.setup(url=self.build_query_url(query), content_type='xml')
        yield Task('feed', grab=g, query=query, tag=tag)
# Leninsk-Kuznetsky lat=54&lon=86
lat = '54.65'  # Home 9 86.184826%2C54.681399
lng = '86.18'  # Home 9 lat=54.643689&lon=86.199094
# lon : 86.17, lat : 54.67
# Oryol
lato = '53.0'
lngo = '36'
lang = 'ru'
radius = 50
types = '1,2'
appid = '******************************'
# getapiuri = 'http://narodmon.ru/api/sensorsNearby?lat=54.65&lng=86.18&radius=50&types=1,2&uuid=6ce5e6b78477f27084cc524599fc5930&api_key=09XImZqvP6g6U&lang=ru'
geturi = f'http://narodmon.ru/api/sensorsNearby?lat={lat}&lng={lng}&radius={radius}&uuid={uuid}&api_key={api_key}&lang={lang}'

wing = Grab(timeout=300)
wing.go("https://yandex.ru/pogoda/leninsk-kuznetskiy/details")
# wing.go("https://yandex.ru/pogoda/leninsk-kuznetskiy")
oblak = Grab(timeout=300)
oblak.go("https://yandex.ru/pogoda/leninsk-kuznetskiy")

WeHtm = requests.post(geturi, headers=headers).text
# print(f'http://narodmon.ru/api/sensorsNearby?lat={lat}&lng={lng}&radius={radius}&uuid={uuid}&api_key={api_key}&lang={lang}')

devd = 2
devt = -1
tra = 0
senst = 0
sensd = 1
fact = json.loads(WeHtm)
unit = fact['devices'][devt]['sensors'][senst]['unit']
from grab import Grab
import json
from datetime import datetime, timedelta

g = Grab(connect_timeout=90, timeout=90)
nowTime = datetime.now()
departures = []
arrivals = []
urlNGO = "http://www.centrair.jp/en/flight_information/today/result/"
datePattern = "%Y-%m-%d %H:%M:%S"

GO_TO_GATE = "go to gate"
CHECK_IN = "check-in"
BOARD_SOON = "board soon"
ARRIVING = "arriving"
GATE_CLOSED = "gate closed"
FINAL_CALL = "final call"
ARRIVED = "arrived"
LATE = "late"
LAST_CALL = "last call"
GATE_OPEN = "gate open"
SCHEDULED = "scheduled"
DELAYED = "delayed"
CANCELLED = "cancelled"
CHECKIN = "checkin"
BOARDING = "boarding"
OUTGATE = "outgate"
DEPARTED = "departed"
EXPECTED = "expected"
from grab import Grab, UploadFile
import logging

logging.basicConfig(level=logging.DEBUG)

g = Grab()
g.setup(log_dir='log/grab')
g.go('https://afisha.tut.by/film/', log_file='out.html')
g.setup(post={'hi': u'Превед, яндекс!'})
g.request()
from grab import Grab
import json
from datetime import datetime, timedelta
import re

# 17:20 - 19:20
g = Grab(connect_timeout=90, timeout=90)
g.setup(headers={"X-Requested-With": "XMLHttpRequest"})

CHECK_IN = "check-in"
BOARDING_CLOSED = "boarding closed"
ON_TIME = "on time"
CANCELED = "canceled"
AIRBORNE = "airborne"
ESTIMSTED = "estimated"
ARRIVED = "arrived"
LATE = "late"
LAST_CALL = "last call"
GATE_CLOSED = "gate closed"
FINAL_CALL = "final call"
GATE_OPEN = "gate open"
SCHEDULED = "scheduled"
DELAYED = "delayed"
CANCELLED = "cancelled"
CHECKIN = "checkin"
BOARDING = "boarding"
OUTGATE = "outgate"
DEPARTED = "departed"
EXPECTED = "expected"
from grab import Grab
import logging

logging.basicConfig(level=logging.DEBUG)

g = Grab()
g.go('http://habrahabr.ru')
g.xpath('//h2/a[@class="topic"]').get('href')
print(g.xpath_text('//h2/a[@class="topic"]'))
print(g.css_text('h2 a.topic'))
print('Comments:', g.css_number('.comments .all'))

from urllib.parse import urlsplit
print(', '.join(urlsplit(x.get('href')).netloc for x in g.css_list('.hentry a')
                if 'habrahabr.ru' not in x.get('href') and x.get('href').startswith('http:')))
def test_cookies_parsing(self):
    g = Grab(transport=GRAB_TRANSPORT)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar', '1': '2'}
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.cookies['foo'], 'bar')
from grab import Grab

g = Grab()


def nic_ua():
    prices = {}
    g.go('http://nic.ua/ukr/tariffs.html')
    repl = (
        (',', '.'),           # switch separator for float() conversion
        ('\xa0', ''),         # remove space separating thousands
        ('Безкоштовно', '0.0'),
        ('—', '0.0'))
    for element in g.css_list('.domain-name'):
        tld = element.text_content()  # lower case without dot
        price = element.getparent().getparent().getnext().text_content()
        price = price.strip(' \xa0₴\n')
        for i in repl:
            price = price.replace(*i)
        prices[tld] = float(price)
    return prices


if __name__ == '__main__':
    prices = nic_ua()
    for tld in prices:
        print('{:<20s}{:>8} '.format(tld, prices[tld]))
    print(len(prices))  # 304 TLDs
def test_load_dump(self):
    g = Grab(transport=GRAB_TRANSPORT)
    cookies = {'foo': 'bar', 'spam': 'ham'}
    g.setup(cookies=cookies)
    g.dump_cookies(TMP_FILE)
    self.assertEqual(set(cookies.items()),
                     set(json.load(open(TMP_FILE)).items()))

    # Test non-ascii
    g = Grab(transport=GRAB_TRANSPORT)
    cookies = {'foo': 'bar', 'spam': u'бегемот'}
    g.setup(cookies=cookies)
    g.dump_cookies(TMP_FILE)
    self.assertEqual(set(cookies.items()),
                     set(json.load(open(TMP_FILE)).items()))

    # Test load cookies
    g = Grab(transport=GRAB_TRANSPORT)
    cookies = {'foo': 'bar', 'spam': u'бегемот'}
    json.dump(cookies, open(TMP_FILE, 'w'))
    g.load_cookies(TMP_FILE)
    self.assertEqual(set(g.config['cookies'].items()),
                     set(cookies.items()))
def get_check_solution_request(self, captcha_id):
    params = {'key': self.api_key, 'action': 'get', 'id': captcha_id}
    url = 'http://antigate.com/res.php?%s' % urlencode(params)
    g = Grab()
    g.setup(url=url)
    return g
def logWebPages(attribute):
    g = Grab()
    g.go('http://horo.mail.ru/prediction/' + attribute + '/today/',
         log_file='logs/' + attribute + '.txt')
def setUp(self):
    # Create fake grab instance with fake response
    self.g = Grab(HTML, charset='cp1251')
from grab import Grab
import pyexcel

url = 'http://ruticker.com/ReportTopOrders?ticker=siz4&bigPeriod=1'
g = Grab()
g.setup(post={'username': "******", 'password': "******"})
g.go(url)
a = []
b = []
for i in g.doc.select("//tr/td"):
    a.append(i.text())
with open('big_trades.xls', 'wt') as f:
    for elem in a:
        b.append(' '.join(str(el) for el in a[:5]))
        f.write(''.join(','.join(str(el) for el in a[:5])))
        f.write(u'\n')
        del a[:5]
    result["estimated"] = estimated
    if actual != "":
        result["actual"] = actual
    if gate != "":
        result["gate"] = gate
    if check_in_desks != "":
        result["check_in_desks"] = check_in_desks
    return result


def getTimeStampFromDateTime(datetime):
    return int(time.mktime(datetime.timetuple()) + datetime.microsecond / 1E6)


# ARRIVE
g = Grab(connect_timeout=90, timeout=90)
urlARH = "http://arhaero.ru/ajaxonlinetablo.php"

# Yesterday
yesterdayTime = nowTime - timedelta(days=1)
yesterdayTimeStamp = getTimeStampFromDateTime(yesterdayTime)
todayArrivalsParams = {"date": yesterdayTimeStamp, "type": "arrival"}
resp = g.go(urlARH, post=todayArrivalsParams)
i = 0
for el in g.doc.select("//table/tbody/tr"):
    i += 1
    if i == 1:
        continue
def test_flask2(self):
    g = Grab(transport=GRAB_TRANSPORT)
    g.go(BASE_URL + '?foo=5')
    self.assertEqual(REQUEST['args']['foo'], '5')
import os
import sys
import json
from time import sleep
import random
from urllib import parse
from grab import Grab

base_url = 'https://api.crossref.org/works?'
# filter for parsing only journal articles that have abstracts and were published until 2019
# for more information about filters see the manual: https://github.com/CrossRef/rest-api-doc#filter-names
filter = 'filter=type:journal-article,has-abstract:t,until-pub-date:2019'
ppath = os.path.dirname(sys.argv[0])  # path to the script location
paper_collection = ppath + '/../texts/papers_crossref.txt'  # path to the file for saving parsed data
ud_dois = ppath + '/../texts/ud_dois.txt'  # path to the file with the list of DOIs of saved papers

# initialization and settings of the grab object
g = Grab(log_file=ppath + '/../temp/out_crossref.html')
g.setup(cookiefile=ppath + '/../temp/cookies_pars.txt', reuse_referer='True', timeout=120)
g.setup(user_agent='CitePrediction/0.1_alpha; mailto:[email protected]')

# read used DOIs from the file and build a list of them, so already retrieved papers are excluded
used_dois = []
with open(ud_dois, 'r') as dois:
    used_dois = dois.readlines()
used_dois = [x.strip() for x in used_dois]

# url = base_url + filter + '&rows=1000' + '&cursor=*'  # parse papers sequentially; re-fetches the same DOIs on every run if the cursor is not saved
url = base_url + filter + '&sample=100'  # random sample of papers from the database; the more papers are parsed, the more duplicates are fetched

# first batch of papers