Example #1
def parse():

    g = Grab()
    base_url = 'https://www.buzzfeed.com'
    appendix_1 = '/?p='
    topics = ['world', 'politics', 'business', 'lgbt', 'tech', 'science', 'music', 'animals', 'travel', 'style', 'sports']

    data = {}
    for topic in topics:
        articles_list = []
        for page in range(1, 10):
            time.sleep(0.2)
            g.go(base_url + '/' + topic + appendix_1 + str(page))
            urls = getPageUrls(g.response.body)
            for url in urls:
                g.go(base_url + url)
                article = getArticle(g.response.body)
                if len(article) > 1:
                    articles_list.append(article)
        data.update({topic: articles_list})

    data_size = 0
    for topic in data.keys():
        data_size += len(data[topic])
    print "{} articles in {} topics".format(data_size, len(data))
Example #2
def login_test():
    g = Grab(log_file="1.html")
    g.go("http://m.avito.ru/profile")
    g.doc.set_input("login","login")
    g.doc.set_input("password","password")
    g.doc.submit()
    g.cookies.save_to_file('cookies.txt')
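The cookies saved here can be loaded back in a later session (Example #14 below does exactly that); a minimal sketch:

# Sketch: reuse the session saved by login_test()
g = Grab()
g.load_cookies('cookies.txt')
g.go("http://m.avito.ru/profile")  # the request carries the stored cookies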
Example #3
File: main.py  Project: bilabon/EasyParser
 def get_source_page(self, search_text):
     """Getting a source page by given search parameter"""
     grab = Grab()
     grab.go(u"https://ya.ru/")
     grab.doc.set_input(u"text", search_text)
     grab.doc.submit()
     return grab.response.body
Example #4
    def test_useragent_simple(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # Simple case: set up the user agent manually
        g.setup(user_agent='foo')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['user-agent'], 'foo')
Example #5
File: smstwo.py  Project: stden/colortek
 def forge_request(self, **kwargs):
     g = Grab()
     data = {
         'user': self.user,
         'pass': self.password,
     }
     url = "%ssms.cgi" % SMS_TWO_PREFIX
     if 'url' in kwargs:
         url = '%s%s' % (SMS_TWO_PREFIX, kwargs['url'])
         del kwargs['url']
     if 'frm' in kwargs:
         data.update({'from': kwargs['frm']})
         del kwargs['frm']
     data.update(kwargs)
     # Leftover from the old urllib2 implementation, kept for reference:
     # post = urllib.urlencode(data)
     # request = urllib2.Request(url, post)
     # Grab implementation:
     g.setup(post=data)
     try:
         self._response = g.go(url)
     except GrabError, e:
         self.http_error = {
             'code': e[0],
             'content': e[1]
         }
Example #6
def get_links(page, grab_=None):
    if grab_ is None:
        grab_ = Grab()
    grab_.go(page)
    return [
        'http://tagbrand.com%s' % link.attr('href')
        for link in grab_.doc.select('//dl[*]/dd/p[1]/a')]
Example #7
def parse_famous(year, month, day):
    '''
    Parse famous people from famousbirthdays.com by month and day.
    The year argument is currently ignored.
    '''
    months = get_months()
    url = 'http://www.famousbirthdays.com/%s%d.html' % (months[month], day)

    g = Grab()
    g.setup()
    g.go(url)

    elements = g.doc.select('//ul[@class="top-celebrity-col4 col1"]/li')
    results = []  # avoid shadowing the built-in name "list"

    for element in elements:
        src = element.node.getchildren()[1].getchildren()[0].getchildren()[0].get('src')
        age = element.node.getchildren()[2].getchildren()[0].text_content().split(' ')[-1]
        name = element.node.getchildren()[2].getchildren()[0].getchildren()[0].text_content()
        description = element.node.getchildren()[2].getchildren()[1].text_content()

        results.append({'src': src, 'name': name, 'age': age, 'description': description})

    return results
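The getchildren() index chains above break as soon as the markup shifts; a sketch of the same extraction using relative XPath selects per list item (the inner img/a structure is an assumption, so the expressions are illustrative only):

# Illustrative only: relative selects instead of positional getchildren() chains
for element in g.doc.select('//ul[@class="top-celebrity-col4 col1"]/li'):
    src = element.select('.//img').attr('src')
    name = element.select('.//a').text()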
Example #8
File: parser.py  Project: Firik/4pda-parser
def parse(last_page=1):
    i = 1
    print('Last page is {0}'.format(last_page))
    for x in range(1, last_page + 1):
        main_domain = 'http://4pda.ru/page/{0}/'.format(x)

        g = Grab()
        g.go(main_domain)
        nodes = g.doc.select('//article[@class="post"]').node_list()
        if nodes:
            try:
                f = open('4pda.csv', 'x')
                writer = csv.writer(f)
                writer.writerow(['№', 'Заголовок', 'Дата публикации', 'Ссылка'])  # No., Title, Publication date, Link
            except FileExistsError:
                f = open('4pda.csv', 'a')
                writer = csv.writer(f)
            finally:
                for n, node in enumerate(nodes):
                    header = node.xpath('//div[@class="description"]//h1//span')
                    links = node.xpath('//div[@class="description"]//h1//a')
                    dates = node.xpath('//div//div//div//em')
                    writer.writerow([
                        i,
                        header[n].text,
                        dates[n].text,
                        links[n].attrib['href']
                    ])
                    i += 1
                f.close()
                print(x)
        else:
            return 'Posts not found'
    return 'Job done.'
Example #9
def getModelLink(modelName):
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True', log_dir='log', headers={'Accept-Language':    'ru,en;q=0.8'})
    url = 'http://market.yandex.ru/'
    g.go(url)
    try:
        paginatorHTML = g.doc.select(popt['pagination']).html()
        pagesLinks = GetAllLinksFromString(paginatorHTML, url)
    except Exception:
        pagesLinks = []
    pagesLinks.append(url)
    pagesLinks = list(set(pagesLinks))
    pagesCount = len(pagesLinks)
    newPagesCount = 1
    while pagesCount != newPagesCount:
        # Refresh the count each pass; the original never updated it,
        # so the loop could never converge once new links appeared
        pagesCount = len(pagesLinks)
        url = pagesLinks[-1]
        g.go(url)
        try:
            paginatorHTML = g.doc.select(popt['pagination']).html()
            newlinks = GetAllLinksFromString(paginatorHTML, url)
        except Exception:
            newlinks = []
        pagesLinks.extend(newlinks)
        pagesLinks = list(set(pagesLinks))
        newPagesCount = len(pagesLinks)
    return pagesLinks
Example #10
    def check_following(self, url, token_id, loyalty_id):
        follow = False
        self.refresh_token(token_id)
        soc_token = SocToken.query.get(token_id)
        action = PaymentLoyalty.query.get(loyalty_id)
        target = json.loads(action.data)

        g = Grab()
        g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})
        url_api = self.API_PATH + self.API_PARTS['subscriptions']

        while not follow:
            g.go(url_api)
            subscriptions = json.loads(g.response.body)

            if 'items' not in subscriptions:
                break
            if len(subscriptions['items']) <= 0:
                break

            for subscribe in subscriptions['items']:
                if 'snippet' in subscribe and 'channelId' in subscribe['snippet'] and subscribe['snippet']['channelId'] == target['channelId']:
                    follow = True

            if 'nextPageToken' not in subscriptions:
                break
            if len(subscriptions['nextPageToken']) <= 0:
                break

            url_api = "%s%s&pageToken=%s" % (
                self.API_PATH,
                self.API_PARTS['subscriptions'],
                subscriptions['nextPageToken'])

        return follow
Example #11
File: pitchfork.py  Project: thzvm/Python
def start():

    CSVFile(header=['Artist', 'Album', 'Genre', 'Style', 'Year', 'Rating'])
    page = 1
    page_not_found = None
    while page_not_found is None:

        try:
            print('Page', page)

            pitchfork_page = Grab()
            pitchfork_page.go(PITC_URL + str(page))
            soup = Soup(pitchfork_page.doc.select('//div[@id="main"]/ul[@class="object-grid "]').html(), 'lxml')
            albums_on_page = []

            for link in soup.find_all('a', href=True):
                albums_on_page.append('http://pitchfork.com' + link['href'])

            pool = ThreadPool(THREADS)

            pool.map(pitchfork, albums_on_page)

            page += 1

            # if page > 1:
            #   page_not_found = True

        except IndexError as error:
            print(error)
            page_not_found = True
Example #12
def prepare_and_create_grab(url):

    cache_name = split_url_by_volume_and_chapter(url)
    dir_name = cache_name[0]
    file_name = cache_name[1] + '.html'
    file_path = os.path.join(generate_info_ranobe.DIR_RANOBE, 'cache', dir_name, file_name)
    data = None

    if not os.path.exists(os.path.dirname(file_path)):
        os.makedirs(os.path.dirname(file_path))

    if not os.path.exists(file_path):
        g = Grab()
        g.go(url)
        text = g.response.body
        if isinstance(text, bytes):  # grab may return bytes; the cache file is opened in text mode
            text = text.decode('utf8')
        with open(file_path, mode='w', encoding='utf8') as f:
            f.write(text)
        data = text

    if not data:
        with open(file_path, encoding='utf8') as f:
            data = f.read()

    return Grab(data)
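A usage sketch of the caching helper above; passing a document body straight to the Grab constructor also appears in Example #56, so the cached HTML can be queried without a new network round trip (URL and selector are placeholders):

# Placeholder chapter URL; anything split_url_by_volume_and_chapter() understands works
g = prepare_and_create_grab('http://example.com/ranobe/v1/c2')
print(g.doc.select('//title').text())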
Example #13
    def __init__(self, steam_account, login_steam, pass_steam, code_link):
        Grab.__init__(self)
        # self.base_page = BasePage(self)
        self.steam_account = steam_account
        self.login_steam = login_steam
        self.pass_steam = pass_steam
        self.code_link = code_link
        self.steam_id = None
        self.session_id = None

        cookiefile = '../cookies/' + login_steam + '.txt'
        self.setup(
            headers={
                'Accept': "text/javascript, text/html, application/xml, text/xml, */*",
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
                'X-Prototype-Version': '1.7',
                'X-Requested-With': 'XMLHttpRequest'
            },
            cookiefile=cookiefile,
            reuse_cookies=True,
            debug_post=True,
            log_file='../log_steam_account/log_' + str(self.login_steam) + '.html'
        )
Example #14
def add_advert():
    print("Add new advertisement.")
    g = Grab(log_file="2.html")
    g.load_cookies('cookies.txt')
    g.go("http://m.avito.ru/add")
    #login_test()
    from selenium.webdriver import Firefox
    from selenium.webdriver.common.keys import Keys
    import selenium
    from PIL import Image

    # A single browser instance is enough; the original opened two Firefox
    # windows, navigating in one and searching for elements in the other.
    browser = Firefox()

    browser.get('http://m.avito.ru/profile/login')
    browser.implicitly_wait(10)
    elem = browser.find_element_by_css_selector(".control-self.control-self-email")
    elem.send_keys("*****@*****.**")
    """
    driver.find_element_by_name("password")
    element.send_keys("ivveqaem")
    driver.find_element_by_class_name("control-self control-self-submit button button-solid button-blue button-large")
    driver.find_element_by_partial_link_text("Войти")
    element.send_keys(Keys.ENTER)
"""
    #browser.get('http://m.avito.ru/add')
    browser.save_screenshot('current_page.png')
    current_page_img = Image.open('current_page.png')
    w, h = current_page_img.size
    captcha_img = current_page_img#.crop((575, 505, w-155, h-1820))
    captcha_img.save('captcha.jpg', 'jpeg')
Example #15
File: pars.py  Project: esha-/esca
def get_data(url):
    '''
    Get data (price and offer hrefs) from Yandex Realty with client parameters
    '''
    #print(url)

    price_list = []
    href_list = []

    g = Grab()
    g.go(url)

    # search html class with price
    data_list = g.xpath_list('//*[@class="serp-item__price"]')
    total = 0
    for p in data_list:
        price = price_format(p.text_content())
        total += price
        price_list.append(price)
    
    # search html class with href
    data_list = g.xpath_list('//*[@class="link link_redir_yes stat__click i-bem"]')
    for h in data_list:
        href_list.append(h.get('href'))

    if len(price_list) != 0:
        aver_price = total / len(price_list)
        return aver_price, href_list
    else:
        return 0, []
Example #16
 def assert_transport_pickle(self, transport, response):
     grab = Grab(transport=transport)
     grab2 = grab.clone()
     grab2_data = pickle.dumps(grab2, pickle.HIGHEST_PROTOCOL)
     grab3 = pickle.loads(grab2_data)
     grab3.go(self.server.get_url())
     self.assertEqual(grab3.doc.body, response)
Example #17
File: speed_grab.py  Project: ArturFis/grab
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
Example #18
 def test_put(self):
     g = Grab()
     g.setup(post='abc', url=SERVER.BASE_URL, method='put', debug=True)
     SERVER.REQUEST['debug'] = True
     g.request()
     self.assertEqual(SERVER.REQUEST['method'], 'PUT')
     self.assertEqual(SERVER.REQUEST['headers']['content-length'], '3')
Example #19
    def get_phones_ad(self, ad_url):
        g = Grab()
        if self.proxy_enabled:
            g.setup(proxy=self.proxy_url, proxy_type=self.proxy_type)

        grab_go(g, ad_url)

        xpath = '//div[@class="noactual_adv"]'
        select = g.doc.select(xpath)
        if select.count() == 1:
            logger.info('Объявление удалено.')  # "The ad has been removed."
            return []

        xpath = '//div[@class="productPage__phoneText js-productPagePhoneLabel"]'
        select = g.doc.select(xpath)
        if select.count() == 0:
            logger.warning('Не нашел кнопки "Показать". xpath="%s".', xpath)  # 'Could not find the "Show" button.'
            return []

        data_phone = select.attr('data-phone', None)
        if data_phone is None:
            logger.warning('Телефон не указан.')  # "Phone number not specified."
            return []

        logger.info('Закодированный в base64 телефон получен: "%s".', data_phone)  # "Received the base64-encoded phone number."

        # Decode from base64, then convert to str
        phone = base64.b64decode(data_phone)
        phone = phone.decode()

        logger.info('Декодированный телефон: "%s".', phone)  # "Decoded phone number."
        return [phone]
Example #20
    def task_initial(self, grab, task):
        table = grab.xpath('//table[@class="DataGrid"]')
        del table[0]    # Remove table header

        ip_port_list = []
        for tr in table:
            ip = ''
            port = ''
            type = ''
            if u'IPDecode' in tr[0].text_content():
                ip = decode_hex(tr[0].text_content().split('"')[1])
                port = tr[1].text
                type = tr[2].text
                anonymity = tr[3].text
                country = tr[4].text_content()
                ip_port = ip + ':' + port
                ip_port_list.append(ip_port)

        print ip_port_list

        for ip in ip_port_list:
            grab = Grab()
            grab.setup(url='http://www.icanhazip.com')
            grab.setup(proxy=ip, proxy_type='http', connect_timeout=10, timeout=15)
            info = {'server': ip, 'type': 'http'}
            yield Task('proxy_check', grab=grab, info=info)
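The 'proxy_check' handler itself is not part of this snippet. A hypothetical sketch, following the Spider conventions used above (the body check is an assumption; icanhazip.com answers with the caller's IP):

# Hypothetical handler for the Task yielded above
def task_proxy_check(self, grab, task):
    ip = grab.response.body.strip()
    if ip and task.info['server'].startswith(ip):
        print 'Working proxy:', task.info['server']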
Example #21
 def test_nobody(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.setup(nobody=True)
     SERVER.RESPONSE['get'] = 'foo'
     g.go(SERVER.BASE_URL)
     self.assertEqual('', g.response.body)
     self.assertTrue(len(g.response.head) > 0)
Example #22
class BaseParser(object):
	def __init__(self, rootpage):
		self.rootpage = rootpage
		self.grub = Grab()
		self.grub.setup(timeout = 15, connect_timeout = 10)

	def g(self):
		return self.grub

	@abstract
	def get_pagelinks(self):
		pass

	@abstract
	def get_company_list(self, pagelink):
		pass

	def parse(self):
		companies = []
		self.g().go(self.rootpage)
		for link in self.get_pagelinks():
			companies += self.get_company_list(link)
			print 'parse'
			time.sleep(uniform(50, 300) / 1000.0)  # pause 50-300 ms between pages
		return companies
Example #23
 def test_body_maxsize(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.setup(body_maxsize=100)
     SERVER.RESPONSE['get'] = 'x' * 1024 * 1024
     g.go(SERVER.BASE_URL)
     # Should be less than 50kb
     self.assertTrue(len(g.response.body) < 50000)
Example #24
    def test_empty_useragent_pycurl(self):
        g = Grab(transport=GRAB_TRANSPORT)

        # An empty string disables the default pycurl user-agent
        g.setup(user_agent='')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers'].get('user-agent', ''), '')
Example #25
File: item.py  Project: mjhea0/grab
 def get_item(self, content_type=None):
     grab = Grab(transport=GRAB_TRANSPORT)
     if content_type is not None:
         grab.setup(content_type=content_type)
     grab.fake_response(XML)
     player = Player(grab.tree)
     return player
Example #26
File: scrape.py  Project: EvilDmitri/Jasper
class UltimateRewardsGrabber:

    def __init__(self):
        self.g = Grab()

    def grab(self):
        self.g.go(BASE_URL)
        divs = self.g.doc.select('//div[contains(@class, "mn_srchListSection")]')
        for div in divs:
            try:
                merchants = div.text().split('/$')
                for merchant in merchants:
                    merchant = merchant.split('Details ')[1]
                    title = ' '.join(merchant.split(' ')[:-2])
                    cost = merchant.split(' ')[-2]
                    print title, ' - ', cost
                    # Save every parsed merchant; previously only the last one
                    # per block was saved, and a failed first block raised NameError
                    item = models.Item(title=title, cost=cost)
                    db.session.add(item)
            except IndexError:
                pass
        db.session.commit()


    def save(self):
        pass
Example #27
def translate(word, key, lan1='en', lan2='ru', alt=True, syn=True):
    """Prints the number of counts, word, translation, and example
    from lan1 to lan2 according to Translate.Google."""
    # First, write down a translation in some auxiliary txt file
    # and load it in json format
    g = Grab(log_file='dict.txt')
    link = 'http://translate.google.ru/translate_a/t?client=x&text='\
           + word + '&sl=' + lan1 + '&tl=' + lan2
    g.go(link)
    data = json.load(open('dict.txt'))
    # Then, let's try to get all the necessary elements in json
    translation, noun, alternatives, synonyms = 0, 0, 0, 0
    try:
        translation = data[u'sentences'][0][u'trans']
        noun = data[u'dict'][0][u'pos']
        alternatives = data['dict'][0]['terms']
        synonyms = data['dict'][0]['entry'][0]['reverse_translation']
    except (KeyError, IndexError):
        pass
    # German nouns should begin with a capital letter
    if lan1 == 'de' and noun == u'имя существительное':  # "noun" in Russian
        word = word.title()
    # Finally, print out counts, word, translation with alternatives
    # and synonyms, if applicable. Encoding is added to allow
    # printing in cmd if you have a Russian version of Windows
    if translation:
        print('[' + str(key) + ']', word, ': ', translation)
        if alt and alternatives:
            for i in alternatives:
                print(i, end=', ')
            print('\r')
        if syn and synonyms:
            for i in synonyms:
                print(i.encode('cp866', errors='replace'), end=', ')
            print('\n')
Example #28
def SaveImageYandex(text, imageCount, path, w='800', h='600'):
    global prefix
    prefix += 1
    g = Grab(connect_timeout=5, userpwd='user:pass', debug_post='True', log_dir='log', headers={'Accept-Language':    'ru,en;q=0.8'})
    query = urllib.urlencode({'text': text.encode('utf-8'), 'iw': w, 'ih': h})
    url = 'http://images.yandex.ru/yandsearch?isize=gt&itype=jpg&'+query
    g.go(url)
    image_number = 0
    f2 = open('out.txt', 'a')
    filename = str(prefix) + '-' + StringForFilename(text) + '.jpg'
    f2.write(filename + '\n')
    f2.close()
    while image_number < imageCount:
        image_number += 1
        tmp = g.doc.select('//html/body/div[2]/div/div[2]/div[2]/div[1]/div[contains(@class, "b-images-item")]['
                           + str(image_number) + ']').attr('onclick')
        match = re.search(r'"fullscreen":\{"url":"(.*?)"', tmp)
        if match:
            image_URL = match.group(1)
            print str(image_number) + '. ' + image_URL
            ext = GetFileExtFromURL(image_URL)
            filename = str(prefix) + '-' + StringForFilename(text) + '-' + str(image_number) + '.jpg'
            try:
                patht = os.path.join(path, filename)
                print patht
                urllib.urlretrieve(image_URL, patht)
            except Exception:
                pass
        else:
            print 'Cannot find an image for query item ' + str(image_number)
Example #29
 def test_put(self):
     g = Grab()
     g.setup(post="abc", url=SERVER.BASE_URL, method="put", debug=True)
     SERVER.REQUEST["debug"] = True
     g.request()
     self.assertEqual(SERVER.REQUEST["method"], "PUT")
     self.assertEqual(SERVER.REQUEST["headers"]["content-length"], "3")
Example #30
def just_print(mark):
    g = Grab()
    g.go(mark.url)
    body = g.doc.tree
    title = body.xpath('//*/head/title/text()')
    description = body.xpath('//*/meta[@name="description"]/@content')

    if title == []:
        title = u'Странно, но заголовок отсутствует'  # "Strangely, the title is missing"
    else:
        title = title[0]

    if description == []:
        description = body.xpath('//*/meta[@property="og:description"]/@content')
        if description == []:
            description = u'Описание отсутствует'  # "Description is missing"
        else:
            description = description[0][0:200]
    else:
        description = description[0][0:200]

    p = re.compile("(.*\.\w{2,3})/")
    res = p.findall(mark.url)[0]
    favicon = res+'/favicon.ico'
    print('message from task')

    mark.title = title
    mark.description = description
    mark.favicon = favicon
    mark.save()
Example #31
File: crawler.py  Project: garncarz/iot
def get(dev_eui=getattr(settings, 'DEV_EUI', None),
        token=getattr(settings, 'TOKEN', None),
        limit=100):
    g = Grab()
    resp = g.go(URL.format(dev_eui=dev_eui, token=token, limit=limit))
    return resp.json
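go() returns the response object, and its .json property holds the parsed body; a usage sketch with placeholder credentials (real values come from Django-style settings):

# Placeholder arguments for illustration only
readings = get(dev_eui='0004A30B001C1234', token='secret-token', limit=10)
print(readings)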
Example #32
def main(lookFor, jobTitle, company, tag):
    employerHeaderPageId = 1
    questionTextPageId = 0
    g = Grab()
    g.go(p(lookFor, jobTitle, company, tag, employerHeaderPageId))
    employerHeader = g.xpath('//h1').text_content()
    f = open('Glassdoor.com ' + employerHeader + '.txt', 'w')
    f.write(smart_str(employerHeader) + ':\n')
    while True:
        g = Grab()
        questionTextPageId += 1
        g.go(p(lookFor, jobTitle, company, tag, questionTextPageId))
        if int(g.xpath('//li[@class="currPage"]').text) <= (
                questionTextPageId - 1):
            print 'Finished at page: ' + g.xpath(
                '//li[@class="currPage"]').text + '!'
            break
        for questionText in g.xpath_list('//p[@class="questionText"]'):
            f.write(smart_str(questionText.text_content().strip()) + '\n')
        print 'Page # ' + g.xpath('//li[@class="currPage"]').text + ' parsed!'
Example #33
File: cookies.py  Project: artemzi/grab
    def test_session(self):
        g = Grab(transport=GRAB_TRANSPORT)
        g.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.cookies['foo'], 'bar')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')
        g.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')

        g = Grab(transport=GRAB_TRANSPORT)
        g.setup(reuse_cookies=False)
        SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.cookies['foo'], 'baz')
        g.go(SERVER.BASE_URL)
        self.assertTrue('Cookie' not in SERVER.REQUEST['headers'])

        g = Grab(transport=GRAB_TRANSPORT)
        g.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.cookies['foo'], 'bar')
        g.clear_cookies()
        g.go(SERVER.BASE_URL)
        self.assertTrue('Cookie' not in SERVER.REQUEST['headers'])
Example #34
File: util.py  Project: sn-donbenjamin/grab
def build_grab(*args, **kwargs):
    """Builds the Grab instance with default options."""
    kwargs.setdefault('transport', GLOBAL['grab_transport'])
    return Grab(*args, **kwargs)
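kwargs.setdefault() fills in the transport only when the caller has not passed one, so explicit arguments still win; a short usage sketch (the explicit transport value is illustrative):

g = build_grab()                       # transport defaults to GLOBAL['grab_transport']
g2 = build_grab(transport='urllib3')   # an explicitly passed transport wins over the default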
Example #35
 def __grab_data(self):
     g = Grab()
     url = self.url.format(start=self.params['start'], end=self.params['end'])
     resp = g.go(url)
     self.grab_output = xmltodict.parse(resp.body)
Example #36
 def test_xml_with_declaration(self):
     SERVER.RESPONSE['get'] = '<?xml version="1.0" encoding="UTF-8"?><root><foo>foo</foo></root>'
     g = Grab(transport=GRAB_TRANSPORT)
     g.go(SERVER.BASE_URL)
     self.assertTrue(g.xpath_one('//foo').text == 'foo')
Example #37
                names.append(name)
                dobs.append(dob)
                races.append(race)
            elif img3 != "" and os.path.isfile('dataset/' + img3):
                genders.append(gender)
                imgs.append(img3)
                names.append(name)
                dobs.append(dob)
                races.append(race)
print len(names)
print "GENDERS: "
print set(genders)
print "RACES: "
print set(races)

g = Grab()
output_file = open(sys.argv[2], 'w')

for img, name, dob, race, gender in itertools.izip(imgs, names, dobs, races,
                                                   genders):
    fields = name.split(' ')
    first = fields[0]
    numNames = len(fields)
    if len(fields[-1]) <= 3 and numNames > 2 and len(fields[numNames - 2]) > 3:
        last = fields[numNames - 2]
    else:
        last = fields[-1]
    lookup = 'http://webapps6.doc.state.nc.us/opi/offendersearch.do?method=list&searchLastName=' + last + '&searchFirstName=' + first + '&searchDOB=' + dob + '&searchDOBRange=0'
    #print lookup
    g.go(lookup)
    if g.doc.text_search(u'Nothing found'):
Example #38
def main():
    print('\n-- Парсинг афиши Драмтеатра -  ' + str(datetime.now()))  # "Parsing the Drama Theater playbill"

    month = {
        'января': '01',
        'февраля': '02',
        'марта': '03',
        'апреля': '04',
        'мая': '05',
        'июня': '06',
        'июля': '07',
        'августа': '08',
        'сентября': '09',
        'октября': '10',
        'ноября': '11',
        'декабря': '12'
    }

    drama = Grab(timeout=20, connect_timeout=20)
    drama.go('http://quicktickets.ru/teatr-dramy-viktora-savina')

    titles = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/h3')
    descriptions = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/div[@class="d"]'
    )
    seanses = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/div[@class="row sessions sessions-near"]'
    )

    now_month = date.today().month
    now_year = date.today().year
    next_year = now_year + 1

    # compute the first date for the DB query - a check against already-loaded dates
    start_date = drama.doc.select(
        '//div[@id="events-list"]//div[@class="item"]//div[@class="c"]/div[@class="row sessions sessions-near"]//a'
    ).text()
    start_date = start_date.replace(',', '').split(' ')
    if now_month in (10, 11, 12) and int(month[start_date[1]]) in (1, 2):
        start_date = date(next_year, int(month[start_date[1]]),
                          int(start_date[0]))
    else:
        start_date = date(now_year, int(month[start_date[1]]),
                          int(start_date[0]))
    exist_date_event = last_date_event('dramakomi', start_date)

    # process the events
    for title, desc, seans in zip(titles, descriptions, seanses):
        for date_time in seans.select('.//a'):
            date_time = date_time.text().replace(',', '').split(' ')
            time = date_time[2]
            if now_month in (10, 11, 12) and int(
                    month[date_time[1]]) in (1, 2):
                date_time = date(next_year, int(month[date_time[1]]),
                                 int(date_time[0]))
            else:
                date_time = date(now_year, int(month[date_time[1]]),
                                 int(date_time[0]))

            if exist_date_event.count(date_time.strftime("%Y-%m-%d")):
                print(date_time.strftime("%Y-%m-%d") + ' уже есть')  # "already present"
            else:
                event = {
                    'name': title.text(),
                    'date': date_time.strftime("%Y-%m-%d"),
                    'time': time,
                    'type_event': 'teatr',
                    'type_film': '',
                    'price': 0,
                    'source_id': 5,  # drama theater
                    'description': desc.text(),
                    'poster': ''
                }

                write_event_to_db(event)
Example #39
__author__ = 'ipetrash'
"""Скрипт возвращает содержимое gitignore для языков программирования"""

if __name__ == '__main__':
    from grab import Grab
    g = Grab()

    lang = input("Input: ")
    g.go("https://www.gitignore.io/api/" + lang)
    print(g.response.body)
Example #40
from grab import Grab
url = 'https://www.htc.com/tw/'
response = Grab().go(url)

Example #41
 def create_grab_instance(self):
     return Grab(**self.grab_config)
Example #42
from grab import Grab
import sys

g = Grab()

g.go('http://demo.caffe.berkeleyvision.org/classify_url?imageurl=' +
     sys.argv[1])
i = 0
for elem in g.doc.select('//ul/li/h4/a'):
    print '%s' % (elem.text())
    i = i + 1
    if i >= 5:
        break
Example #43
 def init_grab(self):
     return Grab(log_dir='log', hammer_mode=True)
Example #44
 def __init__(self, username, password):
     self.g = Grab()
     self.login(username, password)
Example #45
 def task_generator(self):
     for query, tag in settings.QUERY_LIST:
         g = Grab()
         g.setup(url=self.build_query_url(query), content_type='xml')
         yield Task('feed', grab=g, query=query, tag=tag)
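The matching 'feed' handler is not shown; a hypothetical sketch, assuming the extra Task keyword arguments (query, tag) become attributes on the task object as usual in grab's Spider:

# Hypothetical handler for the Task yielded above
def task_feed(self, grab, task):
    for title in grab.doc.select('//item/title'):  # RSS-style feed layout assumed
        print(task.tag, task.query, title.text())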
Example #46
# Leninsk-Kuznetsky     lat=54&lon=86
lat = '54.65'  # Home 9  86.184826%2C54.681399
lng = '86.18'  # Home 9  lat=54.643689&lon=86.199094
#    lon : 86.17,   lat : 54.67
# Oryol
lato = '53.0'
lngo = '36'

lang = 'ru'
radius = 50
types = '1,2'
appid = '******************************'

#getapiuri = 'http://narodmon.ru/api/sensorsNearby?lat=54.65&lng=86.18&radius=50&types=1,2&uuid=6ce5e6b78477f27084cc524599fc5930&api_key=09XImZqvP6g6U&lang=ru'
geturi = f'http://narodmon.ru/api/sensorsNearby?lat={lat}&lng={lng}&radius={radius}&uuid={uuid}&api_key={api_key}&lang={lang}'
wing = Grab(timeout=300)
wing.go("https://yandex.ru/pogoda/leninsk-kuznetskiy/details")
#wing.go("https://yandex.ru/pogoda/leninsk-kuznetskiy")
oblak = Grab(timeout=300)
oblak.go("https://yandex.ru/pogoda/leninsk-kuznetskiy")

WeHtm = requests.post(geturi, headers=headers).text
#print (f'http://narodmon.ru/api/sensorsNearby?lat={lat}&lng={lng}&radius={radius}&uuid={uuid}&api_key={api_key}&lang={lang}')

devd = 2
devt = -1
tra = 0
senst = 0
sensd = 1
fact = json.loads(WeHtm)
unit = fact['devices'][devt]['sensors'][senst]['unit']
Example #47
from grab import Grab
import json
from datetime import datetime, timedelta

g = Grab(connect_timeout=90, timeout=90)
nowTime = datetime.now()
departures = []
arrivals = []
urlNGO = "http://www.centrair.jp/en/flight_information/today/result/"
datePattern = "%Y-%m-%d %H:%M:%S"

GO_TO_GATE = "go to gate"
CHECK_IN = "check-in"
BOARD_SOON = "board soon"
ARRIVING = "arriving"
GATE_CLOSED = "gate closed"
FINAL_CALL = "final call"

ARRIVED = "arrived"
LATE = "late"
LAST_CALL = "last call"
GATE_OPEN = "gate open"

SCHEDULED = "scheduled"
DELAYED = "delayed"
CANCELLED = "cancelled"
CHECKIN = "checkin"
BOARDING = "boarding"
OUTGATE = "outgate"
DEPARTED = "departed"
EXPECTED = "expected"
Example #48
from grab import Grab, UploadFile

import logging

logging.basicConfig(level=logging.DEBUG)
g = Grab()
g.setup(log_dir='log/grab')
g.go('https://afisha.tut.by/film/', log_file='out.html')
g.setup(post={'hi': u'Превед, яндекс!'})
g.request()
Example #49
File: a_7.py  Project: nicenicenice/parsers
from grab import Grab
import json
from datetime import datetime, timedelta
import re

#17:20 - 19:20
g = Grab(connect_timeout=90, timeout=90)
g.setup(headers={"X-Requested-With": "XMLHttpRequest"})

CHECK_IN = "check-in"
BOARDING_CLOSED = "boarding closed"
ON_TIME = "on time"
CANCELED = "canceled"
AIRBORNE = "airborne"
ESTIMSTED = "estimated"

ARRIVED = "arrived"
LATE = "late"
LAST_CALL = "last call"
GATE_CLOSED = "gate closed"
FINAL_CALL = "final call"
GATE_OPEN = "gate open"

SCHEDULED = "scheduled"
DELAYED = "delayed"
CANCELLED = "cancelled"
CHECKIN = "checkin"
BOARDING = "boarding"
OUTGATE = "outgate"
DEPARTED = "departed"
EXPECTED = "expected"
Example #50
from grab import Grab
import logging

logging.basicConfig(level=logging.DEBUG)
g = Grab()
g.go('http://habrahabr.ru')
g.xpath('//h2/a[@class="topic"]').get('href')

print(g.xpath_text('//h2/a[@class="topic"]'))
print(g.css_text('h2 a.topic'))
print('Comments:', g.css_number('.comments .all'))
from urllib.parse import urlsplit

print(', '.join(urlsplit(x.get('href')).netloc for x in g.css_list('.hentry a') if
                not 'habrahabr.ru' in x.get('href') and x.get('href').startswith('http:')))
Example #51
File: cookies.py  Project: artemzi/grab
 def test_cookies_parsing(self):
     g = Grab(transport=GRAB_TRANSPORT)
     SERVER.RESPONSE['cookies'] = {'foo': 'bar', '1': '2'}
     g.go(SERVER.BASE_URL)
     self.assertEqual(g.response.cookies['foo'], 'bar')
Example #52
File: nic_ua.py  Project: ozamodaz/payless
from grab import Grab
g = Grab()


def nic_ua():
    prices = {}
    g.go('http://nic.ua/ukr/tariffs.html')
    repl = (
        (',', '.'),  # switch separator for float() conversion
        ('\xa0', ''),  # remove space separating thousands
        ('Безкоштовно', '0.0'),  # "Free of charge"
        ('—', '0.0'))
    for element in g.css_list('.domain-name'):
        tld = element.text_content()  # lower case without dot
        price = element.getparent().getparent().getnext().text_content()
        price = price.strip(' \xa0₴\n')
        for i in repl:
            price = price.replace(*i)
        prices[tld] = float(price)
    return prices


if __name__ == '__main__':
    prices = nic_ua()
    for tld in prices:
        print('{:<20s}{:>8} '.format(tld, prices[tld]))
    print(len(prices))
# 304 TLDs
Example #53
File: cookies.py  Project: artemzi/grab
    def test_load_dump(self):
        g = Grab(transport=GRAB_TRANSPORT)
        cookies = {'foo': 'bar', 'spam': 'ham'}
        g.setup(cookies=cookies)
        g.dump_cookies(TMP_FILE)
        self.assertEqual(set(cookies.items()),
                         set(json.load(open(TMP_FILE)).items()))

        # Test non-ascii
        g = Grab(transport=GRAB_TRANSPORT)
        cookies = {'foo': 'bar', 'spam': u'бегемот'}
        g.setup(cookies=cookies)
        g.dump_cookies(TMP_FILE)
        self.assertEqual(set(cookies.items()),
                         set(json.load(open(TMP_FILE)).items()))

        # Test load cookies
        g = Grab(transport=GRAB_TRANSPORT)
        cookies = {'foo': 'bar', 'spam': u'бегемот'}
        json.dump(cookies, open(TMP_FILE, 'w'))
        g.load_cookies(TMP_FILE)
        self.assertEqual(set(g.config['cookies'].items()),
                         set(cookies.items()))
Example #54
File: antigate.py  Project: sergithon/grab
 def get_check_solution_request(self, captcha_id):
     params = {'key': self.api_key, 'action': 'get', 'id': captcha_id}
     url = 'http://antigate.com/res.php?%s' % urlencode(params)
     g = Grab()
     g.setup(url=url)
     return g
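The returned Grab object is configured but not yet executed; presumably the caller fires the request and reads the antigate answer, e.g. (a sketch only; g.request() appears in Example #18):

# Sketch: execute the prepared request and read the raw answer
g = self.get_check_solution_request(captcha_id)
g.request()
answer = g.response.body  # e.g. 'OK|<text>' or 'CAPCHA_NOT_READY'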
Example #55
def logWebPages(attribute):
    g = Grab()
    g.go('http://horo.mail.ru/prediction/' + attribute + '/today/',
         log_file='logs/' + attribute + '.txt')
Example #56
 def setUp(self):
     # Create fake grab instance with fake response
     self.g = Grab(HTML, charset='cp1251')
Example #57
from grab import Grab
import pyexcel
url = 'http://ruticker.com/ReportTopOrders?ticker=siz4&bigPeriod=1'
g = Grab()
g.setup(post={'username': "******", 'password': "******"})
g.go(url)
a = []
for i in g.doc.select("//tr/td"):
    a.append(i.text())

with open('big_trades.xls', 'wt') as f:
    # Write the flat cell list as rows of five comma-separated values;
    # iterating over a list while deleting from it skips items, so use a while loop
    while a:
        f.write(','.join(str(el) for el in a[:5]))
        f.write(u'\n')
        del a[:5]
Example #58
File: a_6.py  Project: nicenicenice/parsers
        result["estimated"] = estimated
    if actual != "":
        result["actual"] = actual
    if gate != "":
        result["gate"] = gate
    if check_in_desks != "":
        result["check_in_desks"] = check_in_desks
    return result


def getTimeStampFromDateTime(dt):
    return int(time.mktime(dt.timetuple()) + dt.microsecond / 1E6)


# ARRIVE
g = Grab(connect_timeout=90, timeout=90)

urlARH = "http://arhaero.ru/ajaxonlinetablo.php"

# Yesterday
yesterdayTime = nowTime - timedelta(days=1)
yesterdayTimeStamp = getTimeStampFromDateTime(yesterdayTime)
todayArrivalsParams = {"date": yesterdayTimeStamp, "type": "arrival"}

resp = g.go(urlARH, post=todayArrivalsParams)

i = 0
for el in g.doc.select("//table/tbody/tr"):
    i += 1
    if i == 1:
        continue
Example #59
 def test_flask2(self):
     g = Grab(transport=GRAB_TRANSPORT)
     g.go(BASE_URL + '?foo=5')
     self.assertEqual(REQUEST['args']['foo'], '5')
Example #60
import json
import os
import random
import sys
from time import sleep
from urllib import parse

from grab import Grab

base_url = 'https://api.crossref.org/works?'
# filter to parse only articles that are published in journals, have abstracts, and were published up to 2019
# for more information about filters, see the manual: https://github.com/CrossRef/rest-api-doc#filter-names
filter = 'filter=type:journal-article,has-abstract:t,until-pub-date:2019'
ppath = os.path.dirname(sys.argv[0])  # path to the script location
paper_collection = ppath + '/../texts/papers_crossref.txt'  # path to the file for saving parsed data
ud_dois = ppath + '/../texts/ud_dois.txt'  # path to the file with the list of DOIs of already saved papers

# initialization and settings of grab object
g = Grab(log_file=ppath + '/../temp/out_crossref.html')
g.setup(cookiefile=ppath + '/../temp/cookies_pars.txt',
        reuse_referer='True',
        timeout=120)
g.setup(user_agent='CitePrediction/0.1_alpha; mailto:[email protected]')

# read used DOIs from file into a list, so already retrieved papers are not fetched twice
used_dois = []
with open(ud_dois, 'r') as dois:
    used_dois = dois.readlines()
used_dois = [x.strip() for x in used_dois]

#url = base_url+filter+'&rows=1000'+'&cursor=*'  # parse papers sequentially; re-fetches the same DOIs on every start unless the cursor is saved
url = base_url + filter + '&sample=100'  # random sample of papers from the database; the more papers parsed, the more duplicates turn up

# first batch of papers
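The snippet ends before the request loop. A minimal sketch of what fetching the first batch might look like, assuming the standard CrossRef response layout (message/items) and the Grab object configured above:

# Sketch only: fetch one sample batch and skip DOIs that were already saved
g.go(url)
batch = json.loads(g.response.body)
for item in batch.get('message', {}).get('items', []):
    doi = item.get('DOI')
    if doi and doi not in used_dois:
        used_dois.append(doi)  # the real script presumably also saves the paper itself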