Example #1
def get_page_childs(parent_url):
    virtual_browser = Grab()
    urls = []
    page = open_page(virtual_browser, parent_url)

    if page is False:
        return urls

    all_urls = page.select(SELECTOR)
    for url in all_urls:
        # Pull the href value out of the anchor markup
        link = re.search(r'href=(\S+)', url.html())
        link = link.group(0).split('"')[1]
        # Make relative links absolute against the site root
        if link.startswith('/'):
            link = initial_url + link

        urls.append({'link': link, 'parent': parent_url})
    return urls
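A minimal driver for the function above, assuming hypothetical stand-ins for the open_page helper, the SELECTOR XPath and the initial_url site root that the snippet relies on (none of these definitions come from the source):

import re
from grab import Grab

SELECTOR = '//a'                    # assumed XPath for the links of interest
initial_url = 'http://example.com'  # assumed site root for relative links

def open_page(browser, url):
    # Hypothetical helper: return the parsed document, or False on failure.
    try:
        browser.go(url)
        return browser.doc
    except Exception:
        return False

for child in get_page_childs(initial_url):
    print(child['link'], '<-', child['parent'])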
Example #2
    def test_task_url_and_grab_options(self):
        class TestSpider(Spider):
            def setup(self):
                self.done = False

            def task_page(self, grab, task):
                self.done = True

        bot = build_spider(TestSpider, )
        bot.setup_queue()
        g = Grab()
        g.setup(url=self.server.get_url())
        self.assertRaises(SpiderMisuseError,
                          Task,
                          'page',
                          grab=g,
                          url=self.server.get_url())
Example #3
def main(tag):
    pageId = 0
    f = open(tag + '.txt', 'w')
    f.write(tag + ':\n')
    while True:
        g = Grab()
        g.setup(timeout=60, connect_timeout=60)
        pageId += 1
        g.go(p(tag, pageId))
        v1 = g.xpath_text('//title')
        # Title of Habrahabr's 404 page; kept in Russian because it is compared
        # against the live page content.
        v2 = unicode("Хабрахабр — страница не найдена (404)", 'utf-8')
        if v1 == v2:
            print 'Finished at page: ' + str(pageId) + '!'
            break
        for questionText in g.xpath_list('//a[@class="post_title"]'):
            f.write(smart_str(questionText.text_content().strip()) + '\n')
        print 'Page # ' + str(pageId) + ' parsed!'
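The p(tag, pageId) URL builder used above is not shown in the source; a hypothetical version consistent with how it is called, together with an example invocation:

def p(tag, page_id):
    # Hypothetical Habrahabr hub URL pattern -- an illustration, not from the source.
    return 'https://habrahabr.ru/hub/%s/page%d/' % (tag, page_id)

main('python')  # would write the collected post titles to python.txt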
Example #4
 def _do_fetch(self, title, artist):
     g = Grab()
     search_response = g.go("http://www.google.com/search?q=%s" %
                            urllib.quote("site:nashe.com.ua %s %s" %
                                         (artist, title))).body
     x1 = search_response.find("http://nashe.com.ua/song.htm?id=")
     if x1 == -1:
         return []
     x2 = search_response.find(r'"', x1)
     link = search_response[x1:x2].replace("&amp;", "&")
     response = g.go(link).body
     x1 = response.find("'song2'>") + 8
     x2 = response.find("</div>", x1)
     lyrics = html2text.html2text(response[x1:x2].decode("cp1251")).replace(
         "\n\n", '\n')
     sr = [lyrics]
     sr = map(lambda x: u"%s \nSource: nashe.com.ua" % x, sr)
     return sr
Example #5
def test_domria():

    ua = UserAgent()
    grab = Grab(timeout=30,
                connect_timeout=10,
                log_file='%s/vparser/tmp/pars/log.html' %
                os.path.split(PROJECT_PATH)[0])
    grab.setup(proxy='46.148.30.216:8080',
               proxy_type='http',
               proxy_userpwd=CREDENTIALS_box)  # , log_dir='vparser/tmp'
    # grab.go('http://kiev.ko.slando.ua/obyavlenie/predlagaetsya-v-arendu-posutochno-v-kieve-kvartira-odnokomnatnaya-po-ulits-ID75E19.html#a025724d26')
    grab.go(
        'http://dom.ria.com/ru/realty_prodaja_dom_harkov_olhovka_stepnaya_ulitsa-8253714.html'
    )
    # grab.go('http://kharkov.kha.slando.ua/nedvizhimost/arenda-kvartir/')

    print grab.doc.select(
        '//div[@class="item-param"]/strong[@class="phone"]').text()
Example #6
    def task_initial(self, grab, task):

        selector = '//div[@class="entry unvoted"]/ul/li[@class="first"]/a[contains(@class,"comments")]'
        for post in grab.doc.select(selector):
            post_link = grab.make_url_absolute(post.attr("href"))
            grab_custom = Grab()
            grab_custom.setup(
                user_agent=
                "User-agent:Linux:Subreddits-Scraper:1.0 by /u/kadze_yukii",
                url=post_link)
            self.add_task(Task('post', grab=grab_custom))

        try:
            next_page = grab.make_url_absolute(
                grab.doc.select('//a[@rel="nofollow next"]').attr("href"))
            self.add_task(Task('initial', url=next_page))
        except:
            pass
Example #7
 def task_generator(self):
     grab = Grab()
     grab.load_proxylist(PROXY_PATH, 'text_file',
                         proxy_type='http', auto_init=False, auto_change=True)
     for link in VOCABULARY:
         url = link['url']
         pages = xrange(1, link['pages'])
         cat = link['cat']
         for page in pages:
             grab.change_proxy()
             grab.setup(
                 url=url % page,
                 proxy_userpwd=CREDENTIALS,
                 hammer_mode=True,
                 hammer_timeouts=((2, 5), (10, 15), (20, 30)),
                 reuse_cookies=True
             )
             yield Task('link_on_page', grab=grab, cat=cat)
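The shapes of PROXY_PATH, CREDENTIALS and VOCABULARY are only implied by how they are used above; a hypothetical set of definitions consistent with that usage:

PROXY_PATH = '/path/to/proxies.txt'  # assumed: plain-text proxy list, one proxy per line
CREDENTIALS = 'proxyuser:proxypass'  # assumed: user:password for the proxies
VOCABULARY = [
    # assumed: each entry holds a page-numbered URL template, a page count and a category
    {'url': 'http://example.com/books?page=%d', 'pages': 10, 'cat': 'books'},
    {'url': 'http://example.com/news?page=%d', 'pages': 5, 'cat': 'news'},
]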
Example #8
 def set_redirect_for_mobile_devices(self, request):
     # Detect the mobile device here and, if necessary, switch the domain to the mobile one
     headers = dict()
     for m in request.META:
         if m.startswith(
                 'HTTP_'
         ) and not m == 'HTTP_HOST' and not m == 'HTTP_CONNECTION':
             headers[m.replace('HTTP_',
                               '').lower().replace('_',
                                                   '-')] = request.META[m]
     get_params = urllib.urlencode(headers)
     g = Grab()
     g.go('http://phd.yandex.net/detect/?%s' % get_params)
     if g.doc.select('//yandex-mobile-info').exists():
         if not self.redirect_to:
             self.redirect_to = 'm.%s/' % request.get_host()
         else:
             self.redirect_to = 'm.%s' % self.redirect_to
Example #9
 def _do_fetch(self, title, artist):
     g = Grab()
     search_response = g.go("http://www.google.com/search?q=%s" %
                            urllib.quote("site:textypesen.com.ua %s %s" %
                                         (artist, title))).body
     x1 = search_response.find("http://textypesen.com.ua/")
     if x1 == -1:
         return []
     x2 = search_response.find(r'"', x1)
     link = search_response[x1:x2].replace("&amp;", "&")
     response = g.go(link).body
     x1 = response.find(">", response.find("align=right")) + 1
     x2 = response.find("<table", x1)
     lyrics = html2text.html2text(response[x1:x2].decode("utf8")).replace(
         "\n\n", '\n')
     sr = [lyrics]
     sr = map(lambda x: u"%s \nSource: textypesen.com.ua" % x, sr)
     return sr
Example #10
def getProductsImages(url):
	from grab import Grab
	g = Grab(log_file='productsImages.html')
	g.go(url)

	products = g.doc.select('//div/div/div/div/div[@class="catalog w"]/div[@class="items fix"]/div[@class="item"]/h3/a')
	products_href = []
	products_images = []      
	for elem in products:
		href = elem.attr('href')
		href = 'http://moscross.ru/'+href
		products_href.append(href)

	for elem in products_href:
		products_images.append(getProductImages(elem))


	return products_images
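The per-product helper getProductImages is not defined in this snippet; a hypothetical sketch consistent with how it is called above (the XPath is an assumption):

def getProductImages(url):
    # Hypothetical: collect the image URLs from a single product page.
    from grab import Grab
    g = Grab()
    g.go(url)
    return [img.attr('src') for img in g.doc.select('//div[@class="item"]//img')]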
Example #11
    def task_generator(self):
        logging.debug("*****execute******")
        with open('directories.csv', 'rb') as f:
            content = csv.reader(f)
            directories = list(content)

        # directories = ['google']
        total = len(directories)
        logging.debug("*****{}******".format(total))
        i = 100
        total = 102
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        }

        test_url = 'https://www.google.com'

        while (True):
            logging.debug("Index: {}".format(i))
            if i >= total:
                break
            g = Grab()
            g.clear_cookies()
            g.setup(**config)
            g.setup(headers=headers)
            logging.debug("CONFIG : {}".format(g.config))
            data = dict(slug=directories[i][0], )
            logging.info(data)
            while True:
                try:
                    print "------------------------"
                    g.go(test_url)
                    print g.doc.body
                    print "++++++++++++++++++++++++"
                    break
                except Exception as e:
                    print "************************"
                    logging.debug(e)
                    time.sleep(1)

            yield Task('init', grab=g, data=data)
            time.sleep(5)
            i += 1
Example #12
    def check_in_circle(self, url, token_id, loyalty_id):
        in_circle = False
        self.refresh_token(token_id)

        soc_token = SocToken.query.get(token_id)
        username = self.parse_username(url)
        request_url = "%s%s%s?key=%s" % (
            self.API_PATH,
            self.API_PARTS['people_list'],
            username,
            SocConfig.GOOGLE_KEY)
        taget_user = request_helper.make_request(request_url, True)

        if 'id' in taget_user and taget_user['id']:
            user_id = taget_user['id']
            g = Grab()
            g.setup(headers={'Authorization': 'Bearer ' + soc_token.user_token})

            url_api = self.API_PATH + self.API_PARTS['peoples']

            while not in_circle:
                g.go(url_api)
                circle = json.loads(g.response.body)

                if 'items' not in circle:
                    break
                if len(circle['items']) <= 0:
                    break

                for friend in circle['items']:
                    if 'id' in friend and friend['id'] == user_id:
                        in_circle = True

                if 'nextPageToken' not in circle:
                    break
                if len(circle['nextPageToken']) <= 0:
                    break

                url_api = "%s%s&pageToken=%s" % (
                    self.API_PATH,
                    self.API_PARTS['peoples'],
                    circle['nextPageToken'])

        return in_circle
Example #13
def download_fb2(links, dir, limit):
    for link in links:
        g = Grab()
        g.go(link)
        genre = g.doc.select('//*[@id="main"]/h1').text()
        # try:
        #     os.makedirs(genre)
        # except OSError:
        #     pass
        book_links = g.doc.select('//*[@id="main"]/form/ol/a')
        i = 0  # add limit counter

        for book_link in book_links:
            if i == limit:
                break
            link = 'http://flibusta.net%s/fb2' % book_link.attr('href')
            name = book_link.text()
            book_url = urlopen(link)
            book = book_url.read()
            try:
                book.decode('utf-8')
                print('Book %s is blocked' % name)
                continue
            except UnicodeDecodeError:
                pass

            completeName = os.path.join("{0}\\{1}\\{2}.zip".format(
                dir, genre, name))

            # if a timeout error occurs, try again
            while True:
                try:
                    f = open(completeName, 'wb')
                    f.write(book)
                    f.close()
                    i += 1
                    print('downloaded: ' + name)
                except requests.exceptions.RequestException as e:  # This is the correct syntax
                    print(e)
                    continue
                break

        else:
            print('Finished %s' % genre)
Example #14
    def get_html(URL):  # fetch the page
        g = Grab(
            url=URL,
            user_agent=
            "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
            timeout=8)

        try:
            response = g.request()
            time.sleep(2)
            # response = urllib.request.urlopen(URL, timeout=8)
            logging.info('got some kind of response')
            # return response.read()
            return response.unicode_body()
        except:
            logging.info(
                'The server did not respond within 8 seconds, trying again'
            )
            return get_html(URL)
Example #15
 def get_html(URL):  # fetch the page
     g = Grab(
         url=URL,
         user_agent=
         "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
         timeout=8)
     try:
         response = g.request()
         # response = urllib.request.urlopen(URL, timeout=8)
         logging.info(
             'requested the listings page and made the soup')
         # return response.read()
         time.sleep(2)
         return response.unicode_body()
     except:
         logging.warning(
             'the server did not respond in time to the listings request, trying again'
         )
         return get_html(URL)
Example #16
def scraiping_add():
    # Find the link to the photo
    pointer = Grab()
    pointer.setup(timeout=25, connect_timeout=25)
    pointer.go('http://www.photosight.ru/photos/6599649/?from=best')
    response_url_foto = pointer.doc.select('//img[@id = "big_photo"]/@src').text()
    response_text_foto = pointer.doc.select('//img[@id = "big_photo"]/@alt').text()

    # Download the photo to the local machine
    urllib.request.urlretrieve(response_url_foto, 'image.png')

    # Upload the photo to the server
    pointer.go('https://ourfoto.herokuapp.com/add_foto/')
    pointer.doc.set_input('image', UploadFile('image.png'))
    pointer.doc.set_input('text', response_text_foto)
    pointer.doc.submit()

    # Delete the local copy of the photo
    os.remove('image.png')
Example #17
    def __init__(self,
                 key,
                 filename='',
                 auto_run=True,
                 grab_config=None,
                 send_config=None,
                 domain='antigate.com'):
        self.g = Grab()
        if grab_config:
            self.g.setup(**grab_config)
        self.key = key
        self.captcha_id = None
        self.captcha_key = None
        self.send_config = send_config
        self.domain = domain
        self.logger = getLogger(__name__)

        if auto_run and filename:
            self.run(filename)
Example #18
def make_new_link(url):
    new_grab = Grab()
    new_link = ''
    if 'linkedin.com/in/' in url:
        return url
    if 'linkedin.com/profile/' in url:
        return url
    else:
        try:
            new_grab.go(url)
            head = str(new_grab.response.head).split('\\r\\')
            for x in head:
                if 'https' in x:
                    new_link = x.replace('https://ua.', 'https://www.').replace(" ", "")\
                        .replace('nLocation:', '').strip(" ")\
                        .replace('https://pl.', 'https://www.')
        except:
            new_link = url
        return new_link
Example #19
def get_categories():
    grab = Grab()
    grab.setup(url='http://www.free-lance.ru/freelancers/')

    print u'Requesting the page'
    grab.request()

    print u'Extracting categories'
    categories = grab.xpath_list('//ul[@id="accordion"]/li[not(@class)]')

    for category in categories:
        subcategories = category.xpath('./ul[@class="element"]/li/span/a')
        subcategories = map(lambda a: a.text_content().encode('utf-8'),
                            subcategories)

        yield (category.xpath('./a')[0].text_content().encode('utf-8'),
               subcategories)

    print u'Done'
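A hypothetical consumer of the get_categories generator above (the call site is not part of the source; Python 2, as in the example):

for name, subcategories in get_categories():
    print name
    for sub in subcategories:
        print '  ' + sub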
Example #20
 def test_setup_proxylist(self):
     with temp_file() as proxy_file:
         content = '\n'.join(x['proxy']
                             for x in self.extra_servers.values())
         with open(proxy_file, 'w') as out:
             out.write(content)
         # Simple test, one task
         bot = build_spider(SimpleSpider, thread_number=1)
         bot.load_proxylist(proxy_file, 'text_file')
         bot.setup_queue()
         bot.add_task(
             Task('baz', grab=Grab(url='http://yandex.ru', debug=True)))
         bot.run()
         serv = [
             x['server'] for x in self.extra_servers.values()
             if x['server'].request['done']
         ][0]
         self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
         self.assertEqual(1, len(set(bot.stat.collections['ports'])))
Example #21
    def test_task_url_and_grab_options(self):
        class TestSpider(Spider):
            def setup(self):
                # pylint: disable=attribute-defined-outside-init
                self.done = False

            def task_page(self, dummy_grab, dummy_task):
                # pylint: disable=attribute-defined-outside-init
                self.done = True

        bot = build_spider(TestSpider, )
        bot.setup_queue()
        grab = Grab()
        grab.setup(url=self.server.get_url())
        self.assertRaises(SpiderMisuseError,
                          Task,
                          'page',
                          grab=grab,
                          url=self.server.get_url())
Example #22
    def task_initial(self, grab, task):
        # Give every <br> a text of '\n' so line breaks survive text extraction
        raw_br_list = grab.xpath_list('//br')
        for item in raw_br_list:
            item.text = "\n"
        raw_text = grab.xpath_text('//*')

        ip_port_list = re.findall('[0-9]+[.][0-9]+[.][0-9]+[.][0-9]+[:][0-9]+',
                                  raw_text)

        for ip in ip_port_list:
            grab = Grab()
            grab.setup(url='http://www.icanhazip.com')
            grab.setup(proxy=ip,
                       proxy_type='http',
                       connect_timeout=10,
                       timeout=15)
            info = {'server': ip, 'type': 'http'}
            yield Task('proxy_check', grab=grab, info=info)
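The 'proxy_check' handler itself is not shown; a hypothetical sketch of what it might do, assuming icanhazip.com simply echoes the caller's IP address:

    # Hypothetical handler -- not taken from the source spider.
    def task_proxy_check(self, grab, task):
        proxy_ip = task.info['server'].split(':')[0]
        # If the echoed address matches the proxy, the proxy is usable.
        if proxy_ip in grab.doc.unicode_body():
            print('working proxy: %s' % task.info['server'])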
Example #23
    def handle(self, *args, **options):
        g = Grab(log_dir="/tmp")
        site = "http://20k.com.ua"
        results = ''

        # class  FirmHotline(models.Model):
        # class  ScanHotline(models.Model):
        # class OneHotline(models.Model):
        # class ConcurentHotline(models.Model):

        def make_url(url):
            items = url.split('/')
            del items[-2]
            nurl = '/'.join(items)
            g.go(site + nurl)
            if g.response.code == 200:
                return nurl
            else:
                return make_url(nurl)

            # assert False, nurl

        f = open(PROJECT_ROOT + '/static/error.csv', 'r')
        # assert False, f.split('/n')
        for line in f:
            url = line.split(',')[0]
            g.go(url)
            if g.response.code == 404:
                if not '.jpg' in url:
                    l = url.split('http://20k.com.ua')[1]
                    sl = l.split('?')[0]
                    if 'index.php' in l:
                        n = '/'
                    else:
                        n = make_url(sl)

                    results += l + ',' + n + '\n'

                    self.stdout.write(l + ',' + n)

        f = open(PROJECT_ROOT + '/static/results.csv', 'w')
        f.write(results)
        f.close()
Example #24
def get_page_status(page):
    url = page['link']

    for bad_url in exclude_urls:
        if bad_url in url:
            return False

    virtual_browser = Grab()
    check = open_page(virtual_browser, url)
    if check is not False and "200 OK" not in check.status:
        is_visible = check_with_selenium(unicode(page.get('parent')),
                                         unicode(url))
        if is_visible:
            write_result(
                unicode("{0} {1} parent page: {2}").format(
                    unicode(check.status), unicode(url),
                    unicode(page.get('parent'))))
            return False
    return True
Example #25
 def __init__(self, master=None):
     tk.Frame.__init__(self, master)
     self.grid()
     self.g = Grab()
     logging.basicConfig(level=logging.DEBUG)
     if self.get_api_key():
         self.api_key = self.get_api_key()
     else:
         self.api_key = 'No key'
     self.createWidgets()
     if not os.path.isfile('weatherapp.db'):
         self.db_connect()
     self.country_list = []
     self.country_selected = ''
     self.chosen_city = ''
     self.chosen_city_id = ''
     self.temp = 0
     self.date = date.today().isoformat()
     self.weather_id = ''
Example #26
    def test_charset_html5(self):
        grab = Grab()
        grab.setup_document(b"<meta charset='windows-1251'>")
        self.assertEqual('windows-1251', grab.doc.charset)

        grab.setup_document(b'<meta charset="windows-1252">')
        self.assertEqual('windows-1252', grab.doc.charset)

        grab.setup_document(b'<meta charset=latin-1>')
        self.assertEqual('latin-1', grab.doc.charset)

        grab.setup_document(b"<meta charset  =  'windows-1251'  >")
        self.assertEqual('windows-1251', grab.doc.charset)

        grab.setup_document(b'<meta charset  =  "windows-1252"   >')
        self.assertEqual('windows-1252', grab.doc.charset)

        grab.setup_document(b'<meta charset  =  latin-1  >')
        self.assertEqual('latin-1', grab.doc.charset)
Example #27
    def check_plus(self, url, token_id, loyalty_id):
        plused = False
        self.refresh_token(token_id)

        action = PaymentLoyalty.query.get(loyalty_id)
        target = json.loads(action.data)
        soc_token = SocToken.query.get(token_id)
        g = Grab()
        # g.setup(headers={'Authorization':'Bearer ' + soc_token.user_token})
        url_api = "%s%s%s%s&key=%s" % (
            self.API_PATH,
            self.API_PARTS['activities'],
            target['id'],
            self.API_PARTS['plusoners'],
            SocConfig.GOOGLE_KEY)

        while not plused:
            g.go(url_api)
            plusoners = json.loads(g.response.body)

            if 'items' not in plusoners:
                break
            if len(plusoners['items']) <= 0:
                break

            for person in plusoners['items']:
                if 'id' in person and person['id'] == soc_token.soc_id:
                    plused = True

            # Stop when there is no next page token (mirrors check_in_circle above)
            if 'nextPageToken' not in plusoners:
                break
            if len(plusoners['nextPageToken']) <= 0:
                break

            url_api = "%s%s%s%s&pageToken=%s&key=%s" % (
                self.API_PATH,
                self.API_PARTS['activities'],
                target['id'],
                self.API_PARTS['plusoners'],
                plusoners['nextPageToken'],
                SocConfig.GOOGLE_KEY)

        return plused
Example #28
    def handler(self,
                collection,
                obj,
                set_field,
                base_dir,
                task_args=None,
                grab_args=None,
                callback=None):
        from database import db

        for image in obj.get(set_field, []):
            path = hashed_path(image['url'], base_dir=base_dir)
            if os.path.exists(path):
                if path != image['path']:
                    db[collection].update(
                        {
                            '_id': obj['_id'],
                            ('%s.url' % set_field): image['url']
                        }, {'$set': {
                            ('%s.$.path' % set_field): path
                        }})
            else:
                kwargs = {}
                if task_args:
                    kwargs = deepcopy(task_args)

                g = Grab()
                g.setup(url=image['url'])
                if grab_args:
                    g.setup(**grab_args)
                g.setup(referer=build_image_hosting_referer(image['url']))

                yield Task(callback=callback or image_set_handler,
                           grab=g,
                           collection=collection,
                           path=path,
                           obj=obj,
                           image=image,
                           set_field=set_field,
                           disable_cache=True,
                           backup=g.dump_config(),
                           **kwargs)
Example #29
    def task_jsonresponse(self, grab, task):

        try:
            response = json.loads(
                str(grab.doc.body).replace("b'",
                                           "").replace("'",
                                                       "").replace('\\', ''))
        except:
            response = None
            self.add_task(
                Task('jsonresponse', url=task.url, delay=1,
                     region=task.region))
            print('----- The response does not look like JSON -----', task.url)

        if response:
            if task.url == self.starturl:
                response = response[0]['children']
            for resp in response:
                temp_id = resp['id']
                temp_intid = resp['a_attr']['intid']
                temp_levelid = resp['a_attr']['levelid']

                if temp_levelid == '8':
                    g = Grab(url=self.urlpattern_page + temp_intid +
                             '?do=result',
                             document_charset='windows-1251')
                    yield Task('pageresponse',
                               grab=g,
                               urlid=temp_id,
                               region=task.region)
                elif temp_levelid == '11':
                    pass
                else:
                    yield Task('jsonresponse',
                               url=self.urlpattern_json + temp_id,
                               region=task.region)
                    print(self.counter, 'Link queued for processing', temp_id,
                          'level', temp_levelid)
                    self.counter += 1

        else:
            print('----- Looks like the JSON is empty -----', task.url)
Example #30
def scan(starting_url):
    g = Grab()
    urls_queue = collections.deque()
    urls_queue.append(starting_url)
    found_urls = set()
    found_urls.add(starting_url)
    visited_urls = set()
    #cn = 1
    while len(urls_queue):
        url = urls_queue.popleft()
        try:
            g.go(url)
            if g.response.code < 400 and g.response.headers[
                    'Content-Type'].find(
                        'text/html') != -1 and g.response.url.startswith(
                            starting_url):
                #print(str(cn) + '. ' + url)
                #cn += 1
                print(url)
                links = g.doc.select('//a[@href]')
                for link in links:
                    href = prep(link.attr('href'))
                    if href is False:
                        continue
                    if not href.startswith('http'):
                        href = starting_url + '/' + href
                    elif not href.startswith(starting_url):
                        continue
                    if href not in found_urls:
                        found_urls.add(href)
                    else:
                        continue
                    if url not in visited_urls:
                        urls_queue.append(href)
                visited_urls.add(url)
            elif url in found_urls:
                found_urls.remove(url)
        except:
            pass
        time.sleep(0.2)
    return found_urls
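A hypothetical prep helper that scan assumes, plus an example invocation (the filtering rules are an illustration only):

def prep(href):
    # Hypothetical normaliser: drop empty, anchor-only and javascript: links.
    if not href:
        return False
    href = href.strip()
    if href.startswith('#') or href.startswith('javascript:'):
        return False
    return href

found = scan('https://example.com')
print('%d pages found' % len(found))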