Example No. 1
 def down(link_, max: int = 1):
     """Resolve the download URL for one (name, link) pair, retrying up to 5 times."""
     try:
         if max > 5:
             return
         print(link_[0])
         link_down = link_[1]
         # Protocol-relative URLs ("//host/...") need an explicit scheme.
         if not link_down.startswith('http'):
             link_down = "https:" + link_down
         r = req_link(link_down)
         b = Bs(r.content, 'html.parser')
         nome = b.find("h1", itemprop='name').text.split()[-1] + '-'
         nome += b.find("h2", itemprop='alternativeHeadline').text
         bb = b.find("source")
         if bb:
             src = bb.get('src')
             if not src.startswith('http'):
                 src = "https:" + src
             return {nome: src}
         bb = b.find('a', title="Baixar Video")
         if bb:
             href = bb.get('href')
             if not href.startswith('http'):
                 href = "https:" + href
             r = req_link(href)
             b = Bs(r.content, 'html.parser')
             bb = b.find("a", "bt-download")
             if bb:
                 head = bb.get('href')
                 if not head.startswith('http'):
                     head = "https:" + head
                 return {nome: head}
     except Exception as e:
         print("Down", link_[0], e)
         return down(link_, max + 1)
Example No. 2
def get_organization_page(domain, proxy=None):
    if proxy is not None:
        # Key the proxies dict by the scheme part of the proxy URL (e.g. "http" or "https").
        proxies = {proxy.split(":")[0].strip(): proxy}
    else:
        proxies = None

    response = requests.get(f"{BASE_URL}/scholar?q={domain}",
                            headers=HEADERS,
                            proxies=proxies)

    logger.info(f'{response.status_code} {response.reason}')

    tree = Bs(response.content, "lxml")
    org_href = get_org_href(tree)

    if not org_href:
        return None, None

    response = requests.get(f"{BASE_URL}{org_href}",
                            headers=HEADERS,
                            proxies=proxies)

    if not response.ok:
        return None, None

    tree = Bs(response.content, "lxml")

    return tree, org_href
Example No. 3
    def get_parser_body(self, ref):
        """Return up to 4000 characters of the article text for the given reference."""

        result = ''

        if "ria" in self.url:
            source = requests.get(ref).text
            soup = Bs(source, 'lxml')
            raw_result = soup.find_all('div', class_='article__text')

            for i in raw_result:
                result = result + i.text + ' '

        elif "interfax" in self.url:
            source = requests.get(f'{self.url}{ref}')
            source.encoding = 'cp1251'
            source = source.text
            soup = Bs(source, 'lxml').find('div', class_='mainblock')
            raw_result = soup.find_all_next('p')

            for i in raw_result:
                result = result + i.text + ' '

        elif "tass" in self.url:
            source = requests.get(f'{self.url}{ref}').text
            soup = Bs(source, 'lxml')
            raw_result = soup.find_all('div', class_='text-content')
            result = result + raw_result[0].text

        else:
            pass

        return result[:4000]
Example No. 4
def bypass(link):
    print(" *Bypassing please wait...")
    url = None
    for _ in range(5):
        try:
            req = ses.post("https://nuubi.herokuapp.com/pahe/bypass",
                           data={'url': link})
            url = re.findall(r"href='(.*?)'", req.text)[0]
        except Exception as er:
            print(f"Oops, {er}, retrying...")
            time.sleep(3)
            continue
        else:
            break
    if url is None:
        # All five attempts failed; give up.
        print("Bypass failed, giving up.")
        return

    req2 = ses.get(url)
    bs = Bs(req2.text, 'html.parser')
    url2 = bs.find('a', {'class': 'btn btn-primary btn-xs'})['href']

    reqs = ses.get(url2)
    link = re.findall("href='(.*?)'>Download ", reqs.text)[0]

    reqs_ = ses.get(link)
    time.sleep(2)
    reqs2 = ses.get(link)
    bs2 = Bs(reqs2.text, 'html.parser')
    dlink = bs2.find('a', {'title': 'Download'})['href']

    download(dlink,
             f"{info['title'][pil-1][1]} ({info['resu'][lih-1][1]}).mkv")
Example No. 5
def parse_for_stories(html_page):
    # Parses an HTML page of a story.
    # Should work offline, except when the story has multiple pages;
    # then it would attempt to fetch those.

    pages = []

    soup = Bs(html_page, parser)
    head = soup.find("div", attrs={"class": "b-story-header"}).text
    body = soup.find("div", attrs={"class": "b-story-body-x"}).text

    parse_header = re.search(r"([\w\s'.,/-]+)\n\s*by([\w\s',/-]+)", head)
    if parse_header:
        title, author = parse_header.group(1), parse_header.group(2)
    else:
        # Fall back to the raw header text when it does not match the pattern.
        title, author = head, ""

    pages.append(body)

    while True:
        next_page = soup.find("a", attrs={"class": "b-pager-next"})
        if next_page:
            new_page = requests.get(next_page["href"])
            soup = Bs(new_page.text, "html5lib")
            body = soup.find("div", attrs={"class": "b-story-body-x"}).text
            pages.append(body)

        else:
            break

    story_object = {"title": title, "author": author, "pages": pages}
    return story_object
Example No. 6
 def getsearchHotelList(self, citycode):
     # Get the total number of pages
     totalpage = self.gettotalpage(self.citycode)
     unreadlist = []
     loop = range(1, (int(totalpage) + 1))
     # Query page by page
     for x in loop:
         try:
             # Open page x
             html = requests.get(self.mainUrl + str(citycode) + "/p" +
                                 str(x))
             print("page " + str(x) + " is Searching")
             bsObj = Bs(html.text, 'lxml')
             hotel_list = bsObj.find("div", {
                 "id": "hotel_list"
             }).findAll("div", {"class": "hotel_new_list"})
             # Alternative: crawl via the asynchronous request endpoint
             # param = {"StartTime": "2018-05-22", "DepTime": "2018-06-19", "cityName": "广州",
             #          "RoomGuestCount": "1,1,0",
             #          "IsOnlyAirHotel": "F", "cityId": 32, "cityCode": 020, "page": x}
             # list = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
             # html = requests.post(list, data=param, headers=self.header)
             # bsObj = Bs(html.text.replace("\\", ""), 'lxml')
             # hotel_list = bsObj.findAll("div", {"class": "hotel_new_list"})
             # Convert the parsed markup into hotel objects
             hotel_total_list = self.bsobj_to_hotelobj(hotel_list)
             # Bulk-insert into the database
             ch.insert(hotel_total_list)
         except Exception as e:
             print("Something goes wrong:")
             print(e)
             print("Wait a minute, just sleep for a while")
             print("Zzzz")
             sleep(10)
             # Remember pages that could not be read
             unreadlist.append(x)
             print("Go on~")
             continue
     # Retry the pages that failed on the first pass
     for i in unreadlist:
         try:
             # Open page i with a fresh session
             newse = requests.session()
             html = newse.get(self.mainUrl + str(self.citycode) + "/p" +
                              str(i))
             print("page " + str(i) + " is Searching")
             bsObj = Bs(html.text, 'lxml')
             hotel_list = bsObj.find("div", {
                 "id": "hotel_list"
             }).findAll("div", {"class": "hotel_new_list"})
             # Convert the parsed markup into hotel objects
             hotel_total_list = self.bsobj_to_hotelobj(hotel_list)
             # Bulk-insert into the database
             self.insert(hotel_total_list)
         except Exception as e:
             print(e)
             continue
Example No. 7
    def get_img_ranking_list(self, no="6"):
        """ Crawl the ranking page data (defaults to the international ranking).
            0. Hokkaido/Tohoku
            1. Kanto
            2. Chubu
            3. Kinki
            4. Chugoku/Shikoku
            5. Kyushu/Okinawa
            6. International
        """
        nodic = {"0": "北海道/东北",
                 "1": "关东",
                 "2": "中部",
                 "3": "近畿",
                 "4": "中国/四国",
                 "5": "九州/冲绳",
                 "6": "国际"
                 }
        html = se.get(self.ranking_url + no)
        bsObj = Bs(html.text, 'lxml')
        href = bsObj.findAll("a", {"class": re.compile("^(work)")})
        # Current week
        week = bsObj.findAll("span", {"class": "_about"})[0].string
        downloadPath = os.path.join(path, week + nodic.get(no) + u"排行榜")
        for link in href:
            if link.attrs['href'][0] == '/':
                self.img_list.append(self.indexurl + link.attrs['href'][1:])
            else:
                self.img_list.append(self.indexurl + link.attrs['href'])
        try:
            # Go back to the parent folder and create the ranking folder
            os.chdir(os.path.dirname(downloadPath))
            os.mkdir(week + nodic.get(no) + u"排行榜")
        except Exception as e:
            pass
        finally:
            os.chdir(downloadPath)
        print("ranking " + week + " are Searching!")

        # Collect every image on the current page (for downloading)
        for link in self.img_list:
            # A proxy should be added here
            html = se.get(link, headers=self.headers)
            bsObj = Bs(html.text, 'lxml')
            try:
                src = bsObj.find("img", {"class": "original-image"})
                if src:
                    img_src = src.attrs['data-src'].encode().decode("unicode-escape")
                    self.down_list.append({"src": img_src,
                                           "title": src.attrs['alt']})
                    print("title:" + src.attrs['alt'])
            except AttributeError as e:
                print("NOT Found")
            except Exception as e:
                print(e)
        # Start three threads to download the images
        self.threading(3)
Example No. 8
def html_table_to_array(html_string):
    """Convert an HTML table to a JSON-encoded list of lists (one list per row)."""
    from bs4 import BeautifulSoup as Bs
    import json
    # Guard against empty input before parsing, and parse the string only once.
    if not html_string:
        return html_string
    rows = Bs(html_string, "html.parser").findAll("tr")
    if not rows:
        return html_string
    return json.dumps([[cell.string or '' for cell in row.findAll("td")]
                       for row in rows])
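A minimal usage sketch for the function above (nothing beyond bs4 and the standard library is assumed): a two-row table comes back as a JSON string of cell texts.

sample = "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
print(html_table_to_array(sample))  # -> [["a", "b"], ["c", "d"]]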
Example No. 9
 def expert_parser(url, i):
     nonlocal end, trigger, new_post
     try:
         logging.info('\n  [%s]... ' % url)
         # Parsing http://yur-gazeta.com/news/
         self.send(['log', '\n Loading [%s]... ' % url])
         req = s.get(url)
         soup = Bs(req.text, 'html.parser')
         while end is False:
             page = soup.find('a', attrs={'class': 'right'})
             tag = soup.find('ul', attrs={'class': 'material-list imageRight'})
             for li in tag.children:
                 if li.a['href'] != LAST_POST[i][0]:
                     if li.find('a', attrs={'class': 'title private'}) is None:
                         if trigger and li.a['href'] not in ('1', 1):
                             new_post = li.a['href']
                             trigger = False
                         req = s.get(li.a['href'])
                         soup = Bs(req.text, 'html.parser')
                         title = soup.find('h1', attrs={'class': 'mat-title'})
                         title = title.get_text() if title is not None else 'oops'
                         article = soup.find('article', attrs={'id': 'material-content'})
                         article = article.get_text() if article is not None else 'oops'
                         author = soup.find('a', attrs={'class': 'author-name no-link'})
                         author = author.get_text() if author is not None else 'oops'
                         posts.append(PostModel(
                             str(author + '\n' + title + '.' + article),
                             hashtags['expert'], li.a['href'], imgs['expert']))
                 else:
                     end = True
                     break
             # Follow the pager's "next" link when it exists.
             req = s.get(page['href']) if page is not None else req
             soup = Bs(req.text, 'html.parser')
     except ConnectionError:
         self.send(['log', '\n Could not load [%s]' % url])
     end = False
     # For adding the last post
     trigger = True
     if new_post != '' and new_post != '1' and new_post != 1:
         LAST_POST[i][0] = new_post
         new_post = ''
Example No. 10
    def response(self):
        bs1 = Bs(self.req.get(self.u.format('')).text, 'html.parser')
        token = bs1.find('input', {'name': 'csrfmiddlewaretoken'})
        res = self.req.post(self.u.format('/download'),
                            data={
                                'tweet': self.lnk,
                                'csrfmiddlewaretoken': token['value']
                            })
        bs2 = Bs(res.text, 'html.parser')
        linku = bs2.find('a', {'class': 'expanded button small float-right'})

        return linku['href']
Example No. 11
    def get_wikipedia_results_recursive(hops, wiki_pages, num_links=None):
        text = ''

        if hops > 0:
            links = []
            for wiki in wiki_pages:
                url = 'https://en.wikipedia.org/wiki/' + wiki['href']
                html = api_get(
                    url,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) '
                        'Gecko/20100101 Firefox/74.0'
                    })

                content = Bs(html.text,
                             features="lxml").find(class_='mw-parser-output')

                for child in content.children:
                    if child.name == 'p':
                        text += re.sub(' +', ' ',
                                       child.get_text().replace("\n",
                                                                " ")) + "\n"
                        for link in child.find_all('a'):
                            if link.has_attr("href") and num_links is not None:
                                if link["href"].startswith(
                                        "/wiki/") and len(links) < num_links:
                                    links.append(link)

            text += get_wikipedia_results_recursive(hops - 1, links)
        else:
            for wiki in wiki_pages:
                url = 'https://en.wikipedia.org/wiki/' + wiki['href']
                html = api_get(
                    url,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) '
                        'Gecko/20100101 Firefox/74.0'
                    })

                content = Bs(html.text,
                             features="lxml").find(class_='mw-parser-output')

                for child in content.children:
                    if child.name == 'p':
                        text += re.sub(' +', ' ',
                                       child.get_text().replace("\n",
                                                                " ")) + "\n"

        return text
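A hedged usage sketch, assuming the function and the api_get helper it relies on are reachable from the calling scope; wiki_pages is an iterable of anchor-like mappings whose 'href' holds a Wikipedia article title.

seed_pages = [{'href': 'Web_scraping'}]  # hypothetical seed page
text = get_wikipedia_results_recursive(hops=0, wiki_pages=seed_pages)
print(text[:300])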
Example No. 12
def get_rss_content(rss_info_list, img_file_path):
    rss_content = []
    for rss in rss_info_list:
        url = rss[1]
        page = get_rss_page(url)
        soup = Bs(page, 'lxml')
        try:
            source = re.search(
                r'>\w+',
                str(soup.select('span[class="from"] > a')[0])).group(0)
            source = re.search(r'\w+', source).group(0)
            content = str(soup.select('div[class="article_body"]')[0])
            content = re.sub(r'\\.', r'', content)
            content_soup = Bs(content, 'lxml')
        except Exception as e:
            # Without the article body there is nothing to extract; skip this entry.
            print('Failed to get the basic article info')
            continue
        slug = ''.join(
            random.sample(
                'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789',
                8))
        img_list = []
        img_url_list = content_soup.select('img')
        for img in img_url_list:
            img = img['src']
            try:
                img_name = re.search(r'\w+.(jpg|png)', img).group(0)
                img_list.append(img_name)
                img_download_path = img_file_path + '/' + slug
                # Create the per-article folder if needed, then fetch the image.
                os.makedirs(img_download_path, exist_ok=True)
                request.urlretrieve(
                    img, img_download_path + '/' + img_name)
                print('----------------- downloaded %s -----------------' %
                      img_name)
                content = re.sub(r'http://img\d\.tuicool\.com/',
                                 r'/upload/blog/%s/' % slug, content)
                content = re.sub(r'!web', '', content)
            except Exception as e:
                print('Image download failed')

        if len(img_list) == 0:
            rss.append('')
        else:
            rss.append(img_list[0])
        rss.append(slug)
        rss.append(content)
        rss.append(source)
        rss_content.append(rss)
    return rss_content
Example No. 13
    def get_pagination_urls(self, base_url):
        """Get the pagination URLs for the project listing to be collected."""

        url = base_url + '1'
        r = requests.get(url, headers=common.get_header())

        soup = Bs(r.text, 'html.parser')
        page_data = Bs(str(soup.find_all(class_='ProjectList')),
                       'html.parser').find_all(class_='stat')
        max_page = math.ceil(
            int(page_data[0].next_sibling.next_sibling.text) / 20)

        return (base_url + str(i) for i in range(1, max_page + 1))
Example No. 14
 def test1(self):
     md = Markdown(extensions=[MarkdownImageExpander()])
     source = """
 ![](example.png)
 """
     mustbe = """
 <img alt="" src="example.png">
 """
     self.assertEqual(
         Bs(md.convert(textwrap.dedent(source)),
            "html.parser").prettify().replace("\n", ""),
         Bs(textwrap.dedent(mustbe),
            "html.parser").prettify().replace("\n", ""))
Example No. 15
    def get_user_sales_detail(self, item_detail_url, input_datas):
        """
        Collect the seller's sales-history information.
        params:
            input_datas: rows of the Excel input data
        """
        user_detail_set = {
            'ave_days_from_send': '',
            'sales_history_nums': '',
            'sales_day': ''
        }

        html = requests.get(item_detail_url).text.encode('utf-8')
        source = Bs(html, 'html5lib')
        user_url = top_page + source.find('div', id='item_subcol').find(
            'p', id='buyer_name').find('a')['href']

        html = requests.get(user_url).text.encode('utf-8')
        source = Bs(html, 'html5lib')
        detail_blocks = source.find('ul', attrs={
            'id': 'detailed_list'
        }).find_all('li')
        for detail_block in detail_blocks:
            if detail_block.find('h3').text.strip() == '発送までの平均日数':
                user_detail_set['ave_days_from_send'] = (
                    detail_block.find('div').find('p').text.strip())

        for input_data in input_datas:
            if input_data['option'] == '販売履歴ページ数':
                sales_page_num = int(input_data['value'])

        for sales_page_index in range(1, sales_page_num + 1):
            user_detail_url = user_url.replace(
                '.html', '/sales_{0}.html'.format(sales_page_index))
            html = requests.get(user_detail_url).text.encode('utf-8')
            source = Bs(html, 'html5lib')
            sales_detail_blocks = source.find(
                'dl', id='buyeritemtable').find_all(
                    'div', id='buyeritemtable_body')
            for sales_detail_block in sales_detail_blocks:
                span_blocks = sales_detail_block.find_all('span')
                # Only rows of the form (item name, deals, date) are relevant.
                if self.item_name in span_blocks[0].text.strip() and len(span_blocks) == 3:
                    user_detail_set['sales_history_nums'] += (
                        span_blocks[1].text.strip().replace('成約:', '') + ',')
                    user_detail_set['sales_day'] += span_blocks[2].text.strip() + ','
        return user_detail_set
Example No. 16
def login(session, username, password):
    password_text = 'رمز عبور را وارد کنید'
    successful_login_text = 'سفارش‌های من'
    failed_login_text = r'\u0627\u0637\u0644\u0627\u0639\u0627\u062a \u06a9\u0627\u0631\u0628\u0631\u06cc \u0646\u0627\u062f\u0631\u0633\u062a \u0627\u0633\u062a'

    # username
    url = 'https://www.digikala.com/users/login-register/'
    r1_dom = Bs(session.get(url).text, 'html.parser')

    payload = {
        'login[email_phone]': username,
        'rc': r1_dom.select('input[name=rc]')[0]['value'],
        'rd': r1_dom.select('input[name=rd]')[0]['value'],
    }
    r2 = session.post(url, data=payload)
    if r2.status_code != 200:
        print('connection error, code: %s' % r2.status_code)
        return False

    # password
    if password_text in r2.text:
        r2_dom = Bs(r2.text, 'html.parser')

        payload = {
            'login[password]': password,
            'rc': r2_dom.select('input[name=rc]')[0]['value'],
            'rd': r2_dom.select('input[name=rd]')[0]['value'],
        }
        r3 = session.post(r2.url, data=payload)
        if r3.status_code != 200:
            print('connection error, code: %s' % r3.status_code)
            return False

        # succeed :)
        if successful_login_text in r3.text:
            print('successfully logged in')
            return True
        # wrong data :(
        elif failed_login_text in r3.text:
            print('wrong data!')
            return False
        # something else went wrong
        else:
            print('unknown error')
            return False
    else:
        print('unknown error')
        return False
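A hedged usage sketch with placeholder credentials; on success the session object carries the authenticated cookies for later requests.

import requests

session = requests.Session()
if login(session, 'user@example.com', 'my-password'):  # placeholder credentials
    pass  # the session now holds the logged-in cookies and can fetch account pages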
Example No. 17
 def crawl_append(self, crawl_results):
     for row in crawl_results:
         self.driver.get(row['source_url'])
         html = self.driver.page_source
         self.soup = Bs(html, 'html.parser')
         self.page_source = self.soup.select('#bo_v_atc')
         event_content = self.soup.select('#bo_v_con > div.event_ctt')
         if len(event_content) == 0:
             row['ctn'] = ''
         else:
             row['ctn'] = event_content[0].text
         row['page_source'] = str(self.page_source)
         temp_img_src = self.soup.select('#bo_v_img > a > img')
         '''if len(temp_img_src) == 0:
             img_source = self.soup.select('#contents > div.cont > div.view_wrap > div.info_sec > p > a > img')
         else:
             img_source = temp_img_src'''
         ab = datetime.datetime.now()
         date_now = ab.strftime('%Y%m%d%H%M%S')
         file_name = date_now + str(ab.microsecond)
         if len(temp_img_src) > 0:
             temp_src = temp_img_src[0].attrs.get('src')
             urllib.request.urlretrieve(temp_src, '../../originalDatas/' + file_name + '.png')
             img_src = file_name + '.png'
         else:
             img_src = ''
         row['img_src'] = img_src
         #self.cm.content_insert(row, 'original')
     return crawl_results
Example No. 18
    def test_correct_services_in_form(self):
        """Czy klient widzi prawidłowe usługi (swojego użytkownika, niezablokowane)"""
        user_correct = self.create_user('active_1')
        user_wrong = self.create_user('active_2')

        service_wrong = self.create_service(user_wrong, 'long_2')
        service_not_active = self.create_service(user_correct, 'not_active')
        services_correct = []
        services_types = ['short_1', 'short_2', 'long_1']
        for service in services_types:
            services_correct.append(self.create_service(user_correct, service))

        client = self.create_client(user_correct)
        self.authorize_client(client)
        response = self.client.get(f'/{user_correct}/panel/')
        soup = Bs(response.content.decode(), features="html.parser")
        option_wrong = soup.find("option", {"value": {service_wrong.id}})
        option_not_active = soup.find("option",
                                      {"value": {service_not_active.id}})

        self.assertFalse(option_wrong)
        self.assertFalse(option_not_active)

        for service in services_correct:
            options = soup.find("option", {"value": {service.id}})

            self.assertEqual(options.get_text(), service.name)
Example No. 19
    def test_default_table(self):
        results_table_div = self.driver.find_element_by_id('results')
        rows = Bs(results_table_div.get_attribute('innerHTML'),
                  features='lxml').find('table').findAll('tr')

        self.assertEqual(len(rows), self.tc + 1)

        headings = rows[0].findAll('th')
        headings_txt = [x.text for x in headings]
        self.assertEqual(
            headings_txt,
            [f'{self.tc}/{self.tc} themes', 'date', 'stars', 'commit'],
        )

        for i, row in enumerate(rows[1:]):
            tds = row.findAll('td')
            tds_txt = [x.text for x in tds]
            self.assertEqual(
                tds_txt,
                [
                    self.themes[i].cname,
                    self.themes[i].commit_date[0:10],
                    str(self.themes[i].stargazers_count),
                    self.themes[i].commit_sha[0:6],
                ],
            )
Example No. 20
 def crawl_append(self, crawl_results):
     for row in crawl_results:
         self.driver.get(row['source_url'])
         html = self.driver.page_source
         self.soup = Bs(html, 'html.parser')
         self.page_source = self.soup.select(
             '#content > div.schedule > div')
         row['page_source'] = str(self.page_source)
         event_content = self.soup.select(
             '#content > div.schedule > div > div > dl:nth-child(11) > dd')
         if len(event_content) > 0:
             row['ctn'] = event_content[0].text
         else:
             row['ctn'] = ''
         temp_img_src = self.soup.select(
             '#PageWrap > div > div.ViewInfo > p > img')
         ab = datetime.datetime.now()
         date_now = ab.strftime('%Y%m%d%H%M%S')
         file_name = date_now + str(ab.microsecond)
         if len(temp_img_src) > 0:
             temp_src = temp_img_src[0].attrs.get('src')
             encoding_url = parse.urlparse(temp_src[3:len(temp_src)])
             print(encoding_url)
             urllib.request.urlretrieve(
                 self.url_base + quote(encoding_url.path),
                 '../../originalDatas/' + file_name + '.png')
             img_src = file_name + '.png'
         else:
             img_src = ''
         row['img_src'] = img_src
         print(row['event_name'])
         self.cm.content_insert(row, 'original')
Example No. 21
def Gratis(num,msg):
	req=requests.Session()
	req.headers.update({'Referer':'http://sms.payuterus.biz/alpha',
		'user-agent':'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36',
		'Connection':'keep-alive',
		'Pragma':'no-cache',
		'Cache-Control':'no-cache',
		'Origin':'http://sms.payuterus.biz',
		'Upgrade-Insecure-Requests':'1',
		'Content-Type':'application/x-www-form-urlencoded',
		'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
		'Accept-Encoding':'gzip, deflate',
		'Accept-Language':'id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7',
		'Cookie':'_ga=GA1.2.131924726.1560439960; PHPSESSID=jjrqqaakmfcgfgbtjt8tve5595; _gid=GA1.2.1969561921.1561024035; _gat=1',
		})

	capt=req.get('http://alpha.payuterus.biz/').text
	bs1=Bs(capt,'html.parser')
	cap=(bs1.find('span',{'id':None}).text).split(' ')

	hasilcapt=int(cap[0]) + int(cap[2])
	key=bs1.find('input',{'name':'key'})['value']

	dataq={'nohp':num,
	'pesan':msg,
	'captcha':hasilcapt,
	'key':key,
	}
	res=req.post('http://alpha.payuterus.biz/send.php',data=dataq).text

#	print(res)
	return str(res)
Example No. 22
 def _scrape_raw_product_details(self, response):
     raw_html = response.content
     soup = Bs(raw_html, 'html.parser')
     self._set_name(soup)
     self._set_price(soup)
     self._set_original_price(soup)
     self._set_rating(soup)
Example No. 23
def init():

    try:
        print("[+] Requesting...")
        print(f'{link}')
        response = requests.get(link)
        response.raise_for_status()
        print("[+] Requesting succeded")

        # making the soup
        soup = Bs(response.content, "html.parser")
        print("[+] Html parsing succeded")

        # scraping token
        jeton = soup.select_one('#jeton').get('value')

        # getting the cookie
        cookie = response.cookies.get_dict()

        return soup, jeton, cookie

    except HTTPError as err:
        print(f'HTTP error occurred: {err}')
        print('[-] Requesting failed.')

    except Exception as err:
        print(f'Other error occurred: {err}')
        print('[-] Requesting failed.')
Example No. 24
    def get_number_of_results(self, response):
        raw_html = response.content
        soup = Bs(raw_html, 'html.parser')
        klass = '_2yAnYN'
        try:
            raw_results = soup.find('span', {'class': klass}).get_text()
            if raw_results is None:
                logging.error("No Results found for <h1> class: " + klass)
                exit()
            else:
                start = raw_results.index('of')
                end = raw_results.index('results')
                no_of_results = int(raw_results[start + 3:end - 1].replace(',', ''))
                logging.info('Number of results for ' + self.searchterm + ':' + str(no_of_results))
                if no_of_results > 10000:
                    print('Too many (' + str(no_of_results) + ') results for ' +
                          self.searchterm + '. Please extend your search term.')
                    print('Do you still want to continue? It will take a lot of time. (Y/N)')
                    choice = input()
                    if choice == 'Y' or choice == 'y':
                        return self.get_max_page(response)
                    elif choice == 'N' or choice == 'n':
                        exit()
                    else:
                        print('invalid choice, exiting')
                        exit()
                else:
                    print('No of results: ', no_of_results)
                    return self.get_max_page(response)
        except AttributeError:
            logging.error("screen format different for this search result, cant continue" + self.searchterm)
            return self.handle_different_screen_format()
Example No. 25
def scrapper(query):
    """
    :parameter str query: end of url to search
    :rtype: list of dict
    """
    infoboxes = []
    url = 'https://en.wikipedia.org/wiki/' + query
    try:
        raw = urlopen(url)
        print('{} opened successfully'.format(url))
        soup = Bs(raw, features='lxml')
        boxes = soup.find_all('table', {'class': 'infobox vcard'})
        for table in boxes:
            content_dict = dict()
            for br in soup.find_all('br'):
                br.replace_with('\n')
            for tr in table.find_all('tr'):
                if len(tr.contents) > 1:
                    content_dict[string_clean(tr.contents[0].text,
                                              True)] = string_clean(
                                                  tr.contents[1].text)
                elif tr.text:
                    content_dict[string_clean(tr.text, True)] = None
            infoboxes.append(content_dict)

        return infoboxes
    except URLError as error:
        if str(error.reason) == '[Errno 11001] getaddrinfo failed':
            messagebox.showerror(
                'URLError', message='No connection to internet.\nProbably...')
        else:
            print('|{}|'.format(error.reason))
            messagebox.showerror('URLError', message=error.reason)
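A hedged usage sketch, assuming the module-level imports and the string_clean helper that scrapper depends on are available; the argument is the URL-style article title.

boxes = scrapper('Python_(programming_language)')
if boxes:
    for field, value in boxes[0].items():
        print(field, ':', value)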
Example No. 26
    def crawl(self):
        self.driver.get(self.url.format(page=1))
        self.driver.maximize_window()

        now = datetime.datetime.now()
        reg_date = now.strftime('%Y-%m-%d %H:%M:%S')
        for page in range(1, 27):
            self.driver.get(self.url.format(page=page))
            length = self.driver.find_elements_by_xpath(
                '//*[@id="content"]/table/tbody/tr')
            for content in range(2, len(length) + 1):
                url = self.driver.find_elements_by_xpath(
                    '//*[@id="content"]/table/tbody/tr[{content}]/td[2]/a'.
                    format(content=content))
                self.tempUrl = url[0].get_attribute('href')
                self.driver.find_element_by_xpath(
                    '//*[@id="content"]/table/tbody/tr[{content}]/td[2]/a'.
                    format(content=content)).click()

                contents = self.driver.find_element_by_xpath(
                    '//*[@id="content"]').text
                html = self.driver.page_source
                self.soup = Bs(html, 'html.parser')
                data = self.soup.select('#content')

                _dict = {}  # build one record per detail page
                _dict['convention_name'] = 'exco'
                _dict['contents'] = contents
                _dict['page_source'] = str(data)
                _dict['source_url'] = self.tempUrl
                _dict['home_page'] = self.url
                _dict['reg_date'] = reg_date

                self.content_insert(_dict)
                self.driver.back()
Example No. 27
def requestEmploi(jeton, cookie, group, id):
    try:

        print(f'[+] Setup to request {group} emploi...')
        data = {}
        data["jeton"] = jeton
        data["id"] = id

        print("[+] Requesting emploi...")
        currentTime = timer()

        response = requests.post(link, cookies=cookie, data=data)

        elapsed = timer() - currentTime
        response.raise_for_status()
        print("[+] Requesting succeded " + str(elapsed))
        return Bs(response.content, "html.parser")
    except HTTPError as err:
        print(f'HTTP error occurred: {err}')
        print('[-] Requesting failed.')

    except Exception as err:
        print(f'Other error occurred: {err}')
        print('[-] Requesting failed.')
Example No. 28
def search(query):
    global pil
    c = 1
    req = ses.get('http://149.56.24.226/?s=' + query)
    bs = Bs(req.text, 'html.parser')
    hsl = bs.find_all('div', {'class': 'col-xs-9 col-sm-10 search-content'})
    for i in hsl:
        tit = i.find('a', {'rel': 'bookmark'})
        info['title'].append((tit.text, tit['href']))
    if len(info['title']) == 0:
        print("Could not find any movie titles")
        return True

    print("\n\t[ Result ]")
    for x in info['title']:
        print(f"{c}. {x[0]}")
        c += 1

    pil = int(input("_> choose: "))
    if pil <= 0:
        print("index out of range")
        return True

    print(" *Bypassing, please wait...")
    bypass(info['title'][pil - 1][1], info['title'][pil - 1][0])
Example No. 29
def get_result(content):
    soup = Bs(content, 'lxml')
    job_description = soup.select('dd[class="job_bt"]')
    job_description = str(job_description[0])
    # Strip the HTML tags, leaving only the plain description text.
    rule = re.compile(r'<[^>]+>')
    result = rule.sub('', job_description)
    return result
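A minimal sketch of calling get_result, assuming the lxml parser used above is installed; any HTML containing a dd element with class job_bt works, since the function strips the tags from the first match.

html = '<dd class="job_bt"><p>Job description:</p><p>Build and maintain crawlers.</p></dd>'
print(get_result(html))  # prints the tag-stripped description text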
Example No. 30
def get_images(i):
    page = requests.get(f'https://pamskenya.com/?page={i}').text
    soup = Bs(page, 'lxml')
    images = soup.find_all('img', class_='centered-image')
    for im in images:
        img_list.append((im["src"], im['alt']))

    if not os.path.exists('./images'):
        os.makedirs('./images')

    # Download each collected image once, skipping files that already exist.
    for img in img_list:
        url = img[0]
        name = str(img[1]).strip()

        if not os.path.exists(f'./images/{name}.jpg'):
            print("getting file")
            response = requests.get(url, stream=True)
            try:
                with open(f'./images/{name}.jpg', 'wb') as out_file:
                    print("writing to file")
                    shutil.copyfileobj(response.raw, out_file)
                    del response
                    print("writing Done")
            except Exception:
                fail.add(name)
                print(fail)
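A hedged usage sketch: img_list and fail are the module-level containers the function appends to, and requests, bs4, os and shutil are assumed to be imported as in the snippet.

img_list = []
fail = set()
get_images(1)  # scrape page 1 and download each centered image into ./images/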