Example #1
def scraper2():
    c = 0
    try:
        r2 = requests.get("https://www.elwatan.com/edition/actualite")
        unicode_str2 = r2.content.decode('utf8')
        encoded_str2 = unicode_str2.encode("utf-8")
    except:
        print("request failed try again")
        return
    if r2.status_code == 200:
        array = []
        soup = bf(encoded_str2, "html.parser")
        title = soup.find_all('h3', {'class': 'title-14'})
        img = soup.find_all('article', {'class': 'post post-tp-24'})
        for i in img:
            im = bf(str(i), "html.parser")
            try:
                im = im.find_all('img')
                im = im[0]["src"]
                array.append(im)
            except:
                array.append("https://i.ibb.co/8cfP6ZD/elwatan.png")
        for i in title:
            link = bf(str(i), "html.parser")
            link = link.find_all('a')
            link = link[0]["href"]
            articles(title=i.get_text().encode("utf-8"),
                     link=link,
                     img=array[c],
                     category=category(str(i.get_text().encode("utf-8")),
                                       0)).save()
            c = c + 1
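
These snippets omit their imports; throughout, bf is the alias used for BeautifulSoup. A minimal sketch of what the scraperN() functions above and below appear to assume (the articles model and the category() helper are project-specific and shown here only as commented placeholders with hypothetical module names):

import requests
from bs4 import BeautifulSoup as bf

# project-specific pieces (hypothetical names, not shown in the snippets):
# from news.models import articles     # ORM-style model with a .save() method
# from news.classify import category   # classifier mapping a title to a category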
Example #2
def scraper4():
    #   try to send a get request to the websites
    try:
        url = "https://tsa-algerie.com"
        working_proxy = check_proxies()
        if working_proxy != 0:
            scraper = cfscrape.create_scraper()
            proxies = {"http": working_proxy, "https": working_proxy}
            r4 = scraper.get(url, proxies=proxies, allow_redirects=True, timeout=(10, 20))
            unicode_str4 = r4.content.decode('utf8')
            encoded_str4 = unicode_str4.encode("utf-8")
            articles = []
            if r4.status_code == 200:

                soup = bf(encoded_str4, "html.parser")
                title = soup.find_all('h2', {'class': 'ntdga__title transition'})
                for i in range(0, 10):
                    link = bf(str(title[i].encode("utf-8")), "html.parser")
                    link = link.find_all('a')
                    link = link[0]["href"]
                    articles.append(str(title[i].get_text().encode("utf-8") + ": " + str(link)).replace(",", ""))

                list_to_csv(articles, "tsa")
                readcsv("tsa.csv")
                dataframe_to_csv("result", tsport, thealth, tscience)
                time.sleep(tsa_freq)

        else:
            print("no working proxy found")
    #   if request failed
    except:
        print("request failed try again")

    scraper4()
Example #3
def scraper3():
    try:
        r3 = requests.get("http://www.aps.dz")
        unicode_str3 = r3.content.decode('utf8')
        encoded_str3 = unicode_str3.encode("utf-8")
    except:
        print("request failed try again")
        return
    if r3.status_code == 200:

        soup = bf(encoded_str3, "html.parser")
        title = soup.find_all("h3", {"class": "allmode-title"})
        img = soup.find_all('div', {'class': 'allmode-img-top'})
        img = img + soup.find_all('div', {'class': 'allmode-img'})
        array = []
        c = 0
        for i in img:
            im = bf(str(i), "html.parser")
            try:
                im = im.find_all('img')
                im = im[0]["src"]
                array.append(im)
            except:
                array.append("https://i.ibb.co/1z2d99g/aps.jpg")
        for i in title:
            link = bf(str(i), "html.parser")
            link = link.find_all('a')
            link = link[0]['href']
            articles(title=i.get_text().encode("utf-8"),
                     link="http://www.aps.dz" + link,
                     img="http://www.aps.dz" + array[c],
                     category=category(str(i.get_text().encode("utf-8")),
                                       0)).save()
            c = c + 1
Example #4
def scraper3():
    try:
        r3 = requests.get("http://www.aps.dz")
        unicode_str3 = r3.content.decode('utf8')
        encoded_str3 = unicode_str3.encode("utf-8")
    except:
        print("request failed try again")
        return
    articles = []
    if r3.status_code == 200:

        soup = bf(encoded_str3, "html.parser")
        title = soup.find_all("h3", {"class": "allmode-title"})

        for i in title:
            link = bf(str(i), "html.parser")
            link = link.find_all('a')
            link = link[0]['href']
            articles.append(str(i.get_text().encode("utf-8") + ": " + "http://www.aps.dz" + str(link)).replace(",", ""))

        list_to_csv(articles, "aps")
        readcsv("aps.csv")
        dataframe_to_csv("result", tsport, thealth, tscience)
        time.sleep(aps_freq)
        scraper3()
Example #5
def scraper4():
    #   try to send a get request to the websites
    try:
        url = "https://tsa-algerie.com"
        working_proxy = check_proxies()
        if working_proxy != 0:
            scraper = cfscrape.create_scraper()
            proxies = {"http": working_proxy, "https": working_proxy}
            r4 = scraper.get(url,
                             proxies=proxies,
                             allow_redirects=True,
                             timeout=(10, 30))
            unicode_str4 = r4.content.decode('utf8')
            encoded_str4 = unicode_str4.encode("utf-8")
            if r4.status_code == 200:
                soup = bf(encoded_str4, "html.parser")
                title = soup.find_all('h2',
                                      {'class': 'ntdga__title transition'})
                for i in range(0, 10):
                    link = bf(str(title[i].encode("utf-8")), "html.parser")
                    link = link.find_all('a')
                    link = link[0]["href"]
                    articles1.append(str(title[i].get_text().encode("utf-8") + ": " + str(link)).replace(",", ""))
                    articles(title=title[i].get_text().encode("utf-8"),
                             link=link,
                             img="https://i.ibb.co/QMZ7VBg/tsa.jpg",
                             category=category(
                                 str(title[i].get_text().encode("utf-8")), 0)).save()

        else:
            print("no working proxy found")
    #   if request failed
    except:
        print("request failed try again")
Example #6
def checkLink():

    global p
    url = """https://github.com/search?l=JavaScript&o=desc&p=%(page)s&q=%(location)s&ref=advsearch&type=Users"""
    url2 = """https://github.com/"""
    data = {"page": p, "location": "location%3AIndia"} #location can be set here insted of India, any desired location could be entered.
    site = (url % data)
    print site
    print "\n\n"
    try:
        data = urllib2.urlopen(site, data=None, timeout=60)
        tag = bf(data.read())
        user = tag.findAll('div',{'class': 'user-list-info'})
        mydb = MySQLdb.connect(host='localhost',
                                user='******',
                                passwd='password',
                                db='gitcrawler')# change the database information
        cursor = mydb.cursor()
        for r in user:
            username = r.find('a').string
            locate = r.find('li')
            location = locate.text
            date = r.findAll('li')[2:3]
            fullname = r.contents[2:3]
            name = str(fullname)[9:-8]
            site2 = (url2+username)
            userpage = urllib2.urlopen(site2,data=None, timeout=60)
            userinfo = bf(userpage.read())
            information = userinfo.findAll('a', {'class': 'url'}, text=True)
            link = None
            for x in information:
                link = ''.join(x.findAll(text=True))
            print link
            joining_date = "Not Available"
            for n in date:
                joindate = n.findAll('span')[1:]
                for s in joindate:
                    joining_date = ''.join(s.findAll(text=True))
            email = r.findAll('a')[1:2]
            mail = None
            for q in email:
                mail = q.get('data-email')
                mail = urllib.unquote(mail).decode('utf8')
                print "---------------------------------------------------------------------------"
                print '|%s | %s | %s | %s | %s | %s |' % (location, username, joining_date, mail, name, link)
            try:
                len(mail)  # change the table name in the insert query to match the language
                cursor.execute('''INSERT INTO javascript(username, language, currentlocation, email, joining_date, name, link) VALUES(%s, %s, %s, %s, %s, %s, %s)''', (username, lang, location, mail, joining_date, name,link))
            except TypeError:
                cursor.execute('''INSERT INTO javascript(username, language, currentlocation, email, joining_date, name, link) VALUES(%s, %s, %s, %s, %s, %s, %s)''', (username, lang, location, mail, joining_date, name, link))
        mydb.commit()
        cursor.close()
        p += 1
    except urllib2.HTTPError, e:
        print "Http error to many request , sleeping for 10 seconds..."
        print e
        sleep(10)
Example #7
def main(url):      # entertainment section
  domain = 'https://www.setn.com'
  three = requests.get(url).text
  three=bf(three, 'html.parser')
  title=three.find('h3',{'class':'view-li-title'}).find('a',{'class':'gt'}).text
  text=three.find('h3',{'class':'view-li-title'}).find('a',{'class':'gt'})['href']
  text_1=requests.get(domain+text)
  text_1=bf(text_1.text,'html.parser')
  text_1=text_1.find('div',{'class':'Content2'}).text
  text_1=re.sub('(圖/\w*)','',text_1)
  title_content=(title,text_1)
  return   list(title_content)
Example #8
 def get_comment(self, html):
     bs4_comment = bf(html, 'lxml')
     comment = bs4_comment.find(name='div', class_='mouth-main')
     comment = str(comment)
     res1 = re.compile('<style.*?>.*?</style>', re.S)
     content = res1.sub('', comment)
     res2 = re.compile('<script.*?>.*?</script>', re.S)
     content = res2.sub('', content)
     res3 = re.compile('\n', re.S)
     content = res3.sub('', content)
     # comment = bs4_comment.select('div .mouth-main')
     bs4_comment = bf(content, 'lxml')
     print(bs4_comment.get_text())
Example #9
 def getSmallCounty(countyURL):
     get_r = requests.get(countyURL)
     get_soup = bf(get_r.text, "html.parser")
     smallCounty = get_soup.select("area")
     if len(smallCounty) != 0:
         for i in smallCounty:
             smallCountyLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" +
                                    i["href"].rstrip())
             # visit the township metadata page to grab the township name
             get_r2 = requests.get("http://cmdweb.pcc.gov.tw/pccms/owa/" +
                                   i["href"].rstrip())
             get_soup2 = bf(get_r2.text, "html.parser")
             smallCountyNameList.append(
                 get_soup2.find("font", color="red").text)
Example #10
def scraper1():

    #   send request and encode results

    try:
        r1 = requests.get("https://www.liberte-algerie.com/actualite")
        unicode_str1 = r1.content.decode('utf8')
        encoded_str1 = unicode_str1.encode("utf-8")
    except:
        print("no working proxy found")
        return

    if r1.status_code == 200:

        #   create beautiful soup object

        soup = bf(encoded_str1, "html.parser")

        #   get article title

        title = soup.find_all('a', {'class': 'title'})
        mg = soup.find_all('div', {'class': 'span-8'})
        img = mg[0].find_all('li')
        array = []
        co = 0
        for i in img:
            im = bf(str(i), "html.parser")
            try:
                im = im.find_all('img')
                im = im[0]["src"]
                array.append(im)
            except:
                array.append("https://i.ibb.co/fDDLYQc/libre.jpg")
        c = 0

        #   create table rows to save the articles

        for i in title:

            #   get link and article as text

            link = title[c]['href']
            articles(title=i.get_text().encode("utf-8").strip(),
                     link="https://www.liberte-algerie.com" + link,
                     img=array[c],
                     category=category(str(i.get_text().encode("utf-8")),
                                       0)).save()
            c = c + 1
Example #11
def smallCounty(URL):
    r = requests.get(URL)
    soup = bf(r.text, "html.parser")
    mainLink = soup.select("frameset frame")
    mainLink = "http://cmdweb.pcc.gov.tw" + mainLink[0]["src"]  #找iframe網址
    r2 = requests.get(mainLink)
    soup2 = bf(r2.text, "html.parser")
    county = soup2.find("map").select("area")
    # county/city links
    for i in county:
        countyLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" +
                          i["href"].rstrip())
    # cross-district links and county/city names
    for i in countyLink:
        r3 = requests.get(i)
        soup3 = bf(r3.text, "html.parser")
        acrossNameList.append(soup3.select("font")[1].text[:-6])
        countyNameList.append(soup3.select("font")[1].text[1:3])
        acrossLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" +
                          soup3.find("a")["href"].rstrip())
    # township links and township names
    def getSmallCounty(countyURL):
        get_r = requests.get(countyURL)
        get_soup = bf(get_r.text, "html.parser")
        smallCounty = get_soup.select("area")
        if len(smallCounty) != 0:
            for i in smallCounty:
                smallCountyLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" +
                                       i["href"].rstrip())
                # visit the township metadata page to grab the township name
                get_r2 = requests.get("http://cmdweb.pcc.gov.tw/pccms/owa/" +
                                      i["href"].rstrip())
                get_soup2 = bf(get_r2.text, "html.parser")
                smallCountyNameList.append(
                    get_soup2.find("font", color="red").text)

    for i in countyLink:
        getSmallCounty(i)

    # build a DataFrame from the township links
    smallCountyNameList.extend(countyNameList)
    smallCountyLink.extend(acrossLink)
    dic_smallCounty = {"鄉鎮": smallCountyNameList, "table連結": smallCountyLink}
    df_smallCounty = pd.DataFrame(dic_smallCounty)
    return df_smallCounty


#df_smallCounty.to_csv("smallCounty.csv",encoding="utf_8_sig")
Example #12
def job_seek():
    target_url = 'https://www.104.com.tw/jobbank/custjob/index.php?r=cust&j=503a4224565c3e2430683b1d1d1d1d5f2443a363189j48&jobsource=joblist_b_relevance#info06'
    print('Start parsing the 104 job listings....')
    rs = requests.session()
    res = rs.get(target_url)
    res.encoding = 'utf-8'
    soup = bf(res.text, 'html.parser')
    content = ""
    temp = []
    reback = []
    for date in soup.select('.joblist_cont .date'):
        if date.text == '':
            temp.append('緊急!!重點職務')
        else:
            temp.append(date.text)
    for v, data in enumerate(soup.select('.joblist_cont .jobname a'), 0):
        link = data['href']
        title = data['title']
        content += '發布時間->{}\n工作名稱->{}\n連結網址->{}\n'.format(
            temp[v], title, 'https://www.104.com.tw' + link)
        if v % 5 == 0:
            if v == 0:
                continue
            reback.append(TextSendMessage(text=content))
            content = ''
    return reback
Example #13
def get_all_url():
    rsp = requests.get(main_url, timeout=10)
    rsp_html = rsp.text.encode('ISO-8859-1').decode('gbk').encode(
        'utf-8').decode('utf-8')
    # print(rsp_html)
    soup = bf(rsp_html, 'html.parser')
    # print(soup)
    # lists = soup.select_one('#list')
    # print(lists)
    lists = soup.select('div > dl > dd > a')

    print(lists)  # pat = re.compile(r'第二章:千梅(1')
    # print(pat.findall(rsp_html))
    print(len(lists))
    url_dict_list = []
    for url in lists:
        url_dict = {}
        url_dict[url.text] = url.get('href')
        url_dict_list.append(url_dict)
    with open('url.json', 'w') as f:
        json.dump(url_dict_list, f)
    print(url_dict_list)


# get_all_url()
Example #14
async def main(url_passed):
    list_of_links = scrap_content(url_passed)

    count = 0

    for links in list_of_links:
        site_files = requests.get(f'{url}/{links}').text

        soup_content = bf(site_files, 'lxml')

        page_title = soup_content.find('h1')
        content = soup_content.find('article', class_='content')

        file_name = links.replace('.shtml', '/').split('/')

        file_name = file_name[len(file_name) - 2].replace('-', '_')

        count += 1

        with open(os.path.join(path, f'{file_name}.html'),
                  'w',
                  encoding="utf-8") as fs:
            fs.write(str(page_title).replace("\n", "").replace("  ", ""))
            fs.write(str(content))

    print(count)
    print("All the pages has been crawled successfully!")
Example #15
def symbol_statics_parser(ticker):
    print("start " + ticker)
    staticmap = {}
    url = "https://finance.yahoo.com/quote/%s/key-statistics?p=%s" % (ticker,
                                                                      ticker)
    response = requests.get(url, verify=False)
    sp = bf(response.text, features="lxml")
    tables = sp.find_all("table")
    for tb in tables:
        rows = tb.findChildren(['th', 'tr'])
        for row in rows:
            cells = row.findChildren('td')
            for cell in cells:
                for s in cell.strings:
                    if s is not None and "Forward Annual Dividend Yield" in s:
                        staticmap["Forward Annual Dividend Yield"] = cells[
                            len(cells) - 1].string
                    if s is not None and "Payout Ratio" in s:
                        staticmap["Payout Ratio"] = cells[len(cells) -
                                                          1].string
                    if s is not None and "52 Week High" in s:
                        staticmap["52 Week High"] = cells[len(cells) -
                                                          1].string
                    if s is not None and "52 Week Low" in s:
                        staticmap["52 Week Low"] = cells[len(cells) - 1].string
                    if s is not None and "Diluted EPS (ttm)" in s:
                        staticmap["Diluted EPS (ttm)"] = cells[len(cells) -
                                                               1].string
                    if s is not None and "Ex-Dividend Date" in s:
                        staticmap["Ex-Dividend Date"] = cells[len(cells) -
                                                              1].string
    return staticmap
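
A usage sketch for symbol_statics_parser(); the ticker below is arbitrary, and the returned dict only holds whichever of the listed statistics Yahoo Finance actually rendered for that symbol:

stats = symbol_statics_parser("AAPL")  # any listed ticker symbol
print(stats.get("Payout Ratio"), stats.get("52 Week High"))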
Example #16
def WeiBo_page(index=2, *args):
    browser = Browser_Driver()
    # use args to distinguish the search keyword

    if not args:
        for i in range(index):
            try:
                time.sleep(3)
                el = browser.find_element_by_xpath(
                    '//*[contains(text(),"正在加载中,请稍候...")]')
                time.sleep(2)
                browser.execute_script("arguments[0].scrollIntoView();", el)
                time.sleep(2)
            except Exception as e:
                time.sleep(3)
                el = browser.find_element_by_xpath(
                    '//*[contains(text(),"正在加载中,请稍候...")]')
                time.sleep(2)
                browser.execute_script("arguments[0].scrollIntoView();", el)
                time.sleep(2)
    else:
        key = args[0]
        browser.find_element_by_xpath(
            '//*[@id="weibo_top_public"]/div/div/div[2]/input').send_keys(key)
        browser.find_element_by_xpath(
            '//*[@id="weibo_top_public"]/div/div/div[2]/a').click()

    weibo_pages = bf(browser.page_source, 'lxml')
    browser.quit()
    return weibo_pages
Example #17
def Get_Article_PO(PO_Article_Url):
    PO_Aricle = urlopen(PO_Article_Url)

    PO_Aricle_Obj = bf(PO_Aricle.read().decode('GB2312', 'ignore'),
                       'html.parser')

    print('<h1 class=\"print\">',
          PO_Aricle_Obj.find_all('h1')[1].string,
          '</h1>',
          file=fp)
    print('<hr>', file=fp)
    print("<div class=\"author no-print\">",
          PO_Aricle_Obj.find('div', class_='author cf').string,
          "</div>",
          "\n\n",
          file=fp)
    print("<a class=\"no-print\" href=\"",
          PO_Article_Url,
          "\">",
          "原文链接",
          "</a>",
          "\n\n",
          file=fp)

    for i in PO_Aricle_Obj.find_all('p'):
        if (i.string):
            print(i, file=fp)
    print('<br>', file=fp)
Example #18
 def __init__(self, url):
     self.url = url
     self.reponse = requests.get(url)
     self.soup_page = bf(self.reponse.text, 'lxml')
     self.collection_equipe = CollectionEquipe()
     self.lst_compet = [
         'France : Ligue 1', 'France : Ligue 2',
         'Angleterre : Premier League', 'Angleterre : League Championship',
         'Espagne : Liga BBVA', 'Italie : Serie A',
         'Allemagne : Bundesliga', 'Portugal : Liga Sagres',
         'Belgique : Pro League', 'Pays-Bas : Eredivisie'
     ]
     self.lst_index = []
Example #19
 def crawler(self):
     for i in range(82900, 83300):
         r = requests.get(self.input_url + str(i) + ".htm")
         soup = bf(r.text, "lxml")
         info = soup.find("div", id="shop_info").findAll("div")
         if len(info) == 1:
             continue
         data = {}
         print(i)
         try:
             data["店名"] = soup.find("div",
                                    id="shop_name").find("h3").text.replace(
                                        '\r',
                                        ';').replace('\n',
                                                     ';').replace('|', ';')
             data["電話"] = soup.find("div", id="shop_tel").text.replace(
                 '\r', ';').replace('\n', ';').replace('|', ';')
             data["地址"] = soup.find("div", id="shop_add").text.replace(
                 '\r', ';').replace('\n', ';').replace('|', ';')
             data["營業時間"] = soup.find(
                 "div", id="opening").find("span").text.replace(
                     '\r', ';').replace('\n', ';').replace('|', ';')
         except:
             print("go")
         self.output_pois.append(data)
Example #20
def crwal():
    req = request.Request(url, headers=headers)
    handler = request.BaseHandler()
    opener = request.build_opener(handler)
    response = opener.open(req)
    # logger.info(response.read().decode('utf-8'))
    page = response.read().decode('utf-8')
    soup = bf(page, 'lxml')
    itemlist = soup.select('#J_goodsList > ul > li > div')
    totalprice = 0.0
    pricelist = []
    for item in itemlist:
        priceStr = item.find(class_='p-price').strong.get_text().replace(
            '¥', '')
        price = float(priceStr)
        pricelist.append(price)
        # print(price)
        # print(type(price))
        totalprice = totalprice + price
    pricelist.sort(reverse=True)
    averageprice = totalprice / len(itemlist)
    print(averageprice)
    # priceString = unicode(nString)
    r = RedisUtil.getredis()

    param = {
        'date': time.strftime('%Y-%m-%d', time.localtime()),
        'price': averageprice,
        'itemlist': pricelist
    }

    r.lpush('JDPrice', param)
Example #21
def add_hw(title, due, time, the_class):
    session = requests.session()
    url = 'https://myhomeworkapp.com/login'
    #Add user and pass for myHomework app
    data = {'username': '******', 'password': '******'}
    hw_login = session.get(url)
    tree = html.fromstring(hw_login.text)
    data['csrfmiddlewaretoken'] = list(
        set(tree.xpath("//input[@name = 'csrfmiddlewaretoken']/@value")))[0]
    hw_login_response = session.post(url, data=data, headers=dict(referer=url))
    hw_add = session.get('https://myhomeworkapp.com/homework/add')
    soup = bf(hw_add.text, 'html.parser')
    req_class = ''
    for i in soup.find_all('option'):
        if i.text == the_class:
            req_class = i['value']
    hw_data = {
        'title': title,
        'cls': req_class,
        'type': 1,
        'due_date': due,
        'due_time': time,
        'repeats': 0,
        'save': 'Save'
    }
    tree2 = html.fromstring(hw_add.text)
    hw_data['csrfmiddlewaretoken'] = list(
        set(tree2.xpath("//input[@name = 'csrfmiddlewaretoken']/@value")))[0]
    submit_url = 'https://myhomeworkapp.com/homework/add'
    hw_submit = session.post(submit_url,
                             data=hw_data,
                             headers=dict(referer=submit_url))
    print(hw_submit)
    return 0
Example #22
def info_scraper(imdb_id):
    title_url = 'https://www.imdb.com/title/'
    res = requests.get(url=title_url + imdb_id).text
    soup = bf(res, 'html5lib')
    rating = float(
        soup.find(name='span', attrs={
            "itemprop": "ratingValue"
        }).text)
    x = soup.find(name='h1', attrs={'class': ''}).text
    title = x.split("\xa0")[0] + ' ' + x.split("\xa0")[1].strip()
    info = soup.find(name='div', class_='subtext').text
    duration = info.split("|\n")[0].strip()
    genres = info.split("|\n")[1].split("\n")[0] + info.split("|\n")[1].split(
        "\n")[1]
    yor = info.split("\n")[-2]
    img_url = soup.find(name='div',
                        class_='poster').find(name='img').attrs['src']
    hd = 'UX650_CR1,0,680,1000_AL__QL50.jpg'
    img_url = img_url[:-27] + hd  ## for heroku '27', for local '32'
    try:
        text = soup.find(name='div', attrs={
            'class': 'inline canwrap'
        }).find('p').text
        storyline = ''
        for i in text.strip().split("\n"):
            storyline = storyline + " " + i
    except:
        storyline = 'No storyline found for this movie.'

    return [rating, title, duration, genres, yor, img_url, storyline]
Example #23
def review_scrapper(imdb_id):
    reviews_df = pd.DataFrame(
        columns=['rating', 'title', 'username', 'review_date', 'review_text'])
    title_url = 'https://www.imdb.com/title/'
    res = requests.get(url=title_url + imdb_id + '/reviews').text
    soup = bf(res, 'html5lib')
    review_boxs = soup.find_all(name='div',
                                attrs={'class': 'review-container'})
    for i in review_boxs:
        try:
            rate = int(i.find_all(name='span')[1].text)
        except:
            rate = 'nan'
        title = i.find(name='a', class_='title').text.strip()
        username = i.find(name='div',
                          class_='display-name-date').find('a').text
        review_date = i.find(
            name='div',
            class_='display-name-date').find(class_='review-date').text
        review_text = i.find(name='div', class_='text show-more__control').text
        temp = {
            'rating': rate,
            'title': title,
            'username': username,
            'review_date': review_date,
            'review_text': review_text
        }
        reviews_df = reviews_df.append(temp, ignore_index=True)
    return reviews_df
Example #24
def get_target(web):
    """get the target tags which contain the covid information
       of each country and region then return tags
       Note: param <web> has to be in text format or byte format, not textwraper"""
    soup = bf(web, 'lxml')
    tbody = soup.find('tbody')
    trs = tbody.find_all(name='tr', class_=False)
    return trs
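
A possible way to call get_target(); the snippet does not show where the page text comes from, so the URL below is only an assumption — any page whose first <tbody> holds one <tr> per country or region works:

import requests

web = requests.get("https://www.worldometers.info/coronavirus/").text  # assumed source page
rows = get_target(web)
print(len(rows), "country/region rows")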
Example #25
def get_html_from_avito(params):
    http = urllib3.PoolManager()
    answer = ''

    avito_url = get_avito_url(params)
    r = http.request('GET', avito_url)
    soup = bf(r.data, 'html.parser')
    return soup
Example #26
def get_html_from_avito():
    http = urllib3.PoolManager()
    answer = ''
    r = http.request(
        'GET',
        'https://www.avito.ru/rossiya/kvartiry/prodam?pmax=5000000&pmin=0&s_trg=4&f=549_5696-5697-5698-5699.59_13990b.497_5185b '
    )
    soup = bf(r.data, 'html.parser')
    return soup
Example #27
def get_page_source(driver, url):
    try:
        driver.get(url)
    except Exception as e:
        print e
        return get_page_source(driver, url)
    else:
        pagesource = driver.page_source
        soup = bf(pagesource, 'html.parser')
        return soup
Example #28
def get_twitter():
    # read the HTML from the driver's page source
    page = bf(driver.page_source, 'html5lib')
    # locate the target elements
    mu_mes = page.find(
        'div', {'style': re.compile('position: relative; min-height: ')})
    mes_links = mu_mes.find_all('div', {'lang': "en"})
    # collect the output
    for mes in mes_links:
        word.append(mes.span.get_text())
Example #29
    def multi_process(self, result):

        WIDTH = 320
        HEIGHT = 640
        PIXEL_RATIO = 3.0
        UA = random.choice(user_agent_phone)

        mobileEmulation = {
            "deviceMetrics": {
                "width": WIDTH,
                "height": HEIGHT,
                "pixelRatio": PIXEL_RATIO
            },
            "userAgent": UA
        }
        options = webdriver.ChromeOptions()
        options.add_experimental_option('mobileEmulation', mobileEmulation)
        # the next two lines run Chrome headless (no browser window needed)
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=options)
        # driver.get('https://mitangzhicheng.m.tmall.com/shop/shop_auction_search.htm?suid=2095767659&sort=default')
        # the list below could instead be a dict mapping each shop name to its link
        # shop_link_list=['http://shop114249705.m.taobao.com','http://shop103991552.m.taobao.com','https://esey.m.tmall.com/','https://mitangzhicheng.m.tmall.com/','https://msuya.m.tmall.com/?shop_id=66459281']

        shopWangwang, shopId, wx_shop_url, shop_url = result
        if not self.check_repeat(shopId, self.get_date):
            myFormat('店铺【%s】在【%s】 已经保存过了,跳过' % (shopWangwang, self.get_date))
            return pd.DataFrame({})
        driver.get(wx_shop_url)
        myFormat('正在处理店铺【%s】' % shopWangwang, symbol='.', fillMode='right')
        # print(driver.page_source)
        try:
            soup = bf(driver.page_source)
            fans_counter = soup.select('.collect-counter')[0].text.strip()
            temp = round(
                float(re.search(r'(\d+\.*\d*)', fans_counter).group(1)), 3)
            fans_counter_num = int(temp *
                                   10000) if '万' in fans_counter else int(temp)
        except:
            # print(driver.page_source)
            # fans_counter_num=soup.select('div[data-role="shop_head"] span')
            # print(fans_counter_num)

            fans_counter = -1
            fans_counter_num = -1
        print(shopWangwang, fans_counter)  # e.g. 33.3万 (i.e. 333,000)
        data_list = [[
            self.get_date, shopWangwang, fans_counter, fans_counter_num,
            shop_url, shopId, OPERATOR_NAME
        ]]
        data_dict = dict(zip(range(len(data_list)), data_list))
        df = pd.DataFrame(data_dict)
        driver.close()
        return df.T
Example #30
def obj_url(url):
    obj = requests.get(
        url=url,
        # proxies={'http',''},
        headers={
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        })
    obj.encoding = 'gb2312'
    soup = bf(obj.text, 'lxml')
    return soup
Example #31
def scrapeBook(url):
    url = f"http://books.toscrape.com/catalogue/{url}"
    html = requests.get(url)

    control = bf(html.content, 'html.parser')
    title = control.select(".product_main h1")[0].get_text()
    price = control.select(".product_main p.price_color")[0].get_text()
    stock = control.select(".product_main .instock")[0].get_text().strip(
    ).split(' ')[2].split('(')[1]
    dic = {'title': title, 'price': price, 'stock': stock}
    return dic
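
scrapeBook() expects a path relative to the books.toscrape.com catalogue; a usage sketch (the slug below is an assumption taken from the site's listing page, any valid book page path works):

book = scrapeBook("a-light-in-the-attic_1000/index.html")  # hypothetical slug
print(book)  # -> {'title': ..., 'price': ..., 'stock': ...}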
Example #32
def index():
	if request.method == "POST":
		if 'username' in session:
			url = request.form["url"]
			links = db.links
			if not validators.url(url):
				url = "http://" + url
			if not validators.url(url):
			    return render('form.html', error='URL is incorrect')
			else:
				existing_url = links.find_one({'url': url})
				if not existing_url:
					current_time = str(datetime.now())

					print current_time
					print url

					cur_user = db.users.find_one({'name': session['username']})

					html = None

					try:
						html = urllib2.urlopen(url)
						html = html.read()
						soup = bf(html)
						title = url
						try:
							title = soup.find('title').text
						except Exception:
							pass
						
						db.links.insert({
							'url': url, 
							'title': title,
							'author': cur_user['name'],
							'author_id': cur_user['_id'],
							'current_time': current_time,
							'votes': 1
							})

						return render('form.html', error="New item is added")

					except Exception:
						return render('form.html', error="URL is incorrect")

				else:
					return render('form.html', error="URL already exists")
		else:
			flash('Please log in')
			return redirect(url_for('login'))

	return render('form.html')
Example #33
start_time = time.time()

if(len(sys.argv) == 1):
    sys.exit(0)
else:
    if(sys.argv[1]=='crawl'):
        url = "https://www.ptt.cc/bbs/beauty/index.html"
        base = "https://www.ptt.cc"
        data=[]
        key = [False,False,False]
        while(1):
            #use url to fetch new page
            r = requests.get(url)
            #time.sleep(0.01)
            soup = bf(r.text.encode('utf-8'),"html.parser")
            # below is the data that needs to be saved
            
            #first we get the next page url
            info = soup.find_all(class_="btn-group btn-group-paging")[0].find_all(class_="btn wide")
            
            url = base + info[1]['href']
            #print(url)

            #end loop if in the first page
            if( len(info) == 3 and '下頁' in info[1].text):
                break

            #second we parse the data
            word = soup.find_all(class_="r-ent")
            word.reverse()