# Example #1
# 0
def parse_url(url="http://www.highya.com/"):
    """Scrape the HighYa home page for its "latest reviews" articles.

    Fetches *url*, locates the "latest reviews:" <h3>, then fetches each
    linked review page and extracts title, category and body text.

    NOTE(review): the extracted values are only bound to locals; the
    original never stored or returned them, and that behavior is kept.
    """
    page = req_main(url)
    if not page:
        return
    soup = BeautifulSoup(page)
    latest_reviews_h3 = soup.find("h3", text=re.compile("latest reviews:"))
    if not latest_reviews_h3:
        return

    post_div = latest_reviews_h3.find_next("div", attrs={"class":"clearfix like-right-col"})
    if not post_div:
        return

    all_li = post_div.find_all("li")
    all_a = [li.find("a") for li in all_li]

    for a in all_a:
        link = "http://www.highya.com%s" % a.get("href")
        # BUG FIX: the original called req_main(url) here, re-fetching the
        # home page for every item; each review must be fetched via `link`.
        page2 = req_main(link)
        if not page2:
            continue

        soup2 = BeautifulSoup(page2)
        article_tag = soup2.find("article", attrs={"class":"product-article"})
        # BUG FIX: the original tested `article_div`, which is not defined
        # until several lines later (NameError); the fetched tag is
        # `article_tag`.
        if not article_tag:
            continue

        header = article_tag.find("header")
        domain = "www.highya.com"
        # BUG FIX: BeautifulSoup's `text` is a property, not a method;
        # calling it as `.text()` raises TypeError.
        main_title = soup2.find("title").text
        main_title_link = link
        article_div = article_tag.find("div", attrs={"class":"site-section section-article"})
        blog_title = article_div.find("h2").text
        blog_link = link
        itemtype = header.get("itemtype").split("/")[-1]
        category = itemtype
        cat_link = link
        entry_content = article_div
        entry_text = article_div.text
# Example #2
# 0
    def home_page_link(self):
        """Ensure the scrap table exists, then crawl each configured start
        page and hand the fetched HTML to get_link_from_first_page."""
        self.creat_avv_blog_scrap_table()

        # Earlier proxied-request variant, kept for reference:
        # r = self.req_proxy()
        # page = r.content
        # r.close()

        start_pages = [
            "http://testolimitfacts.com/",
            # "http://testolimitfacts.com/testo-limit-review/",
            #   "http://testolimitfacts.com/testo-limit-review/",
            #   "http://testolimitfacts.com/slimgenix-pro/",
            #   "http://testolimitfacts.com/power-pro/",
            #   "http://testolimitfacts.com/addium-brain-enhancer-another-scam/",
            #   "http://testolimitfacts.com/enduros-male-enhancement/",
            #   "http://testolimitfacts.com/testo-xl/",
            #   "http://testolimitfacts.com/is-spartagen-xt-scam/",
            #   "http://testolimitfacts.com/elite-test-360/",
            #   "http://testolimitfacts.com/honest-green-coffee-bean-extract/",
            #   "http://testolimitfacts.com/premium-natural-garcinia-cambogia/",
            #   "http://testolimitfacts.com/maximum-shred/",
            #   "http://testolimitfacts.com/extreme-home-profits-review-worth-the-money-or-a-scam/",
            #   "http://testolimitfacts.com/30-day-change/",
            #   "http://testolimitfacts.com/100-day-loans/",
        ]

        for start in start_pages:
            # page = main_req(start)
            fetched = req_main(start)
            if fetched:
                self.get_link_from_first_page(start, fetched)
# Example #3
# 0
    def home_page_link(self):
        """Ensure the scrap table exists, then fetch each configured home
        page and pass the HTML to get_link_from_first_page."""
        self.creat_avv_blog_scrap_table()

        # Earlier proxied-request variant, kept for reference:
        # r = self.req_proxy()
        # page = r.content
        # r.close()

        start_pages = [
            "http://www.healthyminimarket.com",
            # "http://www.healthyminimarket.com/page/2/",
            # "http://www.healthyminimarket.com/page/3/",
            # "http://www.healthyminimarket.com/page/4/",
            # "http://www.healthyminimarket.com/page/5/",
            # "http://www.healthyminimarket.com/page/6/",
            # "http://www.healthyminimarket.com/page/7/",
            # "http://www.healthyminimarket.com/page/8/",
            # "http://www.healthyminimarket.com/page/9/",
            # "http://www.healthyminimarket.com/page/10/",
            # "http://www.healthyminimarket.com/page/11/",
            # "http://www.healthyminimarket.com/page/12/",
            # "http://www.healthyminimarket.com/page/13/",
            # "http://www.healthyminimarket.com/page/15/",
            # "http://www.healthyminimarket.com/page/16/"
        ]

        for start in start_pages:
            # page = main_req(start)
            fetched = req_main(start)
            if fetched:
                self.get_link_from_first_page(fetched)
 def open_home_page(self):
     self.creat_avv_blog_scrap_table()
     # r = self.req_proxy()
     # page = r.content
     # r.close()
     link = "http://www.healthcaresdiscussion.com"
     #page = main_req(link)
     page = req_main(link)
     if page:
         link_to_extract = self.get_all_link_home_page(page)
         map(self.get_page_next_link, link_to_extract)
    def get_page_next_link(self, link):
        """Fetch *link* and extract its details, unless the link is already
        recorded in avv_blog_scrap_table.

        BUG FIX: the original interpolated the link straight into the SQL
        string ('%s' inside quotes) — an SQL-injection hole that also breaks
        on links containing quotes.  Use a parameterized query instead.
        NOTE(review): %s placeholder assumes a "format"-paramstyle DB-API
        driver (e.g. MySQLdb); switch to '?' if this runs on sqlite3.
        """
        sql = """SELECT * FROM avv_blog_scrap_table WHERE blog_link = %s"""
        self.cursor.execute(sql, (self.my_strip(link),))
        results = self.cursor.fetchall()

        if not results:
            # Earlier proxied-request variant, kept for reference:
            # r2 = self.req_proxy(link=link)
            # page2 = r2.content
            # r2.close()
            # page2 = main_req(link)
            page2 = req_main(link)
            if page2:
                self.get_detail_next_page(link, page2)
    def prev_home_page(self):
        """Crawl paginated home pages 2 and 3 and process every link found
        on each.

        BUG FIX: the original discarded the result of map(); under Python 3
        that lazy map is never consumed, so get_page_next_link never ran.
        Replaced with an explicit loop (identical behavior under Python 2).
        """
        self.creat_avv_blog_scrap_table()
        link_lists = ['http://www.healthcaresdiscussion.com/page/2/',
                      'http://www.healthcaresdiscussion.com/page/3/']

        for link in link_lists:
            # Earlier proxied-request variant, kept for reference:
            # r = self.req_proxy(link=link)
            # page = r.content
            # r.close()
            page = req_main(link)
            if page:
                link_to_extract = self.get_all_link_home_page(page)
                for next_link in link_to_extract:
                    self.get_page_next_link(next_link)