示例#1
0
 def testFileConnector(self):
     """Fetch one TripAdvisor review page, parse it, and save each record.

     Loads the ``trip_review_detail`` page config from
     ``../config/tripadvisor.json``, downloads the review page over HTTP,
     runs it through ``parser.parse_page`` and passes every parsed record
     to ``database.save_data``.

     There is deliberately no tautological assertion here: the test fails
     only when one of those steps raises.  Requires network access.
     """
     model = "trip_review_detail"
     webconfigs = config.load_pages("../config/tripadvisor.json")
     web_config = config.get_model_config(webconfigs, model)
     test_page = "http://www.tripadvisor.com.sg/ShowUserReviews-g294265-d2516429-r165190313-Auld_Alliance-Singapore.html#REVIEWS"
     html = urllib2.urlopen(test_page).read()
     # Only the parsed records are persisted; the follow-up links returned
     # alongside them are not used by this test.
     data, _links = parser.parse_page(html, web_config)
     for edata in data:
         database.save_data(edata, web_config["model"], web_config)
示例#2
0
    def testParser_trip_review_detail(self):
        """Parse a TripAdvisor review-detail page and print what was found.

        Loads the ``trip_review_detail`` page config from
        ``../config/tripadvisor.json``, downloads the review page over HTTP,
        parses it with ``parser.parse_page`` and prints the parsed data, the
        extracted links, and the length of each.  Nothing is asserted; the
        output is meant for manual inspection.  Requires network access.
        """
        pages_config = config.load_pages("../config/tripadvisor.json")
        model = "trip_review_detail"
        page_config = config.get_model_config(pages_config, model)

        test_page = "http://www.tripadvisor.com.sg/ShowUserReviews-g294265-d2516429-r139369230-Auld_Alliance-Singapore.html#REVIEWS"
        html = urllib2.urlopen(test_page).read()
        data, links = parser.parse_page(html, page_config)
        # Single-argument print(...) produces the same output as the
        # Python 2 print statement.
        print(data)
        print(links)
        print(len(data))
        print(len(links))
示例#3
0
def start(config_dir):
    """Prepare storage for every configured website and start crawling it.

    ``config_dir`` is the directory holding ``websites.json`` plus the
    per-website page config files that its entries name under ``"config"``.
    For each website, every page config whose ``"model"`` table does not
    pass ``database.check_table`` is created with
    ``database.initialize_table``; afterwards ``get_data_link`` is called
    with the website's ``"start_url"`` and ``"start_model"``.
    """
    websites_path = os.path.join(config_dir, "websites.json")

    for site in config.load_websites(websites_path):
        page_configs = config.load_pages(os.path.join(config_dir, site["config"]))
        for page_config in page_configs:
            table_name = page_config["model"]
            if database.check_table(table_name):
                continue
            database.initialize_table(table_name)
        # Invoked directly; a `get_data_link.delay(...)` variant was left
        # disabled here -- presumably a task-queue dispatch, confirm before
        # re-enabling.
        get_data_link(site["start_url"], site["start_model"], page_configs)
示例#4
0
    def testParser_hungry_list(self):
        """Parse the HungryGoWhere review listing and print links and counts.

        Picks the ``hungrygowhere_review_list`` entry out of
        ``../config/hungrygowhere.json`` (the last matching entry wins),
        downloads the listing page over HTTP, parses it and prints the
        extracted links followed by the number of records and links.
        Nothing is asserted.  Requires network access.
        """
        wanted_model = "hungrygowhere_review_list"
        page_config = None
        for candidate in config.load_pages("../config/hungrygowhere.json"):
            # Direct key lookup; entries without a "model" key never match.
            if candidate.get("model") == wanted_model:
                page_config = candidate

        listing_url = "http://www.hungrygowhere.com/reviews/"
        body = urllib2.urlopen(listing_url).read()
        records, found_links = parser.parse_page(body, page_config)
        print(found_links)
        print(len(records))
        print(len(found_links))
示例#5
0
    def testParser_trip_review(self):
        """Parse a TripAdvisor restaurant review page and print links and counts.

        Picks the ``trip_review`` entry out of ``../config/tripadvisor.json``
        (the last matching entry wins), downloads the restaurant page over
        HTTP, parses it and prints the extracted links followed by the
        number of records and links.  Nothing is asserted.  Requires
        network access.
        """
        wanted_model = "trip_review"
        page_config = None
        for candidate in config.load_pages("../config/tripadvisor.json"):
            # Direct key lookup; entries without a "model" key never match.
            if candidate.get("model") == wanted_model:
                page_config = candidate

        review_url = "http://www.tripadvisor.com.sg/Restaurant_Review-g294265-d2516429-Reviews-Auld_Alliance-Singapore.html"
        body = urllib2.urlopen(review_url).read()
        records, found_links = parser.parse_page(body, page_config)
        print(found_links)
        print(len(records))
        print(len(found_links))
示例#6
0
    def testParser_trip_main(self):
        """Check the record and link counts parsed from the hotel listing page.

        Picks the ``trip_mainpage`` entry out of
        ``../config/tripadvisor.json`` (the last matching entry wins),
        downloads the Singapore hotel listing over HTTP and parses it.  The
        test prints both counts and expects exactly 30 records and 32 links.
        Requires network access.
        """
        wanted_model = "trip_mainpage"
        page_config = None
        for candidate in config.load_pages("../config/tripadvisor.json"):
            # Direct key lookup; entries without a "model" key never match.
            if candidate.get("model") == wanted_model:
                page_config = candidate

        listing_url = "http://www.tripadvisor.com.sg/Hotels-g294265-Singapore-Hotels.html"
        body = urllib2.urlopen(listing_url).read()
        records, found_links = parser.parse_page(body, page_config)
        record_count = len(records)
        print(record_count)
        link_count = len(found_links)
        print(link_count)
        self.assertEqual(record_count, 30)
        self.assertEqual(link_count, 32)
示例#7
0
    def testParser_hungry_detail(self):
        """Parse one HungryGoWhere review page and print everything extracted.

        Picks the ``hungrygowhere_review_detail`` entry out of
        ``../config/hungrygowhere.json`` (the last matching entry wins),
        downloads the review page over HTTP, parses it and prints the links,
        the parsed data, and the length of each.  Nothing is asserted.
        Requires network access.
        """
        wanted_model = "hungrygowhere_review_detail"
        page_config = None
        for candidate in config.load_pages("../config/hungrygowhere.json"):
            # Direct key lookup; entries without a "model" key never match.
            if candidate.get("model") == wanted_model:
                page_config = candidate

        review_url = "http://www.hungrygowhere.com/singapore/je_crab_specialist_tampines/review/id-1f340200/"
        body = urllib2.urlopen(review_url).read()
        records, found_links = parser.parse_page(body, page_config)
        print(found_links)
        print(records)
        print(len(records))
        print(len(found_links))
示例#8
0
 def testLoadWebPages(self):
     """Print each link selector of the flight config and its match count.

     Picks the ``skyscanner_flight`` entry out of ``./skyscanner.json``
     (the last matching entry wins), downloads the city listing page over
     HTTP and builds a tree from it with ``fromstring``.  For every entry
     under the config's ``"links"`` key, the selector string is printed
     followed by the number of elements it matches.  Nothing is asserted.
     Requires network access.
     """
     wanted_model = "skyscanner_flight"
     page_config = None
     for candidate in config.load_pages("./skyscanner.json"):
         # Direct key lookup; entries without a "model" key never match.
         if candidate.get("model") == wanted_model:
             page_config = candidate
     listing_url = "http://www.skyscanner.net/flights-to/cheap-flights-to-cities-all.html?letter=a"
     tree = fromstring(urllib2.urlopen(listing_url).read())
     for link_rule in page_config["links"]:
         css = link_rule["selector"]
         # Compile before printing so a bad selector fails before any output.
         matcher = CSSSelector(css)
         print(css)
         print(len(matcher(tree)))
示例#9
0
        data.append(entity_data)

    # parse the link for next crawling job.
    baseUrl = config["baseurl"]
    for link in config["links"]:
        selector = CSSSelector(link["selector"]) 
        els = selector(h)
        for el in els:
            url = baseUrl + el.get("href")
            links.append((url, link["model"]))
        pass
    return data, links


if __name__ == "__main__":
    # Manual smoke run: parse the live Skyscanner city listing with the
    # "skyscanner_citylist" page config and print the parsed data and links.
    # Requires network access.
    #
    # The loader module is imported under an alias so that the loaded page
    # list does not rebind (and hide) the module it came from.
    from WebCrawler.config import config as config_loader
    page_configs = config_loader.load_pages("../../../tests/WebCrawler/config/skyscanner.json")
    model = "skyscanner_citylist"
    page_config = None
    for webpage in page_configs:
        # Direct key lookup; the last entry with a matching "model" wins.
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.skyscanner.net/flights-to/cheap-flights-to-cities-all.html?letter=a"
    html = urllib2.urlopen(test_page).read()
    data, links = parse_page(html, page_config)
    # Single-argument print(...) produces the same output as the Python 2
    # print statement.
    print(data)
    print(links)

示例#10
0
 def testTasks(self):
     """Run ``tasks.get_data_link`` on a Skyscanner London flights page.

     Loads the page configs from ``../config/skyscanner.json`` and passes
     them, together with the page URL and the ``skyscanner_flight`` model
     name, to ``tasks.get_data_link``.  Nothing is asserted; the test fails
     only if that call raises.
     """
     flight_page = "http://www.skyscanner.net/flights-to/lond/cheap-flights-to-london.html"
     flight_model = "skyscanner_flight"
     site_pages = config.load_pages("../config/skyscanner.json")
     tasks.get_data_link(flight_page, flight_model, site_pages)