def testFileConnector(self):
    model = "trip_review_detail"
    webconfigs = config.load_pages("../config/tripadvisor.json")
    web_config = config.get_model_config(webconfigs, model)
    test_page = "http://www.tripadvisor.com.sg/ShowUserReviews-g294265-d2516429-r165190313-Auld_Alliance-Singapore.html#REVIEWS"
    html = urllib2.urlopen(test_page).read()
    data, links = parser.parse_page(html, web_config)
    # Persist every parsed entity so the full parse-and-save path is exercised.
    for edata in data:
        database.save_data(edata, web_config["model"], web_config)
    self.assertTrue(len(data) > 0)
def testParser_trip_review_detail(self):
    pages_config = config.load_pages("../config/tripadvisor.json")
    model = "trip_review_detail"
    page_config = config.get_model_config(pages_config, model)
    # Alternate review page for the same restaurant:
    # http://www.tripadvisor.com.sg/ShowUserReviews-g294265-d2516429-r165190313-Auld_Alliance-Singapore.html
    test_page = "http://www.tripadvisor.com.sg/ShowUserReviews-g294265-d2516429-r139369230-Auld_Alliance-Singapore.html#REVIEWS"
    html = urllib2.urlopen(test_page).read()
    data, links = parser.parse_page(html, page_config)
    print data
    print links
    print len(data)
    print len(links)
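# config.get_model_config is used in the two tests above, while the older tests
# below still locate the page config by hand. A minimal sketch of what the helper
# is assumed to do, mirroring that manual lookup (the real implementation in
# WebCrawler.config may differ):
def _get_model_config_sketch(pages_config, model):
    """Return the page config dict whose "model" field equals `model`, else None."""
    for webpage in pages_config:
        if webpage.get("model") == model:
            return webpage
    return None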
def start(config_dir):
    """Start the crawling process for every configured website."""
    websites = config.load_websites(os.path.join(config_dir, "websites.json"))
    for website in websites:
        webconfigs = config.load_pages(os.path.join(config_dir, website["config"]))
        # Make sure every model has a backing table before crawling begins.
        for webconfig in webconfigs:
            if not database.check_table(webconfig["model"]):
                database.initialize_table(webconfig["model"])
        # Queue the job asynchronously via Celery instead by using:
        # get_data_link.delay(website["start_url"], website["start_model"], webconfigs)
        get_data_link(website["start_url"], website["start_model"], webconfigs)
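# For reference, config entries consistent with the keys read by start() and
# parse_page() might look like the following. All values are illustrative only;
# the real JSON files under config/ define the actual URLs, models, and selectors.
#
# websites.json entry:
#   {"config": "tripadvisor.json",
#    "start_url": "http://www.tripadvisor.com.sg/Hotels-g294265-Singapore-Hotels.html",
#    "start_model": "trip_mainpage"}
#
# page config entry (one dict per model in e.g. tripadvisor.json):
#   {"model": "trip_mainpage",
#    "baseurl": "http://www.tripadvisor.com.sg",
#    "links": [{"selector": "a.property_title", "model": "trip_review"}]}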
def testParser_hungry_list(self):
    pages_config = config.load_pages("../config/hungrygowhere.json")
    model = "hungrygowhere_review_list"
    page_config = None
    for webpage in pages_config:
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.hungrygowhere.com/reviews/"
    html = urllib2.urlopen(test_page).read()
    data, links = parser.parse_page(html, page_config)
    print links
    print len(data)
    print len(links)
def testParser_trip_review(self):
    pages_config = config.load_pages("../config/tripadvisor.json")
    model = "trip_review"
    page_config = None
    for webpage in pages_config:
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.tripadvisor.com.sg/Restaurant_Review-g294265-d2516429-Reviews-Auld_Alliance-Singapore.html"
    html = urllib2.urlopen(test_page).read()
    data, links = parser.parse_page(html, page_config)
    print links
    print len(data)
    print len(links)
def testParser_trip_main(self):
    pages_config = config.load_pages("../config/tripadvisor.json")
    model = "trip_mainpage"
    page_config = None
    for webpage in pages_config:
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.tripadvisor.com.sg/Hotels-g294265-Singapore-Hotels.html"
    html = urllib2.urlopen(test_page).read()
    data, links = parser.parse_page(html, page_config)
    print len(data)
    print len(links)
    # The Singapore hotels listing is expected to yield 30 entities and 32 follow-up links.
    self.assertEqual(len(data), 30)
    self.assertEqual(len(links), 32)
def testParser_hungry_detail(self):
    pages_config = config.load_pages("../config/hungrygowhere.json")
    model = "hungrygowhere_review_detail"
    page_config = None
    for webpage in pages_config:
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.hungrygowhere.com/singapore/je_crab_specialist_tampines/review/id-1f340200/"
    html = urllib2.urlopen(test_page).read()
    data, links = parser.parse_page(html, page_config)
    print links
    print data
    print len(data)
    print len(links)
def testLoadWebPages(self):
    webpages = config.load_pages("./skyscanner.json")
    model = "skyscanner_flight"
    page_config = None
    for webpage in webpages:
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.skyscanner.net/flights-to/cheap-flights-to-cities-all.html?letter=a"
    html = urllib2.urlopen(test_page).read()
    h = fromstring(html)
    # Exercise each configured link selector against the live page.
    for link in page_config["links"]:
        selector = CSSSelector(link["selector"])
        print link["selector"]
        els = selector(h)
        print len(els)
        data.append(entity_data)

    # Parse the links that seed the next crawling jobs; each is a
    # (url, model) pair telling the crawler which config to apply next.
    base_url = config["baseurl"]
    for link in config["links"]:
        selector = CSSSelector(link["selector"])
        els = selector(h)
        for el in els:
            url = base_url + el.get("href")
            links.append((url, link["model"]))
    return data, links


if __name__ == "__main__":
    from WebCrawler.config import config
    # Keep the `config` module name distinct from the loaded page configs.
    pages_config = config.load_pages("../../../tests/WebCrawler/config/skyscanner.json")
    model = "skyscanner_citylist"
    page_config = None
    for webpage in pages_config:
        if webpage.get("model") == model:
            page_config = webpage
    test_page = "http://www.skyscanner.net/flights-to/cheap-flights-to-cities-all.html?letter=a"
    html = urllib2.urlopen(test_page).read()
    data, links = parse_page(html, page_config)
    print data
    print links
def testTasks(self):
    pages_config = config.load_pages("../config/skyscanner.json")
    model = "skyscanner_flight"
    test_page = "http://www.skyscanner.net/flights-to/lond/cheap-flights-to-london.html"
    tasks.get_data_link(test_page, model, pages_config)
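# tasks.get_data_link is exercised above but not shown. A minimal sketch of its
# assumed behaviour, pieced together from start(), testFileConnector, and the
# (url, model) link pairs produced by parse_page; the real Celery task may differ:
def _get_data_link_sketch(url, model, webconfigs):
    web_config = config.get_model_config(webconfigs, model)
    html = urllib2.urlopen(url).read()
    data, links = parser.parse_page(html, web_config)
    # Persist every parsed entity under the model's table.
    for edata in data:
        database.save_data(edata, web_config["model"], web_config)
    # Follow each discovered link with the model its selector was configured for.
    for next_url, next_model in links:
        _get_data_link_sketch(next_url, next_model, webconfigs)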