import time
import random
import logging

import pandas as pd
from selenium.common.exceptions import NoSuchElementException

# Project-local helper modules, imported under the aliases used throughout the original code:
# mg - MongoDB helpers (init_mongo, extract_news, extract_crawled_links, insert_doc)
# ut - misc utilities (date_parser)
import mg
import ut

# Logging configuration (handlers, level) is assumed to be set up elsewhere in the project.
logger = logging.getLogger(__name__)


def getData():
    # `config` is populated in the __main__ block below via execfile.
    name_database = config['database']
    name_collection = config['collection']
    mongoClient, db = mg.init_mongo(name_database)
    data = mg.extract_news(db, name_collection)
    news = pd.DataFrame(list(data))
    logger.info("Returning data...")
    return news
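
# Usage sketch (an assumption; getData is not called anywhere in the code shown here):
# load the crawled news into a DataFrame for downstream analysis, e.g.
#     news_df = getData()
#     print(news_df.head())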


def crawl_newsLinks(driver, news_link):
    # Relies on the module-level globals name_database / name_collection set in __main__.
    mongoClient, db = mg.init_mongo(name_database)
    # logger.debug("Loading %s", news_link)
    driver.get(news_link)
    logger.debug("URL Loaded!")
    time.sleep(random.randrange(5, 10))
    searchResults = driver.find_elements_by_xpath("//div[@class='g']")
    try:
        next_news_link = driver.find_element_by_xpath(
            "//td[@class='b navend'][2]/a").get_attribute("href")
    except NoSuchElementException:
        # Last results page: no "next" link to follow.
        logger.warning("No next-page link found")
        next_news_link = None
    for searchResult in searchResults:
        searchData = dict()
        time.sleep(random.randrange(3, 7))
        try:
            searchData['DirectLink'] = searchResult.find_element_by_xpath(
                "./div/div/h3/a").get_attribute("href")
            searchData['Title'] = searchResult.find_element_by_xpath(
                "./div/div/h3/a").text
        except NoSuchElementException:
            logger.error("Unable to find News directlink")
            continue
        try:
            date = searchResult.find_element_by_xpath("./div/div/div/span[3]").text
            searchData['Date'] = ut.date_parser(date)
        except NoSuchElementException:
            logger.warning("No date found")
            searchData['Date'] = None
        try:
            summary = searchResult.find_element_by_xpath("./div/div/div[2]").text
            # Strip curly single/double quotes and plain double quotes from the summary.
            searchData['Summary'] = (summary.replace(u"\u2018", "")
                                            .replace(u"\u2019", "")
                                            .replace(u"\u201c", "")
                                            .replace(u"\u201d", "")
                                            .replace('"', ""))
        except NoSuchElementException:
            logger.warning("No summary found")
            searchData['Summary'] = None
        mg.insert_doc(db, name_collection, searchData)
        # logger.debug("Data: %s \n", searchData)
    return next_news_link
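

# A minimal pagination sketch (not part of the original script): crawl_newsLinks returns the
# "next page" href, so a caller can keep following it until no further page is found.
# start_url and the max_pages cap are illustrative assumptions.
def crawl_all_pages(driver, start_url, max_pages=10):
    link = start_url
    for _ in range(max_pages):
        if not link:
            break
        link = crawl_newsLinks(driver, link)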


if __name__ == "__main__":
    logger.debug("Boilerpipe Crawler started")
    config = {}
    # Populate `config` (expects 'database' and 'collection' keys) from the local config file.
    execfile(
        "C:\\Users\\user\\Documents\\spyderprojects\\selenium_project\\configFile.conf",
        config)
    # Domains whose links should not be crawled directly.
    no_crawling_list = ['forbes', 'techcrunch']
    name_database = config['database']
    name_collection = config['collection']
    mongoClient, db = mg.init_mongo(name_database)
    data = mg.extract_crawled_links(db, name_collection)
    for d in data:
        time.sleep(random.randrange(5, 9))
        logger.info("Title: %s", d['Title'])
        logger.info("Link: %s", d['DirectLink'])
        skip = False
        keepCrawling = True
        retry = 0
        for not_crawling in no_crawling_list:
            if not_crawling in d['DirectLink']:
                logger.warning("DirectLink is in not crawling list (%s) ...", not_crawling)