Example #1
def getData():
    """Load the stored news documents from MongoDB into a pandas DataFrame."""
    name_database = config['database']
    name_collection = config['collection']

    mongoClient, db = mg.init_mongo(name_database)
    data = mg.extract_news(db, name_collection)
    news = pd.DataFrame(list(data))
    logger.info("Returning data...")
    return news
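
The mg helper module used throughout these examples is not shown on this page. A minimal sketch of what init_mongo, extract_news, and insert_doc might look like, assuming a local MongoDB server and pymongo (the names and connection details here are guesses, not the original implementation):

from pymongo import MongoClient

def init_mongo(name_database):
    # Hypothetical helper: connect to a local MongoDB server and return (client, database).
    client = MongoClient("localhost", 27017)
    return client, client[name_database]

def extract_news(db, name_collection):
    # Hypothetical helper: return a cursor over every document in the collection.
    return db[name_collection].find()

def insert_doc(db, name_collection, doc):
    # Hypothetical helper: store a single scraped result as one document.
    db[name_collection].insert_one(doc)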
Example #2
def crawl_newsLinks(driver, news_link):
    """Scrape one page of news search results and store each result in MongoDB.

    Returns the href of the next results page so the caller can keep paging.
    """
    mongoClient, db = mg.init_mongo(name_database)
    
#    logger.debug("Loading %s", news_link)
    driver.get(news_link)
    logger.debug("URL Loaded!")
    
    time.sleep(random.randrange(5, 10))  # random pause so the requests look less automated
    searchResults = driver.find_elements_by_xpath("//div[@class='g']")  # one div.g per search result
    next_news_link = driver.find_element_by_xpath("//td[@class='b navend'][2]/a").get_attribute("href")  # "Next" page link
    
    for searchResult in searchResults:
        
        searchData = dict()
        time.sleep(random.randrange(3, 7))  
        
        try:
            searchData['DirectLink'] = searchResult.find_element_by_xpath("./div/div/h3/a").get_attribute("href")
            searchData['Title'] = searchResult.find_element_by_xpath("./div/div/h3/a").text
        except NoSuchElementException:
            logger.error("Unable to find News directlink")
            continue
        
        try:
            date = searchResult.find_element_by_xpath("./div/div/div/span[3]").text
            searchData['Date'] = ut.date_parser(date)
        except NoSuchElementException:
            logger.warn("No date found")
            searchData['Date'] = None 
            
        try:
            summary = searchResult.find_element_by_xpath("./div/div/div[2]").text
            searchData['Summary'] = summary.replace(u"\u2018", "").replace(u"\u2019", "").replace(u"\u201c","").replace(u"\u201d", "").replace('"', "")
        except NoSuchElementException:
            logger.warn("No summary found")
            searchData['Summary'] = None
            
        mg.insert_doc(db, name_collection, searchData)
#        logger.debug("Data: %s \n", searchData)
    return next_news_link
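
The ut.date_parser helper called above is likewise not shown here. A minimal sketch assuming python-dateutil is available (the actual implementation may differ):

from dateutil import parser

def date_parser(raw_date):
    # Hypothetical helper: turn a free-form date string from the result page
    # into a datetime, returning None when it cannot be parsed.
    try:
        return parser.parse(raw_date, fuzzy=True)
    except (ValueError, OverflowError):
        return None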
Example #3
if __name__ == "__main__":

    logger.debug("Boilerpipe Crawler started")

    config = {}
    # Read the database and collection names from the external config file
    # (execfile is Python 2; the file is executed into the config dict).
    execfile(
        "C:\\Users\\user\\Documents\\spyderprojects\\selenium_project\\configFile.conf",
        config)

    no_crawling_list = ['forbes', 'techcrunch']

    name_database = config['database']
    name_collection = config['collection']

    mongoClient, db = mg.init_mongo(name_database)

    data = mg.extract_crawled_links(db, name_collection)

    for d in data:
        time.sleep(random.randrange(5, 9))
        logger.info("Title: %s", d['Title'])
        logger.info("Link: %s", d['DirectLink'])
        skip = False
        keepCrawling = True
        retry = 0

        for not_crawling in no_crawling_list:
            if not_crawling in d['DirectLink']:
                logger.warn("DirectLink is in not crawling list (%s) ...",
                            not_crawling)