Example #1
def _news_scrapper():
    for n in config()['news_sites']:
        news_site = config()['news_sites'][n]

        host = news_site['url']
        homepage = news.HomePage(news_site, host)

        total = len(homepage.article_links)
        index = 1
        for link in homepage.article_links:
            article = _fetch_article(news_site, host, link)

            if article and article.title is not None:
                if (db.news.find_one({"title": article.title}) is None):
                    db.news.insert_one({
                        "title": article.title,
                        "content": article.body,
                        "category": article.category,
                        "image": build_link(host, article.image),
                        "date": datetime.datetime.utcnow()
                    })
                    progress(
                        index, total, 'Num of articles: {}'.format(
                            db.news.count_documents({})))
                else:
                    progress(index, total, 'Article already exists!')
            index += 1
    client.close()  # close the Mongo connection once every site has been processed
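Example #1 differs from the rest in that it writes straight to MongoDB and reports progress instead of collecting a list. It relies on a module-level client/db pair and a progress helper that the snippet does not show; the sketch below is an assumed setup (the database name, connection string, and progress signature are guesses, not the original code).

import sys
import pymongo

# Assumed MongoDB handles used by Example #1 (db.news, client.close()).
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['news_scraper']


def progress(index, total, message=''):
    # Rewrite a single console line with "index/total message".
    sys.stdout.write('\r{}/{} {}'.format(index, total, message))
    sys.stdout.flush()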
Example #2
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    for link in homepage.article_links:
        print(link)
Example #3
def main(news_site_id):
    url = config()['news_sites'][news_site_id]['url']
    logging.info(f'Starting to scrape {url}')
    homepage = news.HomePage(news_site_id, url)

    logging.info('Getting header links')
    header_links = [_build_link(url, link) for link in homepage.header_links]

    logging.info('Getting news pages links')
    sections = [
        news.SectionPage(news_site_id, section) for section in header_links
    ]
    links = [section.article_links for section in sections]
    flat_links = [
        _build_link(url, item) for sublist in links for item in sublist
    ]

    logging.info('Fetching Articles')
    #articles = [news.ArticlePage(news_site_id,article) for article in flat_links]
    # _save_articles(news_site_id, flat_links)
    articles = [
        _fetch_article(news_site_id, url, article) for article in flat_links
    ]
    articles = [article for article in articles if article]

    logging.info('Saving articles')
    _save_articles(news_site_id, articles)
    logging.info('Articles saved')
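Examples #1, #3, and #13 call a build_link/_build_link helper to turn the relative hrefs scraped from the page into absolute URLs. None of the snippets show it; below is a minimal sketch assuming urljoin semantics are acceptable.

from urllib.parse import urljoin


def _build_link(host, link):
    # urljoin keeps absolute links untouched and resolves
    # site-relative paths against the host.
    return urljoin(host, link)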
Example #4
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    ud_programs = []

    if news_site_uid == 'udistrital':
        for info in homepage.udistrital_info:
            ud_info_programs = news.UdProgramsPage(news_site_uid, info)

            if ud_info_programs:
                logger.info('Info fetched!')
                ud_programs.append(ud_info_programs)
                # break

        _save_ud_programs(news_site_uid, ud_programs)  # save once, after all program pages have been visited
    else:
        for link in homepage.article_links:
            article = _fetch_article(news_site_uid, host, link)

            if article:
                logger.info('Article fetched!')
                articles.append(article)
                break

        _save_articles(news_site_uid, articles)
Example #5
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    # categories

    # articles

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info("I'm alive!!!")
            articles.append(article)
            # print(article.title)  # removed the print so the code runs
            # break
    # print(len(articles))  # removed the print so the code runs

    # code that creates the output file
    _save_articles(news_site_uid, articles)
Example #6
def _fetch_links(news_site_uid, host):
    logger.info('Start fetching links at {}'.format(news_site_uid))
    links = []
    try:
        homepage = news.HomePage(news_site_uid, host)
        links = homepage.article_links
    except Exception as e:
        logger.error('ERROR fetching links: {}'.format(e), exc_info=False)
    return links
Example #7
def _new_scraper(new_site_uid):
    host = config()['news_sites'][new_site_uid]['url']
    logging.info('Scanning {}'.format(host))
    homepage = news.HomePage(new_site_uid, host)
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(new_site_uid, host, link)
        if article:
            logger.info('Article found')
            articles.append(article)
    _save_articles(new_site_uid, articles)
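Almost every snippet mixes logging.info with logger.info, which implies a module-level logger configured above these functions. A sketch of the setup the examples appear to assume (the format is illustrative):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)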
Example #8
async def _fetch_links(news_site_uid, session):
    error = 0
    links = []
    try:
        homepage = news.HomePage(news_site_uid)
        await homepage.visit(session)
        links = homepage.article_links
    except Exception as e:
        logger.error('ERROR fetching links: {}'.format(e), exc_info=False)
        error = 'ERROR fetching links: {}'.format(e)
    return (error, links, news_site_uid)
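Example #8 is a coroutine, so it needs an event loop and a shared HTTP session to drive it. A minimal driver sketch, assuming homepage.visit() expects an aiohttp session (the site list and function name here are illustrative, not from the original):

import asyncio
import aiohttp


async def _scrape_all(news_site_uids):
    async with aiohttp.ClientSession() as session:
        tasks = [_fetch_links(uid, session) for uid in news_site_uids]
        # Each task resolves to the (error, links, news_site_uid) tuple above.
        return await asyncio.gather(*tasks)


# results = asyncio.run(_scrape_all(['xataka', 'enter.co']))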
Example #9
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logger.info('Beginning scrapper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)
    _save_articles(news_site_uid, articles)
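All the snippets delegate the per-article work to a _fetch_article helper that none of them show. A plausible sketch, assuming it wraps news.ArticlePage (mentioned in the commented-out line of Example #3) and swallows request errors so one bad link does not abort the whole run; logger and _build_link are the module-level objects assumed elsewhere in this section.

def _fetch_article(news_site_uid, host, link):
    # Hypothetical helper used by the examples above.
    logger.info('Start fetching article at {}'.format(link))

    article = None
    try:
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except Exception as e:
        logger.warning('Error while fetching the article: {}'.format(e))

    # Discard pages that did not parse into an article with a body.
    if article and not article.body:
        logger.warning('Invalid article, there is no body')
        return None

    return article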
Example #10
def _news_scraper(news_site_id):
    host = config()["news_sites"][news_site_id]["url"]

    logger.info(f"Beginning scraper for {host=}")
    homepage = news.HomePage(news_site_id, host)
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_id, host, link)
        if article:
            logger.info("Article fetched!!")
            articles.append(article)

    _save_articles(news_site_id, articles)
Example #11
def _news_scrapper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Starting scrapper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched correctly.')
            articles.append(article)
        # print(article.title)
    #print(f'We got {len(articles)} articles')
    _save_articles(news_site_uid, articles)
Example #12
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info(f'Beginning scrape for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        #print(link)
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
Example #13
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Starting Scrapper for {host}...')
    articles = []

    homepage = news.HomePage(news_site_uid, host)
    for link in homepage.article_links:
        fetch_link = build_link(host, link)
        article = _fetch_article(news_site_uid, host, fetch_link)
        if article:
            articles.append(article)
            logger.info(f'Getting {fetch_link} [{article.title}]')
    logger.info(f'{len(articles)} articles were fetched!...')
    logger.info('Writing data...')
    _save_articles(news_site_uid, articles)
Example #14
def _news_scraper(news_site_uid):  # news_site_uid because we use the config keys as IDs {xataka, enter.co}
    host = config()['news_sites'][news_site_uid]['url']  # {https://www.xataka.com/, https://www.enter.co/}

    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)  # holds a set with the homepage links

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched!!')
            articles.append(article)
            # break  # breaks the loop as soon as the first article is fetched

    _save_articles(news_site_uid, articles)
Example #15
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info(f'Beginning scraper for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched!!')
            articles.append(article)  # will store objects of the subclass
    # print(articles)

    _save_articles(news_site_uid, articles)
Example #16
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info(f'Beginning scrape for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article extracted!!')
            articles.append(article)

    #print(len(articles))
    _save_articles(news_site_uid, articles)
Example #17
def _financial_scrapper(financial_site_uid):
    host = config()['financial_sites'][financial_site_uid]['url']

    logging.info(f'Beginning scraper for {host}')
    logging.info('Finding links in homepage...')
    homepage = news.HomePage(financial_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(financial_site_uid, host, link)

        if article:
            logger.info('Article fetched!!!')
            articles.append(article)


    _save_articles(financial_site_uid, articles)
Example #18
def _news_scraper(news_site):
    '''Scrape a news website.
    @param news_site: the site to scrape, one of the keys defined in config.yaml'''
    host = common.config()['news_sites'][news_site]['url']

    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site, host, link)

        if article:
            logger.info('Article fetched!!')
            articles.append(article)

    _save_article(news_site, articles)
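Every example starts from config(); Example #18's docstring points at config.yaml, and Example #29 even passes the file name explicitly. A minimal sketch of such a loader, assuming a PyYAML-backed implementation with a one-shot cache:

import yaml

__config = None


def config(config_file='config.yaml'):
    # Load config.yaml once and reuse the parsed structure afterwards.
    global __config
    if __config is None:
        with open(config_file) as f:
            __config = yaml.safe_load(f)
    return __config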
Example #19
def _news_scrapper(news_site_uid):
    # config() loads the YAML structure; here we read the url of the news site
    host = config()['news_sites'][news_site_uid]['url']

    logging.info('Beginning scrapper for {}'.format(host))  # equivalent to f'Beginning scrapper for {host}'
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
Example #20
def _news_scraper(news_site_uid):
    #take the url from the config.yaml
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    #create a news homepage object
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        #call the function that fetches the article
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched!')
            articles.append(article)
    #save info in a csv file
    _save_articles(news_site_uid, articles)
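Example #20's closing comment says the result is saved to a CSV file. A sketch of what that _save_articles helper could look like, assuming one file per site per day and deriving the columns from the article object's public attributes (both are assumptions):

import csv
import datetime


def _save_articles(news_site_uid, articles):
    # Hypothetical CSV writer for the scraped articles.
    if not articles:
        return

    now = datetime.datetime.now().strftime('%Y_%m_%d')
    out_file_name = '{}_{}_articles.csv'.format(news_site_uid, now)

    # Use the article's public attributes as column names.
    csv_headers = [prop for prop in dir(articles[0]) if not prop.startswith('_')]

    with open(out_file_name, mode='w+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)
        for article in articles:
            writer.writerow([str(getattr(article, prop)) for prop in csv_headers])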
Example #21
def _news_scraper(news_site_uid):
    host = config()["news_sites"][news_site_uid]["url"]

    logging.info("Beginning scraper for {}".format(host))
    home_page = news.HomePage(news_site_uid, host)

    articles = []
    for link in home_page.article_links:
        article = _fetch_article(news_site_uid, link)

        if article:
            logger.info("Article fetched!!")
            articles.append(article)
            print(article.title)

    print(len(articles))
    _save_articles(news_site_uid, articles)
Example #22
def _news_scraper(news_site_uid):
    """Module that runs the scraper for the selected site."""
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scrapper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        #print(link)
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)
            #break
            #print(article.title)
    #print(len(articles))
    _save_articles(news_site_uid, articles)
Example #23
def _news_scrapper(news_sites_uid):
    host = config()['news_sites'][news_sites_uid]['url']

    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_sites_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_sites_uid, host, link)

        if article:
            logger.info('Article fetched!!')
            articles.append(article)
            print(article.title)

    print(len(articles))

    _save_articles(news_sites_uid, articles)
Example #24
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logger.info(f"Beginning scraper for {host}")
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info(Fore.GREEN +
                        '📥📥📥 Article Fetched! 📥📥📥')
            #Colorama
            print(Style.RESET_ALL)

            articles.append(article)

    _save_articles(news_site_uid, articles)
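Example #24 colors its log line with Colorama, which only works reliably (especially on Windows) if the names are imported and init() has been called. The assumed setup:

from colorama import Fore, Style, init

init()  # enable ANSI color handling across platforms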
Example #25
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info(f'Starting the scraper for: {host}')
    logging.info('Finding links on the homepage...')

    homepage = news.HomePage(news_site_uid, host)

    articles = []

    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched successfully!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
Example #26
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info('Beginning scraper for {}'.format(host))
    logging.info('Finding links in homepage...')

    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched!')
            articles.append(article)
            print(article.title)

    print(len(articles))
Example #27
def _news_scraper(news_site_uid):
    print(news_site_uid)
    host_url = config()['news_sites'][news_site_uid]['url']
    host_url2 = config()['news_sites'][news_site_uid]['url2']

    logger.info(f'\tBeginning scraper for {host_url}')

    # 1. Go to main page and get all the tech link articles
    homepage = news.HomePage(news_site_uid, host_url)

    articles = []  # list to store the tech articles
    for link in homepage.article_links:
        #print(link)
        article = _fetch_article(news_site_uid, host_url2, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)

    _save_article(news_site_uid, articles)
Example #28
def _news_scrapper(news_site_uid):
    #url
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Starting scrapper for {}'.format(host))

    homepage = news.HomePage(news_site_uid, host)
    articles = []
    print("homepage {}".format(homepage.article_links))
    
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched')
            articles.append(article)
            print(article.title)
        print(len(articles))
   
    _save_articles(news_site_uid, articles)
Example #29
def _news_scraper(news_site_uid):
    # get the url from the config
    host = config('config.yaml')['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))

    # Pass the site name and the url to the HomePage class
    homepage = news.HomePage(news_site_uid, host)
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)  # fixes up each article's link
        if article:
            logger.info('Article fetched!!')
            # Collect the articles that were found
            articles.append(article)
            #print(article.title)
        #if len(articles)==3:
        #	break
    # Save the articles
    _save_articles(news_site_uid, articles)
Example #30
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info(f'Starting scrape for {host}')

    homepage = news.HomePage(news_site_uid, host)

    for link in homepage.article_links:
        print(link)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        if article:
            logger.info('Article fetched')
            articles.append(article)
            print(article.title)

    print(len(articles))