예제 #1
0
def news_from_link(ref_link, news_from_globo):
    """Scrape one article from *ref_link*, classify it and persist it.

    Parameters
    ----------
    ref_link : str
        URL of the article, passed to ``NewsPlease.from_url``.
    news_from_globo : bool
        When True the publication date is re-scraped from the original
        page, because the date returned by NewsPlease is wrong for
        Globo pages (per the original comment).

    Side effects: downloads the article image, saves the news to the
    ``seguranca`` table and posts it when it is new and categorized.
    """
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
    }

    article = NewsPlease.from_url(ref_link)
    if article is None:
        return

    # Data returned by the NewsPlease
    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['abstract'].append(article.text)
    row['links'].append(article.url)

    if news_from_globo:
        # We need to get the date from the original url; the date returned
        # by NewsPlease is wrong for these pages.
        page_time = urllib.request.urlopen(article.url)
        soup_date = BeautifulSoup(page_time, 'html.parser')
        time_tag = soup_date.find_all('time',
                                      attrs={'itemprop': 'datePublished'})
        # NOTE(review): assumes at least one matching <time> tag exists —
        # an empty result would raise IndexError here, as in the original.
        row['date'].append(format_globo_date(time_tag[0].text))
    else:
        row['date'].append(str(article.date_publish))

    # Both '' and None mean "no image"; 0 is the sentinel stored in that case.
    path_image = article.image_url
    if not path_image:
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    news = News(row['abstract'], row['noticia'], row['date'], row['links'],
                row['titulos'], row['image'])

    try:
        print(row['titulos'])
        news_in_db = seguranca_table.check_news(news)
        print('news_in_db: ' + str(news_in_db))

        if not news_in_db:
            df, categories = seguranca_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))
            print(categories)

            # Persist and post only when at least one category was detected.
            if categories != [set()]:
                news.set_categories(categories)
                seguranca_table.save_news(news)
                seguranca_post.post_news(df)

    # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
    # are no longer swallowed while keeping the best-effort behaviour.
    except Exception:
        print('Empty News')
예제 #2
0
def social_news_from_link(ref_link):
    """Scrape one article plus its Facebook share counts, classify and persist it.

    Parameters
    ----------
    ref_link : str
        URL of the article, passed to ``NewsPlease.from_url``.

    Side effects: downloads the article image, queries SharedCount for
    Facebook metrics, saves the news to the ``midia`` table and posts it
    when it is new and categorized.
    """
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
        'fb_comment': [],
        'fb_share': [],
        'fb_reaction': [],
        'fb_total': [],
    }

    article = NewsPlease.from_url(ref_link)
    if article is None:
        return

    # Data returned by the NewsPlease
    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['links'].append(article.url)
    row['abstract'].append(article.text)
    row['date'].append(str(article.date_publish))

    # Both '' and None mean "no image"; 0 is the sentinel stored in that case.
    path_image = article.image_url
    if not path_image:
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    # Facebook engagement metrics for the article URL.
    fb_comment, fb_share, fb_reaction, fb_total = util.get_sharedcount_info(
        article.url)
    row['fb_comment'].append(fb_comment)
    row['fb_share'].append(fb_share)
    row['fb_reaction'].append(fb_reaction)
    row['fb_total'].append(fb_total)

    social_news = Social_News(row['abstract'], row['noticia'], row['date'],
                              row['links'], row['titulos'], row['image'],
                              row['fb_comment'], row['fb_share'],
                              row['fb_reaction'], row['fb_total'])

    try:
        print(row['titulos'])
        news_in_db = midia_table.check_news(social_news)
        print('news_in_db: ' + str(news_in_db))

        if not news_in_db:
            df, categories = midia_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))

            # Persist and post only when at least one category was detected.
            if categories != [set()]:
                social_news.set_categories(categories)
                midia_table.save_news(social_news)
                midia_post.post_news(df)

    # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
    # are no longer swallowed while keeping the best-effort behaviour.
    except Exception:
        print('Empty News')
예제 #3
0
def news_from_link(ref_link):
    """Scrape one article from *ref_link*, classify it and persist it.

    Parameters
    ----------
    ref_link : str
        URL of the article, passed to ``NewsPlease.from_url``.

    Side effects: downloads the article image, saves the news to the
    ``pessoas`` table and posts it when it is new and categorized.
    """
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
    }

    article = NewsPlease.from_url(ref_link)
    if article is None:
        return

    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['links'].append(article.url)
    row['abstract'].append(article.text)

    # Fall back to "now" when the scraped date is missing or in the future
    # (NewsPlease sometimes returns bogus publication dates).
    published = article.date_publish
    if published is None or published > datetime.datetime.now():
        row['date'].append(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    else:
        row['date'].append(published)

    # Both '' and None mean "no image"; 0 is the sentinel stored in that case.
    path_image = article.image_url
    print(path_image)
    if not path_image:
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    news = News(row['abstract'], row['noticia'], row['date'], row['links'],
                row['titulos'], row['image'])

    try:
        print(row['titulos'])
        news_in_db = pessoas_table.check_news(news)
        print('news_in_db: ' + str(news_in_db))

        if not news_in_db:
            df, categories = pessoas_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))

            # Persist and post only when at least one category was detected.
            if categories != [set()]:
                news.set_categories(categories)
                pessoas_table.save_news(news)
                pessoas_post.post_news(df)

    # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
    # are no longer swallowed while keeping the best-effort behaviour.
    except Exception:
        print('Empty News')
예제 #4
0
# Walk every teaser element in `noticias`, scrape the linked article,
# classify it and persist it when it is new and categorized.
for noticia in noticias:
    # Hoisted: the first anchor's href was previously computed twice
    # (once for the print, once for the scrape).
    link = noticia.find_all('a', href=True)[0]['href']
    print(link)
    article = NewsPlease.from_url(link)

    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
    }

    if article is None:
        continue

    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['links'].append(article.url)
    row['abstract'].append(article.text)
    row['date'].append(article.date_publish)

    # Both '' and None mean "no image"; 0 is the sentinel stored in that case.
    path_image = article.image_url
    print(path_image)
    if not path_image:
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    news = News(row['abstract'], row['noticia'], row['date'], row['links'],
                row['titulos'], row['image'])

    try:
        print(row['titulos'])
        news_in_db = midia_table.check_news(news)
        print('news_in_db: ' + str(news_in_db))

        if not news_in_db:
            df, categories = midia_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))

            # Persist and post only when at least one category was detected.
            if categories != [set()]:
                news.set_categories(categories)
                midia_table.save_news(news)
                midia_post.post_news(df)

    # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
    # are no longer swallowed while keeping the best-effort behaviour.
    except Exception:
        print('Empty News')
예제 #5
0
     
     # we need to get the date from the original url, the date returned by the NewsPlease is wrong
     page_time = urllib.request.urlopen(news_url)
     soup_date = BeautifulSoup(page_time, 'html.parser')
     time_tag = soup_date.find_all('time', attrs={'itemprop': 'datePublished'})
     public_date = time_tag[0].text 
     formated_date = format_date(public_date)
     
     row['titulos'].append(titulo)
     row['links'].append(news_url)
     row['date'].append(formated_date)
     row['noticia'].append(noticia)
     row['abstract'].append(noticia)
     if(image_url is not None):
         path_image = image_url
         row['image'].append(download_and_move_image(path_image))
     else:
         row['image'].append(0)
         
     news = News(row['abstract'], row['noticia'], row['date'], row['links'], row['titulos'], row['image'])
 
     try:
         print(row['titulos'])
         news_in_db = check_news(news)
         print('news_in_db: ' + str(news_in_db))
         if(not news_in_db):
             row = pd.DataFrame(row)
             df, categories = lexical_soup_globo(row)
             # DB categories
             if(categories != [set()]):
                 news.set_categories(categories)