def parse(self, response):
        # print(type(response))

        article = None
        try:
            article = NewsPlease.from_html(response.body.decode("utf-8"))
        except Exception:
            # Fall back to latin-1 when the body is not valid UTF-8.
            article = NewsPlease.from_html(response.body.decode("latin-1"))
            print("Exception occurred while decoding the response body")

        print(article.date_publish)
        # print(article.text)
        article2 = Article(url="", language="es")
        article2.set_html(response.body)
        article2.parse()

        print(response.url)
        self.db.articles_es.insert_one({
            "title": article.title,
            "pub_date": article.date_publish,
            "url": response.url,
            "content": article2.text,
            "raw_html": response.body
        })

        links = self.linkExtractor.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse)
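A minimal sketch of the spider scaffolding the parse() method above assumes; the class name, seed URL, Mongo connection string, and database name are hypothetical stand-ins, not taken from the original project.

import scrapy
from scrapy.linkextractors import LinkExtractor
from pymongo import MongoClient
from newsplease import NewsPlease  # used by parse() above
from newspaper import Article      # used by parse() above


class SpanishNewsSpider(scrapy.Spider):
    name = "news_es"
    start_urls = ["https://example.com/"]  # placeholder seed URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Attributes the parse() method above expects to find on the spider.
        self.linkExtractor = LinkExtractor()
        self.db = MongoClient("mongodb://localhost:27017/")["news"]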
def custom_crawl_articles():
    url_file = open('crawled_urlIndex/manual_urls.txt', 'r')
    urls = []
    counter = 0
    for line in url_file.readlines():
        counter += 1
        print(counter)
        try:
            cdata = {}
            article = NewsPlease.from_url(line.strip())
            purl = urlparse(line)
            cdata['src'] = str(
                purl.netloc.replace("www.", "").replace(".com", ""))
            cdata['url'] = line.strip()
            cdata['title'] = article.title.strip()
            cdata['og_title'] = article.title.strip()
            cdata['content'] = article.text.strip()
            cdata['lang'] = article.language.strip()
            adate = article.date_publish
            if adate is None:
                continue
            dateObj = datetime.datetime.strptime(str(adate),
                                                 "%Y-%m-%d %H:%M:%S")
            publishedDate = dateObj.strftime("%d %b %Y")
            cdata['dateObj'] = dateObj
            cdata['publishedDate'] = publishedDate
            urls.append(cdata)
        except Exception:
            continue
    json_to_csv(urls)
    return jsonify(data=urls)
Example #3
def get_data(path, destination):
    links_list = set()
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            link = line.split()[0]
            if len(link) < 10:
                continue
            links_list.add(link)
    links_list = list(links_list)
    final_outputs = {}
    important_keys = [
        'authors', 'date_publish', 'description', 'image_url', 'language',
        'title', 'maintext'
    ]

    multiple_index = 200
    # Round up so the final partial batch of links is not dropped.
    for i in tqdm(range((len(links_list) + multiple_index - 1) // multiple_index)):
        keys = links_list[i * multiple_index:(i + 1) * multiple_index]
        values = NewsPlease.from_urls(keys, timeout=6)
        for key, value in values.items():
            paper_data = {}
            for im_key in important_keys:
                paper_data[im_key] = value.__dict__[im_key]
            final_outputs[key] = paper_data
    pickle.dump(final_outputs, open(destination, 'wb'))
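A hypothetical invocation of the helper above; both file names are placeholders (one URL per line in the input file, a pickle of the extracted fields as output).

import pickle

get_data('links.txt', 'articles.pkl')
with open('articles.pkl', 'rb') as f:
    articles = pickle.load(f)
print(len(articles), 'articles extracted')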
Example #4
    def read(self):
        url_file = open('news_urls.txt', 'r')
        articles = []
        i = 0

        for line in url_file:
            try:
                article = NewsPlease.from_url(line, timeout=3)
                print(article.title, file=self.title_file)
                article.text = article.text.replace('\n', '')
                print(article.text, file=self.content_file)
                articles.append(article.text)
                sleep(1)
                i += 1
                if i % 10 == 0:
                    if i != 0:
                        print(i, 'articles collected so far')
            except Exception:
                print('failure, next article')

        #clean workspace
        print(i, 'news articles collected on', self.term, '\n')
        url_file.close()
        os.remove('news_urls.txt')

        return articles
Example #5
 def crawl_newsapi_fulltext(self):
     """
     crawl_newsapi_fulltext enriches existing rows in Article table that do not have 
         fulltext by going to the associated URL, scraping the site, then obtaining the 
         fulltext of the article and saving it to the database
     """
     # For article filtering against None (Null in database), "is" and "is not" does not work
     articles = Article.query.filter(
         and_(Article.article_url != None,
              Article.article_fulltext == None)).all()
     n = 1
     nmax = 4000  # number of articles to be processed at a time
     for article in articles:
         with suppress(Exception):
             newsplease_article = NewsPlease.from_url(article.article_url)
             article.article_fulltext = newsplease_article.text
             article.article_wordcount = len(
                 newsplease_article.text.split(" "))
             print(n)
             print(article.article_url)
             print(newsplease_article.title)
             print(newsplease_article.text)
             print('-----------------')
         db.session.flush()
         n = n + 1
         if n > nmax:
             break
     db.session.commit()
Example #6
    def handle(self, *args, **options):

        news = RSSNews(RSS_Links)
        telegraph = Telegraph(access_token=os.getenv('TELEGRAPH_ACCESS_TOKEN'))

        if news.urls:
            for url, date in news.urls.items():
                article = NewsPlease.from_url(url)

                a = Article(author=', '.join(article.authors) or 'Anonymous',
                            title=article.title,
                            short_text=article.description,
                            content=article.maintext,
                            date=date,
                            source_link=url,
                            img=article.image_url)
                a.save()

                response = telegraph.create_page(title=a.title,
                                                 html_content=a.content)

                TelegraphArticle(title=a.title, link=response['url']).save()

                bot.send_telegraph_msg(response['url'])

        self.stdout.write(self.style.SUCCESS('Success'))
Example #7
def detectURL():
    data = request.json
    url = data['url']
    try:
        article = NewsPlease.from_url(url)
    except Exception:
        return jsonify([None, None])
    statement = article.title
    justification = article.description
    try:
        subject = data['subject'] if data[
            'subject'] != "" else fnd.get_subject(statement)[0]
    except Exception:
        result = (False, statement, justification)
        return jsonify(result)


    # print(subject)

    if justification is None:
        return jsonify([None, None])
    result = fnd.detect(statement, subject, justification)
    result = result + (statement, justification, subject)
    #    print(jsonify(article))
    return jsonify(result)
Example #8
def classify_texts():
    requestObject = request.get_json()
    theURL = requestObject['urlOfContent']
    articleTitle = ""
    articleMaintext = ""

    try:
        article = NewsPlease.from_url(theURL, timeout=20)
        articleTitle = article.title
        articleMaintext = article.maintext
    except Exception:
        articleTitle = ""
        articleMaintext = ""
    HF_Rating = ""
    try:
        # fakeRating = requests.get('https://huggingface.co/openai-detector/?'+articleMaintext)
        # HF_Rating = fakeRating.json()['real_probability']
        HF_Rating = inferWithHuggingFace(articleMaintext)
    except Exception:
        HF_Rating = ""
    CML_Rating = getMLClassification(inputStr=requestObject['titleOfContent'])

    data_set = {
        "real": HF_Rating,
        "fullText": "" + articleMaintext,
        "real_CML": CML_Rating
    }
    outputJson = json.dumps(data_set)
    resp = make_response(outputJson)
    resp.headers['Access-Control-Allow-Origin'] = '*'
    resp.headers['Access-Control-Allow-Methods'] = 'DELETE, POST, GET, OPTIONS'
    resp.headers[
        'Access-Control-Allow-Headers'] = 'Content-Type, Access-Control-Allow-Headers, Authorization, X-Requested-With'
    return resp
Example #9
    def parse(self, response):
        try:
            article = NewsPlease.from_html(response.body, response.url)
            text = article.maintext
            if any(x in text.lower() for x in self.keywords):
                item = ArticleItem()
                item['title'] = article.title
                item['text'] = text
                item['url'] = response.url
                print('Saved', response.url)
                yield item
        except Exception:
            pass

        # Get all the <a> tags
        a_selectors = response.xpath("//a")
        # print('SELECTORS', a_selectors)
        # Loop on each tag
        for selector in a_selectors:
            text = selector.xpath("text()").extract_first()
            link = selector.xpath("@href").extract_first()
            if link is not None:
                if 'https://' not in link:
                    link = 'https://news.dartmouth.edu%s' % link
                # print(link)
                request = response.follow(link, callback=self.parse)
                # Return it thanks to a generator
                yield request
Example #10
def extract_news(link):
    """This function extract news from given link.
    
    Arguments:
        link {string} -- [Link of news article.]
    
    Raises:
        ValueError: [Raise error if link is not for ekantipur/onlinekhabar]
    
    Returns:
        [tuple(title, sample_text)] -- [Title: Title of the news, sample_text: news article that has been extracted from the link given.]
    """
    if 'onlinekhabar.com' in link:
        sample_text = get_content_onlinekhabar(link)
    elif 'ekantipur.com' in link:
        sample_text = get_content_ekantipur(link)
    else:
        raise ValueError(
            'Currently we work with onlinekhabar and ekantipur only. Other sites will be added soon.'
        )

    article = NewsPlease.from_url(link)
    title = article.title

    return (title, sample_text)
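A short usage sketch for the function above; the URL is a placeholder for an onlinekhabar article, and get_content_onlinekhabar / get_content_ekantipur are assumed to be defined elsewhere in the project.

title, body = extract_news('https://www.onlinekhabar.com/2020/01/example-story')
print(title)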
Example #11
    def extract_content(self, URL):
        """
        This method returns the main content from the given URL
        """
        try:
            # Extract content
            content = Goose().extract(URL).cleaned_text

            # If the returned content is null raise an exception to change the crawler
            if (len(content) == 0):
                raise Exception

        except Exception as exception:
            highlight_back(
                "[ContentCrawler] Crawler migrated from Goose to News-Please due to an exception: {}"
                .format(exception), 'G')

            try:
                # Extract content using NewsPlease
                content = NewsPlease.from_url(URL).text
            except Exception as exception:
                highlight_back(
                    "[ContentCrawler] An exception has occurred in News-Please; the returned content is empty: {}"
                    .format(exception), 'R')

                # content is now empty
                content = ""

        return content
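Falling back from Goose to NewsPlease keeps a single call site while tolerating pages that one extractor handles better than the other. A usage sketch, assuming the method lives on a crawler class named ContentCrawler (the real class name is not shown above):

crawler = ContentCrawler()  # hypothetical class name
body = crawler.extract_content("https://example.com/some-article")
if body:
    print(body[:200])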
Example #12
def rss_view(request):
    context = {}
    blog_posts = []
    import feedparser
    from newsplease import NewsPlease

    # Get a list of feed URLs
    with open('feeds.txt') as f:
        rss_urls = list(f)
    for url in rss_urls:
        NewsFeed = feedparser.parse(url)
        for entry in NewsFeed.entries:
            blog = {}
            blog['title'] = entry.title
            blog['link'] = entry.link
            # get content
            article = NewsPlease.from_url(entry.link)
            blog['content'] = article.maintext
            blog['image'] = article.image_url
            blog['date_published'] = article.date_publish
            blog['author'] = ''
            blog['description'] = article.description
            #print("date_publish: ", article.date_publish)

            blog_posts.append(blog)

    context['blog_posts'] = blog_posts
    return render(request, 'personal/rss.html', context)
Example #13
def scrape(string):
    url = 'https://economictimes.indiatimes.com/topic/' + string

    # Connect to the URL
    response = requests.get(url)

    # Parse HTML and save to a BeautifulSoup object
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []

    x = soup.findAll('a')
    # To download the whole data set, let's do a for loop through all a tags
    for i in range(len(x)):  #'a' tags are for links
        one_a_tag = x[i]
        if one_a_tag.has_attr('href'):
            link = one_a_tag['href']
            if not link.startswith('http'):
                if link.startswith('/markets'):
                    download_url = 'https://economictimes.indiatimes.com' + link
                    print(download_url)
                    articles.append(NewsPlease.from_url(download_url))

    sentiments = list()

    for article in articles:
        string = article.title + '\n' + article.text
        sentiments.append(sentiment(string))

    return sentiments
 def getcontent(self):
     articles = []
     urls5 = self.geturls()
     for i in urls5:
         article = NewsPlease.from_url(i)
         articles.append(article)
     return articles
Example #15
def get_data(url):
    """
    Extract the data from a specific url of a news article

    url : the url of the article, that we want to extract information from_url

    Return
    article.title : the title of the article
    article.text : the text block of the article
    article.date_publish :the data the article was published
    article.description : a short description of the article
    article.language : the language in that the article is written
    article.date_modify : the date the article was modifeied if it was
    article.url : the url of the article (same as input url)

    """

    # Try downloading the article
    try:
        article = NewsPlease.from_url(url)
    # catch HTTPError and return empty values instead
    except urllib.error.HTTPError as err:
        # Print the error code
        print("HTTPError Found: ", err.code)
        return "", "", "", "", "", "", ""
    print("Data Extracted.")
    return article.title, article.text, article.date_publish, article.description, article.language, article.date_modify, article.url
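A usage sketch showing the tuple unpacking order documented in the docstring; the URL is a placeholder.

title, text, date_publish, description, language, date_modify, url = get_data(
    "https://example.com/news/story")
print(title, date_publish)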
Example #16
    def filter_record(self, warc_record, article=None):
        passed_filters, article = super().filter_record(warc_record, article)
        if not passed_filters:
            return False, article
        url = warc_record.rec_headers.get_header('WARC-Target-URI')

        def get_lang():
            nonlocal article
            if article is None:
                article = NewsPlease.from_warc(warc_record)
            return article.language

        country = detect_country(url, get_lang)
        if not country or not is_european_cc(country):
            return False, article
        if article is None:
            article = NewsPlease.from_warc(warc_record)
        article.country = country
        lang = article.language
        if not lang or not is_european_langcode(lang):
            return False, article
        # TODO: Find COVID-19 mention
        searcher = get_covid_searchers().get(lang)
        if searcher is None:
            return False, article

        def match(key):
            return searcher.match((getattr(article, key)
                                   or "").lower().encode("utf-8"))

        if match("title"):
            return True, article
        if match("maintext"):
            return True, article
        return True, article
Example #17
def news_from_link(ref_link, news_from_globo):
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': []
    }

    article = NewsPlease.from_url(ref_link)
    if (article is not None):
        # Data returned by the NewsPlease
        row['titulos'].append(article.title)
        row['noticia'].append(article.text)
        row['abstract'].append(article.text)
        row['links'].append(article.url)

        if (news_from_globo):
            # we need to get the date from the original url, the date returned by the NewsPlease is wrong
            page_time = urllib.request.urlopen(article.url)
            soup_date = BeautifulSoup(page_time, 'html.parser')
            time_tag = soup_date.find_all('time',
                                          attrs={'itemprop': 'datePublished'})
            public_date = time_tag[0].text
            formated_date = format_globo_date(public_date)
            row['date'].append(formated_date)
        else:
            formated_date = str(article.date_publish)
            row['date'].append(formated_date)

        path_image = article.image_url

        if path_image == '' or path_image is None:
            row['image'].append(0)
        else:
            row['image'].append(download_and_move_image(article.image_url))

        news = News(row['abstract'], row['noticia'], row['date'], row['links'],
                    row['titulos'], row['image'])

        try:
            print(row['titulos'])
            news_in_db = seguranca_table.check_news(news)
            print('news_in_db: ' + str(news_in_db))

            if (not news_in_db):
                row = pd.DataFrame(row)
                df, categories = seguranca_lexical.lexical_corpus_and_title(
                    row)
                print(categories)

                # DB categories and image
                if (categories != [set()]):
                    news.set_categories(categories)
                    seguranca_table.save_news(news)
                    seguranca_post.post_news(df)

        except Exception:
            print('Empty News')
Example #18
def getNews(link):
    """
    Function to get the news for a certain URL - using library newsplease

    :param link: the URL link for the news
    :return: the content of the news for the linl provided
    """
    try:
        """
        first_article = Article(url=link)
        first_article.download()
        first_article.parse()
        text=first_article.text
        """
        article = NewsPlease.from_url(link)

        #we need to remove new lines and quotes, otherwise quilt will fail
        article_no_newlines = article.text.replace('\n', '')
        article_no_quotes = article_no_newlines.replace('"', "'")
        #article = NewsPlease.from_url(link)
        #return article.text

        return article_no_quotes
    except Exception:
        print("An exception occurred while scraping the news:", link)
        # traceback.print_exc()
        pass

    return None
Example #19
def url():
    news = []
    error = None
    if request.method == 'POST':
        # article = NewsPlease.from_url('https://economictimes.indiatimes.com/wealth/personal-finance-news/rbi-policy-why-repo-rate-cut-failed-to-cheer/articleshow/71451242.cms')
        data = request.get_json()
        print(data['url'])
        article = NewsPlease.from_url(data['url'])
        news.append({
            "authors": article.authors,
            "date_download": article.date_download,
            "date_modify": article.date_modify,
            "date_publish": article.date_publish,
            "description": article.description,
            "filename": article.filename,
            "image_url": article.image_url,
            "language": article.language,
            "localpath": article.localpath,
            "source_domain": article.source_domain,
            "text": article.text,
            "title": article.title,
            "title_page": article.title_page,
            "title_rss": article.title_rss,
            "url": article.url
        })
    return jsonify(news)
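A hypothetical client call against the endpoint above; the route path, host, and port are assumptions, since the @app.route decorator is not shown.

import requests

resp = requests.post("http://localhost:5000/url",
                     json={"url": "https://example.com/article"})
print(resp.json())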
Example #20
def apacitationforlist(multipleurls):
    mylist = multipleurls.split(",")
    length = len(mylist)
    message = ""

    for x in range(length):
        message = message + str(x + 1) + ". "
        myurl = mylist[x]
        request = requests.get(myurl)
        if request.status_code < 400:
            article = NewsPlease.from_url(mylist[x])
            if article.authors is None or article.title is None:
                message += "There is not enough information to make a citation."
                message += "\n"
            else:
                if len(article.authors) != 0:
                    message += apacitation(article.authors[0], article.title,
                                           myurl)
                    message += "\n"
                else:
                    message += "We could not find an author."
                    message += "\n"
        else:
            message += "The website you requested is not available or does not exist."
            message += "\n"
    print(message)
    return message
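A hypothetical call with two comma-separated URLs; apacitation is assumed to be defined elsewhere in the same module.

citations = apacitationforlist("https://example.com/a,https://example.com/b")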
Example #21
def url_Contents(url_article):
    article = NewsPlease.from_url(url_article)
    if article.text is None:
        print('None')
        content = None  # avoid a NameError when no text was extracted
    else:
        content = article.text
    return content
Example #22
def main():
    article = NewsPlease.from_url(
        'https://www.foxnews.com/politics/house-democrat-subpoenas-mnuchin-irs-for-trumps-tax-returns'
    )
    doc = Document.from_newsplease(article)
    doc = extractor.parse(doc)
    answers = doc.get_top_answer('who').get_parts_as_text()
 def extractorFunc(self):
     extract_list = []
     with open("./crawler_urls/" + self.__filename, "r") as f:
         load_list = json.load(f)
     num_news = len(load_list)
     for i in range(num_news):
         news_dict = {}
         if self.__media in load_list[i]['media']:
             extractor = NewsPlease.from_url(load_list[i]['url'])
             news_dict["title"] = load_list[i]['title']
             news_dict["media"] = load_list[i]['media']
             news_dict["date"] = load_list[i]['date']
             news_dict["url"] = load_list[i]['url']
             maintext = extractor.maintext
             if maintext and len(maintext) > 200:
                 news_dict["text"] = maintext
             else:
                 continue
             extract_list.append(news_dict)
         else:
             continue
         if (i + 1) % 10 == 0:
             self.toJson(extract_list)
             extract_list = []
     self.toJson(extract_list)
    def filter_record(self, warc_record, article=None):
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        url_parts = tldextract.extract(url)
        domain = url_parts.registered_domain
        if domain not in STATE_BROADCASTERS:
            return False, article
        country = STATE_BROADCASTERS[domain]

        passed_filters, article = super().filter_record(warc_record, article)

        if not passed_filters:
            return False, article
        if article is None:
            article = NewsPlease.from_warc(warc_record)
        article.country = country
        if not article.language or not is_european_langcode(article.language):
            return False, article
        searcher = get_covid_searchers().get(article.language)
        if searcher is None:
            return False, article

        def match(key):
            return searcher.match((getattr(article, key)
                                   or "").lower().encode("utf-8"))

        if not match("title") and not match("maintext"):
            return False, article
        return True, article
Example #25
def article_generator_text(keyword_query, num_articles):
    text = ''
    for url in search_news(str(keyword_query), num=1, stop=num_articles):
        article = NewsPlease.from_url(str(url))
        if article.text is not None:
            if article.source_domain not in new_list:
                text = text + article.text
    return text
Example #26
    def crawl_page(self, response):
        self.crawl_other_links(response)

        article = NewsPlease.from_html(response.content, url=response.url)
        data = article.get_dict()
        data.pop('maintext')

        yield data
Example #27
def run_newsplease(htmlstring):
    '''try with newsplease'''
    try:
        article = NewsPlease.from_html(htmlstring, url=None)
        return article.maintext  # sanitize(article.maintext)
    except Exception as err:
        #print('Newsplease exception:', err)
        return ''
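A minimal usage sketch; the HTML string is a stand-in for a page fetched elsewhere.

html = "<html><head><title>Example</title></head><body><p>Example body text.</p></body></html>"
main_text = run_newsplease(html)
print(main_text)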
Example #28
def download_url(url: str) -> int:
    try:
        article = NewsPlease.from_url(urls[url], timeout=10)
        save_obj(article, str(url).zfill(5), text_output_folder)
        return 1
    except Exception as ex:
        print(url, ex)
        return 0
Example #29
def extract_article(url):
    article = NewsPlease.from_url(url)
    date = article.date_publish
    author, text = extract_author(article.text)
    keywords = extract_keywords(url)

    metadata = author, date, keywords

    return text, metadata
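A usage sketch under the assumption that extract_author and extract_keywords are defined elsewhere in the project; the URL is a placeholder.

text, (author, date, keywords) = extract_article("https://example.com/news/story")
print(author, date, keywords)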
Example #30
 def filter_record(self, warc_record, article=None):
     passed_filters, article = super().filter_record(warc_record, article)
     url = warc_record.rec_headers.get_header('WARC-Target-URI')
     canon_url = canonicalize_url(url)
     if canon_url not in all_urls:
         return False, article
     if article is None:
         article = NewsPlease.from_warc(warc_record)
     return True, article
Example #31
    'Equifax breach':
        ["https://www.wsj.com/articles/equifax-earnings-drop-27-in-quarter-marred-by-cyberattack-1510268187",
         "https://www.bloomberg.com/news/articles/2017-11-14/how-much-will-equifax-pay",
         "https://gizmodo.com/equifax-seized-138-scammy-lookalike-domains-instead-of-1820450580"]

}

for index, topic in enumerate(urls):
    for url in urls[topic]:

        dId = hashlib.sha224(url.encode('utf-8')).hexdigest()

        if not json_exist('data_raw', dId):
            # this is an object
            try:
                article = NewsPlease.from_url(url)

                # this is an dict
                article_dict = article.get_dict()

                # cluster with label and id
                article_dict['category_id'] = index
                article_dict['category'] = topic

                # enhancement for giveme5w
                article_dict['dId'] = dId

                # datetime-datetime-not-json-serializable bugfix
                article_dict['date_publish'] = article_dict['date_publish'].isoformat()

                write_json('data_raw', article_dict['dId'], article_dict)
                    for sentence in sentences:
                        if str(sentence["sentence_id"]) == sent1_id:
                            sent1 = sentence["sentence"]
                        if str(sentence["sentence_id"]) == sent2_id:
                            sent2 = sentence["sentence"]
                    print(sent1_id, ":", sent1)
                    print(sent2_id, ":", sent2)
                    print(events[0]['source'], events[0]['target'], events[0]['code'])
                    val = int(round(10*similar(sent1, sent2)))
                    if val not in similar_count:
                        similar_count[val] = 0
                    similar_count[val] = similar_count[val] + 1

                    from newsplease import NewsPlease

                    article = NewsPlease.from_url(events[0]['url'])
                    print(events[0]['url'])
                    print(article.text)

        doc_count += 1

print(doc_count)
print(root_code_match)
print(event_match)

print(similar_count)