def parallel_arct(arg):
    article = arg
    newarticle = None
    # Reserve a unique id atomically under the shared lock.
    with lock:
        local_id = id.value
        id.value += 1
    try:
        newarticle = NewsArticle(local_id, article[0], article[1], article[2], article[3], article[4], countries)
        newarticle.extract_metadata()
    except Exception:
        # Report the failure; None is returned when construction fails.
        print("Constructor")

    return newarticle
Example #2
def get_article(art):
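    # Download, parse and run NLP on the article, then store it in a
    # MongoDB collection named after the source domain; ',' marks a parse
    # failure and '*' a download failure.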
    art.download()
    if art.is_downloaded:
        art.parse()
        if art.is_parsed:
            art.nlp()
            na = NewsArticle(art)
            p_uri = urlparse(art.source_url)
            p_domain = '{uri.netloc}'.format(uri=p_uri)
            loop.run_until_complete(
                do_insert(db[p_domain], na.GetMongoDocument()))
        else:
            print(',', end="", flush=True)
    else:
        print('*', end="", flush=True)
Example #3
def retrieve_homepage_articles(soup):
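    # Hacker News front page: every "storylink" anchor holds a headline
    # whose text and href become a NewsArticle.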
    homepage_headline_class_tag = "storylink"

    headlines = soup.find_all("a", {"class": homepage_headline_class_tag})

    articles = list()

    for result in headlines:
        news_article = NewsArticle()

        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "HackerNews"
        articles.append(news_article)
    return articles
Example #4
def retrieve_homepage_articles(soup):
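    # BBC front page: wrap the first 20 overlay-link anchors in
    # NewsArticle objects.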
    homepage_headline_class_tag = "block-link__overlay-link"

    headlines = soup.find_all("a", {"class": homepage_headline_class_tag})

    articles = list()

    # Slice rather than index a fixed range so pages with fewer than 20
    # headlines do not raise an IndexError.
    for result in headlines[:20]:
        news_article = NewsArticle()

        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "BBC"
        articles.append(news_article)
    return articles
Example #5
def get_search_result(soup):
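    # BBC search results: each <li> carries a headline link and a
    # publication date, which are copied onto a NewsArticle.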
    search_result_class_tag = "search-results"
    headlines_class_tag = "headline"
    footer_date_tag = "flags btm"
    date_class_tag = "display-date"

    search_results = soup.find("ol", {
        "class": search_result_class_tag
    }).find_all("li")

    articles = list()

    for result in search_results:
        news_article = NewsArticle()

        result = result.find("div")
        result_headline = result.find("h1", {"itemprop": headlines_class_tag})
        # date under tags: footer -> dl -> dd -> time
        result_date = result.find("footer").find("dl", {
            "class": footer_date_tag
        }).find("dd").find("time", {"class": date_class_tag})

        news_article.title = result_headline.find("a").string.strip()
        news_article.url = result_headline.find("a")['href']
        #TODO: put date in correct format
        news_article.date = result_date.string.strip()
        news_article.source = "BBC"

        articles.append(news_article)

    return articles
Example #6
def get_homepage_articles():
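    # Fetch the current hot posts of a subreddit from Reddit's public JSON
    # endpoint; a custom User-agent header avoids the throttling Reddit
    # applies to default clients.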
    headlines = list()
    subreddit = "all"
    url = "https://www.reddit.com/r/" + subreddit + "/hot.json"
    json_str = requests.get(url, headers={'User-agent': 'your bot 0.2'}).text
    data = json.loads(json_str)
    for i in data["data"]["children"]:
        the_data = i["data"]
        if "title" in the_data.keys():
            new_article = NewsArticle(title=the_data["title"], source='Reddit')
            headlines.append(new_article)

    return headlines
Example #7
def get_headlines(num_headlines=None, browser=None):
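    # CNN front page: each "cd__headline" <h3> yields a headline span and
    # a link that is made absolute before being attached to the article.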
    url = "https://www.cnn.com/"
    soup = get_url_soup(url, browser=browser)
    counter = 0
    headlines = list()
    for h3_soup in soup.find_all("h3", {"class": "cd__headline"}):
        counter += 1
        headline = h3_soup.find("span", {
            "class": "cd__headline-text"
        }).get_text()

        article = NewsArticle()
        article.title = headline
        print(article.title)
        headlines.append(article)
        url = h3_soup.find("a")["href"]
        if "https://www.cnn.com" not in url:
            url = "https://www.cnn.com" + url
        article.url = url  # attach the resolved link; the separate URL list was never used
        if num_headlines is not None and counter >= num_headlines:
            break

    return headlines
Example #8
def get_headlines(num_headlines=None):
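    # CNET front page: headlines are <h3> tags inside anchors whose
    # "section" attribute contains "pebble".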
    url = "https://www.cnet.com/"
    soup = get_url_soup(url)
    #print(soup.prettify())
    counter = 0
    headlines = list()
    for headline_soup in soup.find_all(
            "a", {"section": lambda x: x is not None and "pebble" in x}):
        h3_soup = headline_soup.find("h3")
        if h3_soup is None:
            continue
        counter += 1
        title = h3_soup.get_text().strip()
        headlines.append(NewsArticle(aTitle=title))
        if num_headlines is not None and counter >= num_headlines:
            break

    return headlines
Example #9
def convert_to_class(item):
    # Rebuild a NewsArticle from its stored dictionary form, converting
    # the serialized date strings back into datetime objects.
    news_article = NewsArticle()
    news_article.authors = item['authors']
    news_article.date_download = ExtractedInformationStorage.datestring_to_date(
        item['date_download'])
    news_article.date_modify = ExtractedInformationStorage.datestring_to_date(
        item['date_modify'])
    news_article.date_publish = ExtractedInformationStorage.datestring_to_date(
        item['date_publish'])
    news_article.description = item['description']
    news_article.filename = item['filename']
    news_article.image_url = item['image_url']
    news_article.language = item['language']
    news_article.localpath = item['localpath']
    news_article.title = item['title']
    news_article.title_page = item['title_page']
    news_article.title_rss = item['title_rss']
    news_article.source_domain = item['source_domain']
    news_article.text = item['text']
    news_article.url = item['url']
    return news_article
Example #10
    def put_article_in_db(self):
        counter = 0
        try:
            for x in self.sublist:
                submissions = self.reddit.get_subreddit(x).get_hot(limit=30)
                for submission in submissions:
                    story_url = submission.url.encode('ascii', 'ignore')
                    if not self.db.in_set({'url': story_url}):
                        print(story_url)
                        current_article = NewsArticle(story_url)

                        # publish date for article : datetime object
                        article_published = current_article.date_made()

                        # title of article : String
                        article_title = current_article.get_title()

                        current_article.goodArticle()
                        # keywords in article : Array of Strings
                        article_key_words = current_article.getKeywords()

                        # videos in story : Array of Strings (urls to videos)
                        article_videos = current_article.get_videos()

                        # summary of article : String
                        article_summary = current_article.getSummary()

                        # authors of article : Array of Strings
                        article_authors = current_article.getAuthors()

                        # image for article : String (url to image)
                        article_thumbnaillink = current_article.thumbnail_url()

                        mydb = pymongo.MongoClient()
                        res = get_tld(story_url, as_object=True)
                        new_entry = {}
                        new_entry['title'] = article_title
                        new_entry['sum'] = article_summary
                        new_entry['author'] = article_authors
                        new_entry['thumb'] = article_thumbnaillink
                        new_entry['pub'] = article_published
                        new_entry['keywords'] = article_key_words
                        new_entry['vids'] = article_videos
                        new_entry['likes'] = 0
                        new_entry['dislikes'] = 0
                        new_entry['comments'] = []
                        new_entry['url'] = story_url
                        new_entry['_id'] = uuid.uuid4().hex
                        new_entry['postnum'] = mydb.lyket.articles.count()
                        # use UTC so stored timestamps are timezone-independent
                        new_entry['creationtime'] = datetime.datetime.utcnow()
                        new_entry['publisher'] = res.domain
                        new_entry['companycreator'] = res.domain

                        self.db.CollectionSubmitOne(new_entry)
                        print("Done with article " + str(mydb.lyket.articles.count()))
                    else:
                        print("Already have it " + str(counter))
                        counter = counter + 1

        except Exception as e:
            print("------")
            print("Failed to store article:")
            print(e)
            print("------")
Example #11
progressBar = ProgressBar(int(len(xmlfiles)))
supportBar = SupportBar()

# Create files for results and debug output.
results = open('../output/results.txt', 'w+')
debug = open('../output/debug.txt', 'w+')

article_id = -1  # renamed from `id` to avoid shadowing the builtin
for filename in xmlfiles:
    larct = parse("../filesXML/" + filename)
    sys.stdout.write("(" + str(len(larct)) + "/")
    sys.stdout.flush()
    for article in larct:
        article_id += 1
        try:
            newarticle = NewsArticle(article_id, article[0], article[1], article[2], article[3], article[4])
            newarticle.extract_metadata()

            aggr.add_article(newarticle)

            # Update the status bar, backspacing over the previous count.
            supportBar.increase()
            size = len(str(supportBar.get()))
            spaces = ' ' * (4 - size)
            sys.stdout.write("{0}){1}\b\b\b\b\b".format(supportBar.get(), spaces))
            sys.stdout.flush()

        except KeyboardInterrupt:
            print("\nProgram Closed Successfully!")
            sys.exit(1)
        except Exception as e:
            # Minimal handler completing the truncated original: record the
            # failure in the debug file opened above and continue.
            debug.write(filename + ": " + str(e) + "\n")
Example #12
    def put_article_in_db(self, story_url):
        try:
            if not self.db.in_set({'url': story_url}):
                current_article = NewsArticle(story_url)

                # publish date for article : datetime object
                article_published = current_article.date_made()

                # title of article : String
                article_title = current_article.get_title()

                current_article.goodArticle()
                # keywords in article : Array of Strings
                article_key_words = current_article.getKeywords()

                # videos in story : Array of Strings (urls to videos)
                article_videos = current_article.get_videos()

                # summary of article : String
                article_summary = current_article.getSummary()

                # authors of article : Array of Strings
                article_authors = current_article.getAuthors()

                # image for article : String (url to image)
                article_thumbnaillink = current_article.thumbnail_url()

                article_url = current_article.get_url()

                res = get_tld(article_url, as_object=True)
                new_entry = {}
                new_entry['title'] = article_title
                new_entry['sum'] = article_summary
                new_entry['auth'] = article_authors
                new_entry['thumb'] = article_thumbnaillink
                new_entry['pub'] = article_published
                new_entry['keywords'] = article_key_words
                new_entry['vids'] = article_videos
                new_entry['likes'] = 0
                new_entry['dislikes'] = 0
                new_entry['comments'] = []
                new_entry['url'] = article_url

                # use UTC so stored timestamps are timezone-independent
                new_entry['creationtime'] = datetime.datetime.utcnow()
                new_entry['publisher'] = res.domain
                new_entry['companycreator'] = res.domain

                self.db.CollectionSubmitOne(new_entry)

        except Exception as e:
            print("------")
            print("Failed to store article:")
            print(e)
            print("------")
Example #13
    def put_article_in_db(self, story_url):
        try:
            if not self.db.in_set({'url': story_url}):
                current_article = NewsArticle(story_url)

                #publish date for article : datetime object
                article_published = current_article.date_made()

                #title of article : String
                article_title = current_article.get_title()
                #print article_title

                current_article.goodArticle()
                #keywords in article: Array of Strings
                article_key_words = current_article.getKeywords()

                #videos in story : Array of Strings (url to videos)
                article_videos = current_article.get_videos()

                #summary of article : String
                article_summary = current_article.getSummary()

                #authors of article: Array of Strings
                article_authors = current_article.getAuthors()

                #image for article : String (url to image)
                article_thumbnaillink = current_article.thumbnail_url()

                article_url = current_article.get_url()

                res = get_tld(article_url, as_object=True)
                new_entry = {}
                new_entry['title'] = article_title
                new_entry['sum'] = article_summary
                new_entry['auth'] = article_authors
                new_entry['thumb'] = article_thumbnaillink
                new_entry['pub'] = article_published
                new_entry['keywords'] = article_key_words
                new_entry['vids'] = article_videos
                new_entry['likes'] = 0
                new_entry['dislikes'] = 0
                new_entry['comments'] = []
                new_entry['url'] = article_url

                # use UTC so stored timestamps are timezone-independent
                new_entry['creationtime'] = datetime.datetime.utcnow()
                new_entry['publisher'] = res.domain
                new_entry['companycreator'] = res.domain

                self.db.CollectionSubmitOne(new_entry)

        except Exception as e:
            print "------"
            print "its f****d emma"
            print e
            print "------"
Example #14
def convert_to_class(item):
    # Rebuild a NewsArticle from its stored dictionary form, converting
    # the serialized date strings back into datetime objects.
    news_article = NewsArticle()
    news_article.authors = item['authors']
    news_article.date_download = ExtractedInformationStorage.datestring_to_date(item['date_download'])
    news_article.date_modify = ExtractedInformationStorage.datestring_to_date(item['date_modify'])
    news_article.date_publish = ExtractedInformationStorage.datestring_to_date(item['date_publish'])
    news_article.description = item['description']
    news_article.filename = item['filename']
    news_article.image_url = item['image_url']
    news_article.language = item['language']
    news_article.localpath = item['localpath']
    news_article.title = item['title']
    news_article.title_page = item['title_page']
    news_article.title_rss = item['title_rss']
    news_article.source_domain = item['source_domain']
    news_article.text = item['text']
    news_article.url = item['url']
    return news_article