def testStoriesSince(self):
    """Check that storiesSince only returns stories newer than the given id.

    The client is built as ``MediaCloud(None, None, True)``; presumably the
    third argument selects canned/offline data -- confirm against the
    MediaCloud constructor.
    """
    story_id = 88848861
    mc = MediaCloud(None, None, True)
    stories = mc.storiesSince(story_id)
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(len(stories), 15)
    for story in stories:
        self.assertTrue(int(story["stories_id"]) > story_id)
def tag_set_with_tags(mc_api_key, tag_sets_id, only_public_tags=False, use_file_cache=False):
    """
    Load a tag set and attach the list of its tags, sorted by label.

    :param mc_api_key: key used to construct the MediaCloud client
    :param tag_sets_id: id of the tag set to load
    :param only_public_tags: forwarded to the tag paging helper; unless it is exactly False,
        tags are also filtered here to those whose ``show_on_media`` is 1 or True
    :param use_file_cache: when truthy and a ``tags_in_<tag_sets_id>.json`` file exists in the
        static cache dir, the result of ``cached_tag_set_file`` for that file is returned instead
    :return: the tag set dict with ``tags`` (sorted tag list) and ``name`` (copy of ``label``) added
    """
    # don't need to cache here, because either you are reading from a file, or each page is cached
    local_mc = MediaCloud(mc_api_key)
    if use_file_cache:
        file_name = "tags_in_{}.json".format(tag_sets_id)
        file_path = os.path.join(static_tag_set_cache_dir, file_name)
        if os.path.isfile(file_path):
            return cached_tag_set_file(file_path)  # more caching!
    tag_set = local_mc.tagSet(tag_sets_id)
    # page through tags, 100 at a time, until an empty page comes back
    all_tags = []
    last_tags_id = 0
    while True:
        tags = _cached_tag_page(tag_set['tag_sets_id'], last_tags_id, 100, only_public_tags)
        if len(tags) == 0:
            break
        all_tags.extend(tags)  # in-place extend avoids re-copying the whole list on every page
        last_tags_id = tags[-1]['tags_id']
    # double check the show_on_media because that controls public or not
    # (compare by value: `is 1` tests object identity and is not a reliable integer comparison)
    tag_list = [
        t for t in all_tags
        if (only_public_tags is False) or (t['show_on_media'] == 1 or t['show_on_media'] is True)
    ]
    # sort by label (or tag if no label exists); a key function keeps the tag dicts untouched,
    # so no temporary 'sort_key' entry is ever written onto objects that may be shared with a cache
    tag_list = sorted(tag_list, key=lambda t: (t['label'] or t['tag']).lower())
    tag_set['tags'] = tag_list
    tag_set['name'] = tag_set['label']
    return tag_set
def testStoriesSinceForReal(self):
    """Check storiesSince using the credentials from the "api" config section.

    Expects exactly ``mc.DEFAULT_STORY_COUNT`` stories, each with an id that is
    greater than ``story_id`` and at most ``DEFAULT_STORY_COUNT`` above it.
    """
    mc = MediaCloud(self._config.get("api", "user"), self._config.get("api", "pass"))
    story_id = 88848861
    stories = mc.storiesSince(story_id)
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(len(stories), mc.DEFAULT_STORY_COUNT)
    for story in stories:
        self.assertTrue(int(story["stories_id"]) > story_id)
        self.assertTrue(int(story["stories_id"]) - story_id <= mc.DEFAULT_STORY_COUNT)
def testIsEnglish(self):
    """isEnglish should accept the stored story text and reject a Spanish sentence."""
    english_story_id = 88848861
    mc = MediaCloud(None,None,True)
    story = mc.storyDetail(english_story_id)
    self.assertTrue(mediacloud.examples.isEnglish(story['story_text']))
    # TODO: find a real example in the MC corpus
    story['story_text'] = "Esto es un otro cuenta en espanol"
    self.assertFalse(mediacloud.examples.isEnglish(story['story_text']))
def testFleshKincaidGradeLevel(self):
    """Check the grade level of a known story and the None result for empty input.

    The stored story is expected to round to grade 8; both ``""`` and ``None``
    are expected to produce ``None``.
    """
    story_id = 88848861
    mc = MediaCloud(None,None,True)
    story = mc.storyDetail(story_id)
    fkLevel = mediacloud.examples.getFleshKincaidGradeLevel(story['story_text'])
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(round(fkLevel), 8)
    # None is a singleton, so test identity rather than equality
    fkLevel = mediacloud.examples.getFleshKincaidGradeLevel("")
    self.assertTrue(fkLevel is None)
    fkLevel = mediacloud.examples.getFleshKincaidGradeLevel(None)
    self.assertTrue(fkLevel is None)
def testRecentStoriesForReal(self):
    """Check recentStories using the credentials from the "api" config section.

    Covers the default story count, an explicit limit of 10, and the second
    positional argument, which is expected to add "first_raw_download_file"
    to every story.
    """
    mc = MediaCloud(self._config.get("api", "user"), self._config.get("api", "pass"))
    # test basic fetch
    stories = mc.recentStories()
    self.assertEqual(len(stories), mc.DEFAULT_STORY_COUNT)
    # test story limit
    stories = mc.recentStories(10)
    self.assertEqual(len(stories), 10)
    for story in stories:
        # `in` works on both Python 2 and 3; dict.has_key exists only on Python 2
        self.assertFalse("first_raw_download_file" in story)
    # test raw download option
    stories = mc.recentStories(10, True)
    for story in stories:
        self.assertTrue("first_raw_download_file" in story)
def month_sample_worker(job):
    """
    Fetch one page of stories for a query, restricted to a single month of 2019.

    :param job: dict providing 'q' (the query), 'month' (month number) and 'page_size'
    :return: whatever cached_story_page returns for that query and date filter
    """
    q, month, page_size = job['q'], job['month'], job['page_size']
    first_day = datetime.date(2019, month, 1)
    # monthrange returns (weekday of the first day, number of days in the month)
    _, month_length = calendar.monthrange(first_day.year, first_day.month)
    last_day = first_day + datetime.timedelta(days=month_length - 1)
    date_filter = MediaCloud.publish_date_query(first_day, last_day)
    return cached_story_page(q, date_filter, page_size)
def tag_set_with_tags(mc_api_key, tag_sets_id, only_public_tags=False, use_file_cache=False):
    """
    Load a tag set and attach the list of its tags, sorted by label.

    :param mc_api_key: key used to construct the MediaCloud client and passed to the paging helper
    :param tag_sets_id: id of the tag set to load
    :param only_public_tags: forwarded to the tag paging helper; unless it is exactly False,
        tags are also filtered here to those whose ``show_on_media`` is 1 or True
    :param use_file_cache: when truthy and a ``tags_in_<tag_sets_id>.json`` file exists in the
        static cache dir, the result of ``cached_tag_set_file`` for that file is returned instead
    :return: the tag set dict with ``tags`` (sorted tag list) and ``name`` (copy of ``label``) added
    """
    # don't need to cache here, because either you are reading from a file, or each page is cached
    local_mc = MediaCloud(mc_api_key)
    if use_file_cache:
        file_name = "tags_in_{}.json".format(tag_sets_id)
        file_path = os.path.join(static_tag_set_cache_dir, file_name)
        if os.path.isfile(file_path):
            return cached_tag_set_file(file_path)  # more caching!
    tag_set = local_mc.tagSet(tag_sets_id)
    # page through tags, 100 at a time, until an empty page comes back
    all_tags = []
    last_tags_id = 0
    while True:
        tags = _cached_tag_page(mc_api_key, tag_set['tag_sets_id'], last_tags_id, 100, only_public_tags)
        if len(tags) == 0:
            break
        all_tags.extend(tags)  # in-place extend avoids re-copying the whole list on every page
        last_tags_id = tags[-1]['tags_id']
    # double check the show_on_media because that controls public or not
    tag_list = [
        t for t in all_tags
        if (only_public_tags is False) or (t['show_on_media'] == 1 or t['show_on_media'] is True)
    ]
    # sort by label (or tag if no label exists); a key function keeps the tag dicts untouched,
    # so no temporary 'sort_key' entry is ever written onto objects that may be shared with a cache
    tag_list = sorted(tag_list, key=lambda t: (t['label'] or t['tag']).lower())
    tag_set['tags'] = tag_list
    tag_set['name'] = tag_set['label']
    return tag_set
def _as_query_and_filter_query(cls, query: str, start_date: dt.datetime, end_date: dt.datetime,
                               **kwargs) -> (str, str):
    """
    Turn the search parameters into the q and fq strings for a media cloud solr-syntax query.

    :param query: the user's search terms, combined with any media/tag ids into ``q``
    :param start_date: first date handed to MediaCloud.dates_as_query_clause
    :param end_date: second date handed to MediaCloud.dates_as_query_clause
    :param kwargs: optional 'sources' and 'collections'; each falls back to an empty list
    :return: tuple of (q, fq)
    """
    media_ids = kwargs.get('sources', [])
    tags_ids = kwargs.get('collections', [])
    solr_query = concatenate_query_for_solr(query, media_ids, tags_ids)
    date_clause = MediaCloud.dates_as_query_clause(start_date, end_date)
    return solr_query, date_clause
To Install: >>> import nltk >>> nltk.download() [ select d for Download ] [ enter "stopwords" as the identifier ] [ enter "punkt" as the identifier ] ''' config = ConfigParser.ConfigParser() config.read('mc-client.config') # set up a connection to a local DB db = StoryDatabase('mediacloud', config.get('db','host'), config.get('db','port') ) # connect to MC and fetch some articles mc = MediaCloud( config.get('api','user'), config.get('api','pass') ) results = mc.recentStories() print "Fetched "+str(len(results))+" stories" # set up my callback function that adds readability score to the story pub.subscribe(mediacloud.examples.addFleshKincaidGradeLevelToStory, StoryDatabase.EVENT_PRE_STORY_SAVE) # save all the stories in the db (this will fire the callback above) saved = 0 for story in results: worked = db.addStory(story) if worked: saved = saved + 1 print "Saved "+str(saved)+" stories"
STORIES_TO_FETCH = 100

config = ConfigParser.ConfigParser()
config.read('mc-client.config')

# log to a file at DEBUG level and write a separator line for this run
logging.basicConfig(filename='mc-realtime.log',level=logging.DEBUG)
log = logging.getLogger('mc-realtime')
log.info("---------------------------------------------------------------------------")

# open the local story database using the host/port from the config file
db_host = config.get('db','host')
db_port = config.get('db','port')
db = StoryDatabase('mediacloud', db_host, db_port)

# create the mediacloud client from the configured api credentials
api_user = config.get('api','user')
api_pass = config.get('api','pass')
mc = MediaCloud(api_user, api_pass)

# ask for up to STORIES_TO_FETCH stories after the highest story id already in the db
max_story_id = db.getMaxStoryId()
results = mc.storiesSince(max_story_id, STORIES_TO_FETCH)
log.info("Fetched "+str(len(results))+" stories (after "+str(max_story_id)+")")

# register the pre-save callbacks, in order: word count, language guess, reading grade level
for story_callback in (
    mediacloud.examples.addWordCountToStory,
    mediacloud.examples.addIsEnglishToStory,
    mediacloud.examples.addFleshKincaidGradeLevelToStory,
):
    pub.subscribe(story_callback, StoryDatabase.EVENT_PRE_STORY_SAVE)

# save all the stories in the db
# setup logging logging.basicConfig(filename='mc-realtime.log',level=logging.DEBUG) log = logging.getLogger('mc-realtime') log.info("---------------------------------------------------------------------------") # setup a connection to a local DB of articles #db = StoryDatabase('mediacloud', config.get('db','host'), config.get('db','port') ) articles_db = StoryDatabase('articles', config.get('db','host'), config.get('db','port') ) # setup a connection to a local DB of twitter accounts #accounts_db = StoryDatabase('accounts', config.get('db','host'), config.get('db','port') ) server = couchdb.Server() accounts_db = server['accounts'] # setup the mediacloud connection mc = MediaCloud( config.get('api','user'), config.get('api','pass') ) # Must first seed database with latest Story ID, or else it will start at the beginning (2005) max_story_id = articles_db.getMaxStoryId() results = mc.storiesSince( max_story_id, STORIES_TO_FETCH, fetch_raw_text = True ) log.info("Fetched "+str(len(results))+" stories (after "+str(max_story_id)+")") # set up a callback function that adds twitter username occurrences to the story pub.subscribe(mediacloud.examples.addTwitterReferencesToStory, StoryDatabase.EVENT_PRE_STORY_SAVE) # save all the stories in the db saved = 0 for story in results: print 'new story', worked = articles_db.addStory(story) if worked:
def testTwitterReferences(self):
    """Check the twitter usernames extracted from a known story's raw download."""
    story_id = 88848862
    mc = MediaCloud(None, None, True)
    story = mc.storyDetail(story_id)
    twitter_references = mediacloud.examples.twitter_references(story['first_raw_download_file'])
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(twitter_references, ['natematias', 'thornet', 'lucyfedia', 'okfn'])
# setup a connection to the DB
try:
    db = MongoStoryDatabase(config.get('db','name'),config.get('db','host'),int(config.get('db','port')))
except pymongo.errors.ConnectionFailure as e:
    # `except X as e` is valid on Python 2.6+ and 3; the old comma form is Python 2 only
    log.error(e)
    sys.exit(1)  # non-zero status so callers can tell the run failed
log.info("Connected to "+config.get('db','name')+" on "+config.get('db','host')+":"+str(config.get('db','port')))

# setup a connection to the geocoder
worked = clavin.connect()
if not worked:
    # record why we are stopping instead of exiting silently
    log.error("Unable to connect to the geocoder")
    sys.exit(1)

# setup the mediacloud connection
mc = MediaCloud( config.get('api','user'), config.get('api','pass') )

# set up my callback function that adds locations to the story
pub.subscribe(mcgeo.algorithms.addLocationsToStory, StoryDatabase.EVENT_PRE_STORY_SAVE)

# save all the stories in the db (this will fire the callback above)
saved = 0
# NOTE(review): the first page requested is the configured first_page + 2
# (+1 here and another +1 below) -- confirm that skipping is intended
first_page = int(config.get('api','first_page'))+1
for page in xrange(MAX_PAGES_TO_FETCH):
    query_page = first_page+(page+1)
    results = mc.allProcessed(query_page)
    log.info("Fetched "+str(len(results))+" stories (page "+str(query_page)+")")
    for story in results:
        worked = db.addStory(story)
        if worked:
            saved = saved + 1
def testWordCount(self):
    """Check the word count computed from a known story's description."""
    story_id = 88848861
    mc = MediaCloud(None,None,True)
    story = mc.storyDetail(story_id)
    word_count = mediacloud.examples.getWordCount(story['description'])
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(word_count, 10436)
def testStoryDetailForReal(self):
    """Check that storyDetail, using the configured api credentials, returns the requested story."""
    mc = MediaCloud(self._config.get("api", "user"), self._config.get("api", "pass"))
    story_id = 88848861
    story = mc.storyDetail(story_id)
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(story["stories_id"], story_id)
def testStoryDetail(self):
    """Check that storyDetail returns the story whose id was requested.

    The client is built as ``MediaCloud(None, None, True)``; presumably the
    third argument selects canned/offline data -- confirm against the
    MediaCloud constructor.
    """
    story_id = 88848861
    mc = MediaCloud(None, None, True)
    story = mc.storyDetail(story_id)
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(story["stories_id"], story_id)
def testRecentStories(self):
    """Check that recentStories with no arguments returns 30 stories.

    The client is built as ``MediaCloud(None, None, True)``; presumably the
    third argument selects canned/offline data -- confirm against the
    MediaCloud constructor.
    """
    mc = MediaCloud(None, None, True)
    stories = mc.recentStories()
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(len(stories), 30)
def get_mc_client():
    """Build a new MediaCloud client from the module-level MC_API_KEY."""
    client = MediaCloud(MC_API_KEY)
    return client