示例#1
0
 def testStoriesSince(self):
     """Check that storiesSince() only returns stories newer than the given id.

     The client is built as ``MediaCloud(None, None, True)``; presumably the
     third argument selects an offline/fixture mode (confirm against the
     MediaCloud constructor), which is why a fixed count of 15 is expected.
     """
     story_id = 88848861
     mc = MediaCloud(None, None, True)
     stories = mc.storiesSince(story_id)
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(len(stories), 15)
     for story in stories:
         # ids are converted with int() before comparing against story_id
         self.assertTrue(int(story["stories_id"]) > story_id)
示例#2
0
def tag_set_with_tags(mc_api_key, tag_sets_id, only_public_tags=False, use_file_cache=False):
    """
    Fetch a tag set together with all of its tags, sorted for display.

    :param mc_api_key: API key used to build the MediaCloud client
    :param tag_sets_id: id of the tag set to load
    :param only_public_tags: unless this is exactly False, keep only tags whose
        ``show_on_media`` equals 1 (which also matches True)
    :param use_file_cache: when truthy, return the static cached file for this
        tag set if one exists on disk
    :return: the cached file's content, or the tag set dict with ``tags``
        (sorted list) and ``name`` (copy of ``label``) added
    """
    # don't need to cache here, because either you are reading from a file, or each page is cached
    local_mc = MediaCloud(mc_api_key)
    if use_file_cache:
        file_name = "tags_in_{}.json".format(tag_sets_id)
        file_path = os.path.join(static_tag_set_cache_dir, file_name)
        if os.path.isfile(file_path):
            return cached_tag_set_file(file_path)   # more caching!
    tag_set = local_mc.tagSet(tag_sets_id)
    # page through tags, 100 at a time, until an empty page comes back
    all_tags = []
    last_tags_id = 0
    while True:
        tags = _cached_tag_page(tag_set['tag_sets_id'], last_tags_id, 100, only_public_tags)
        if not tags:
            break
        # extend in place instead of rebuilding the whole list on every page
        all_tags.extend(tags)
        last_tags_id = tags[-1]['tags_id']
    # double check the show_on_media because that controls public or not
    # compare by value: `is 1` tests object identity, which only works by
    # accident of small-int caching; `== 1` also matches True since True == 1
    tag_list = [t for t in all_tags
                if only_public_tags is False or t['show_on_media'] == 1]
    # sort by label (or tag if no label exists); a key function avoids writing a
    # temporary 'sort_key' entry into tag dicts that come from the page cache
    tag_list.sort(key=lambda t: (t['label'] or t['tag']).lower())
    tag_set['tags'] = tag_list
    tag_set['name'] = tag_set['label']
    return tag_set
示例#3
0
 def testStoriesSinceForReal(self):
     """Check storiesSince() using a client built from the configured API credentials.

     Expects exactly ``mc.DEFAULT_STORY_COUNT`` stories, each with an id greater
     than ``story_id`` and at most ``DEFAULT_STORY_COUNT`` above it.
     """
     mc = MediaCloud(self._config.get("api", "user"), self._config.get("api", "pass"))
     story_id = 88848861
     stories = mc.storiesSince(story_id)
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(len(stories), mc.DEFAULT_STORY_COUNT)
     for story in stories:
         self.assertTrue(int(story["stories_id"]) > story_id)
         self.assertTrue(int(story["stories_id"]) - story_id <= mc.DEFAULT_STORY_COUNT)
 def testIsEnglish(self):
     """isEnglish() should accept the fetched story's text and reject a Spanish sentence."""
     client = MediaCloud(None,None,True)
     story = client.storyDetail(88848861)
     self.assertTrue(mediacloud.examples.isEnglish(story['story_text']))
     # TODO: find a real example in the MC corpus
     # until then, overwrite the story text with a hand-written Spanish sentence
     story['story_text'] = "Esto es un otro cuenta en espanol"
     self.assertFalse(mediacloud.examples.isEnglish(story['story_text']))
 def testFleshKincaidGradeLevel(self):
     """Check getFleshKincaidGradeLevel() on a story's text and on empty/None input.

     The story text is expected to round to grade level 8; an empty string and
     None are both expected to yield None.
     """
     story_id = 88848861
     mc = MediaCloud(None,None,True)
     story = mc.storyDetail(story_id)
     fkLevel = mediacloud.examples.getFleshKincaidGradeLevel(story['story_text'])
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(round(fkLevel), 8)
     fkLevel = mediacloud.examples.getFleshKincaidGradeLevel("")
     # identity check: `== None` can be fooled by objects overriding __eq__
     self.assertTrue(fkLevel is None)
     fkLevel = mediacloud.examples.getFleshKincaidGradeLevel(None)
     self.assertTrue(fkLevel is None)
示例#6
0
 def testRecentStoriesForReal(self):
     """Check recentStories() using a client built from the configured API credentials.

     Covers the default story count, an explicit limit of 10, and the second
     positional argument that controls whether ``first_raw_download_file`` is
     present on each story.
     """
     mc = MediaCloud(self._config.get("api", "user"), self._config.get("api", "pass"))
     # test basic fetch
     stories = mc.recentStories()
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(len(stories), mc.DEFAULT_STORY_COUNT)
     # test story limit
     stories = mc.recentStories(10)
     self.assertEqual(len(stories), 10)
     for story in stories:
         # `in` replaces dict.has_key(), which no longer exists in Python 3
         self.assertFalse("first_raw_download_file" in story)
     # test raw download option
     stories = mc.recentStories(10, True)
     for story in stories:
         self.assertTrue("first_raw_download_file" in story)
def month_sample_worker(job):
    """
    Fetch one page of stories published during a single month of 2019.

    :param job: dict with 'q' (query), 'month' (month number) and 'page_size'
    :return: whatever cached_story_page returns for that query and date filter
    """
    query, month, page_size = job['q'], job['month'], job['page_size']
    first_day = datetime.date(2019, month, 1)
    # monthrange returns (weekday of the 1st, number of days in the month)
    _, last_day_number = calendar.monthrange(first_day.year, first_day.month)
    last_day = first_day.replace(day=last_day_number)
    date_filter = MediaCloud.publish_date_query(first_day, last_day)
    return cached_story_page(query, date_filter, page_size)
示例#8
0
def tag_set_with_tags(mc_api_key,
                      tag_sets_id,
                      only_public_tags=False,
                      use_file_cache=False):
    """
    Load a tag set and attach its complete, label-sorted list of tags.

    :param mc_api_key: API key for the MediaCloud client and the paged tag lookups
    :param tag_sets_id: id of the tag set to load
    :param only_public_tags: unless exactly False, keep only tags whose
        show_on_media is 1 or True
    :param use_file_cache: when truthy, return the static cached file if it exists
    :return: the cached file content, or the tag set dict with 'tags' and 'name' added
    """
    # no caching needed at this level: results come from a static file or from cached pages
    local_mc = MediaCloud(mc_api_key)
    if use_file_cache:
        cache_path = os.path.join(static_tag_set_cache_dir,
                                  "tags_in_{}.json".format(tag_sets_id))
        if os.path.isfile(cache_path):
            return cached_tag_set_file(cache_path)  # more caching!
    tag_set = local_mc.tagSet(tag_sets_id)

    # keep requesting pages of 100 until an empty page signals the end
    all_tags = []
    last_tags_id = 0
    while True:
        page = _cached_tag_page(mc_api_key, tag_set['tag_sets_id'],
                                last_tags_id, 100, only_public_tags)
        all_tags = all_tags + page
        if len(page) == 0:
            break
        last_tags_id = page[-1]['tags_id']

    # show_on_media controls whether a tag is public, so re-check it here
    if only_public_tags is False:
        tag_list = list(all_tags)
    else:
        tag_list = [
            tag for tag in all_tags
            if tag['show_on_media'] == 1 or tag['show_on_media'] is True
        ]

    # order by lower-cased label, falling back to the tag when the label is empty;
    # the temporary key is removed again once the list is sorted
    for tag in tag_list:
        tag['sort_key'] = (tag['label'] or tag['tag']).lower()
    tag_list.sort(key=itemgetter('sort_key'))
    for tag in tag_list:
        del tag['sort_key']

    tag_set['tags'] = tag_list
    tag_set['name'] = tag_set['label']
    return tag_set
示例#9
0
 def _as_query_and_filter_query(cls, query: str, start_date: dt.datetime,
                                end_date: dt.datetime,
                                **kwargs) -> (str, str):
     """
     Build the (q, fq) pair for a media cloud solr-syntax query.
     :param query: the user's query text
     :param start_date: start of the date range passed to the date clause
     :param end_date: end of the date range passed to the date clause
     :param kwargs: optional 'sources' (media ids) and 'collections' (tags ids);
                    each defaults to an empty list when absent
     :return: tuple of (q, fq)
     """
     sources = kwargs.get('sources', [])
     collections = kwargs.get('collections', [])
     date_clause = MediaCloud.dates_as_query_clause(start_date, end_date)
     return concatenate_query_for_solr(query, sources, collections), date_clause
To Install:
>>> import nltk
>>> nltk.download()
[ select d for Download ]
[ enter "stopwords" as the identifier ]
[ enter "punkt" as the identifier ]
'''

# read connection settings from the client config file
config = ConfigParser.ConfigParser()
config.read('mc-client.config')

# open the local story database
db = StoryDatabase('mediacloud', config.get('db', 'host'), config.get('db', 'port'))

# pull a batch of recent articles from MC
mc = MediaCloud(config.get('api', 'user'), config.get('api', 'pass'))
results = mc.recentStories()
print("Fetched "+str(len(results))+" stories")

# register the readability-score callback for the pre-save event
pub.subscribe(mediacloud.examples.addFleshKincaidGradeLevelToStory, StoryDatabase.EVENT_PRE_STORY_SAVE)

# store every story (this fires the callback above) and count the successes
saved = 0
for story in results:
    worked = db.addStory(story)
    if not worked:
        continue
    saved += 1

print("Saved "+str(saved)+" stories")
# how many stories to request per run
STORIES_TO_FETCH = 100

# read connection settings from the client config file
config = ConfigParser.ConfigParser()
config.read('mc-client.config')

# log to a file and write a separator line for this run
logging.basicConfig(filename='mc-realtime.log', level=logging.DEBUG)
log = logging.getLogger('mc-realtime')
log.info("---------------------------------------------------------------------------")

# open the local story database
db = StoryDatabase('mediacloud', config.get('db', 'host'), config.get('db', 'port'))

# create the mediacloud client
mc = MediaCloud(config.get('api', 'user'), config.get('api', 'pass'))

# fetch the stories that come after the newest one already stored
max_story_id = db.getMaxStoryId()
results = mc.storiesSince(max_story_id, STORIES_TO_FETCH)
log.info("Fetched "+str(len(results))+" stories (after "+str(max_story_id)+")")

# register the pre-save callbacks, in order: word count, language guess,
# reading grade level
for callback in (
    mediacloud.examples.addWordCountToStory,
    mediacloud.examples.addIsEnglishToStory,
    mediacloud.examples.addFleshKincaidGradeLevelToStory,
):
    pub.subscribe(callback, StoryDatabase.EVENT_PRE_STORY_SAVE)

# save all the stories in the db
# Set up logging: everything at DEBUG and above goes to mc-realtime.log, and a
# separator line is written first.
logging.basicConfig(filename='mc-realtime.log',level=logging.DEBUG)
log = logging.getLogger('mc-realtime')
log.info("---------------------------------------------------------------------------")

# Set up a connection to a local DB of articles (host/port come from the
# [db] section of the config; `config` is defined outside this fragment).
#db = StoryDatabase('mediacloud', config.get('db','host'), config.get('db','port') )
articles_db = StoryDatabase('articles', config.get('db','host'), config.get('db','port') )

# Set up a connection to a local DB of twitter accounts. This goes through
# couchdb directly (Server() called with no arguments) rather than StoryDatabase.
#accounts_db = StoryDatabase('accounts', config.get('db','host'), config.get('db','port') )
server = couchdb.Server()
accounts_db = server['accounts']

# Set up the mediacloud connection with the credentials from the [api] section.
mc = MediaCloud( config.get('api','user'), config.get('api','pass') )

# Must first seed database with latest Story ID, or else it will start at the beginning (2005)
# Fetch up to STORIES_TO_FETCH stories after that id, asking for raw text too.
max_story_id = articles_db.getMaxStoryId()
results = mc.storiesSince( max_story_id, STORIES_TO_FETCH, fetch_raw_text = True )
log.info("Fetched "+str(len(results))+" stories (after "+str(max_story_id)+")")

# Register a callback on the pre-save event that adds twitter username
# occurrences to the story.
pub.subscribe(mediacloud.examples.addTwitterReferencesToStory, StoryDatabase.EVENT_PRE_STORY_SAVE)

# save all the stories in the db
saved = 0
for story in results:
    print 'new story',
    worked = articles_db.addStory(story)
    if worked:
示例#13
0
 def testTwitterReferences(self):
     """Check that twitter_references() extracts the expected usernames.

     The input is the ``first_raw_download_file`` field of story 88848862 and
     the expected result is an ordered list of four usernames.
     """
     story_id = 88848862
     mc = MediaCloud(None, None, True)
     story = mc.storyDetail(story_id)
     twitter_references = mediacloud.examples.twitter_references(story['first_raw_download_file'])
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(twitter_references, ['natematias', 'thornet', 'lucyfedia', 'okfn'])
示例#14
0
# setup a connection to the DB
try:
    db = MongoStoryDatabase(config.get('db','name'),config.get('db','host'),int(config.get('db','port')))
except pymongo.errors.ConnectionFailure as e:
    # `except X as e` is valid from Python 2.6 on; the old `except X, e` form is
    # a syntax error in Python 3.
    # NOTE(review): sys.exit() with no argument exits with status 0 even though
    # the connection failed - confirm whether callers rely on that.
    log.error(e)
    sys.exit()
log.info("Connected to "+config.get('db','name')+" on "+config.get('db','host')+":"+str(config.get('db','port')))

# setup a connection to the geocoder; stop if it reports failure
worked = clavin.connect()
if not worked:
    sys.exit()

# setup the mediacloud connection
mc = MediaCloud( config.get('api','user'), config.get('api','pass') )

# set up my callback function that adds locations to the story before it is saved
pub.subscribe(mcgeo.algorithms.addLocationsToStory, StoryDatabase.EVENT_PRE_STORY_SAVE)

# save all the stories in the db (this will fire the callback above)
saved = 0
first_page = int(config.get('api','first_page'))+1
for page in xrange(MAX_PAGES_TO_FETCH):
    # NOTE(review): first_page already has +1 applied and another +1 is added
    # here, so the first page requested is the configured first_page + 2 -
    # confirm this offset is intended.
    query_page = first_page+(page+1)
    results = mc.allProcessed(query_page)
    log.info("Fetched "+str(len(results))+" stories (page "+str(query_page)+")")
    for story in results:
        worked = db.addStory(story)
        if worked:
            saved += 1
示例#15
0
 def testWordCount(self):
     """Check that getWordCount() returns 10436 for the story's ``description`` field."""
     story_id = 88848861
     mc = MediaCloud(None,None,True)
     story = mc.storyDetail(story_id)
     word_count = mediacloud.examples.getWordCount(story['description'])
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(word_count, 10436)
示例#16
0
 def testStoryDetailForReal(self):
     """Check storyDetail() returns the requested story, using the configured API credentials."""
     mc = MediaCloud(self._config.get("api", "user"), self._config.get("api", "pass"))
     story_id = 88848861
     story = mc.storyDetail(story_id)
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(story["stories_id"], story_id)
示例#17
0
 def testStoryDetail(self):
     """Check storyDetail() returns a story whose ``stories_id`` matches the request.

     Uses a client built as ``MediaCloud(None, None, True)`` (no credentials).
     """
     story_id = 88848861
     mc = MediaCloud(None, None, True)
     story = mc.storyDetail(story_id)
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(story["stories_id"], story_id)
示例#18
0
 def testRecentStories(self):
     """Check that recentStories() returns 30 stories for a client built without credentials."""
     mc = MediaCloud(None, None, True)
     stories = mc.recentStories()
     # assertEqual is the supported name; assertEquals is a deprecated alias
     self.assertEqual(len(stories), 30)
示例#19
0
def get_mc_client():
    """Return a new MediaCloud client built with the module-level MC_API_KEY."""
    client = MediaCloud(MC_API_KEY)
    return client