def test_get_article_attri():
    '''
    Smoke test for MyMongo.get_article_attri on the 'nytimes' database.

    Prints the number of items retrieved and a five-element preview of each
    of the two returned collections, closes the connection, and asserts
    that more than two items were retrieved.
    - INPUT: None
    - OUTPUT: None
    '''
    # inspect.stack()[0][3] is the name of the function currently executing.
    current_name = inspect.stack()[0][3]
    print('\nfunction: %s ' % current_name)

    mongo = MyMongo(dbname='nytimes')
    attributes, pub_dates = mongo.get_article_attri(testing=1)

    n_items = len(attributes)
    print('%i items retrieved' % (n_items))

    previews = (
        ('title:', attributes),
        ('publication date', pub_dates),
    )
    for label, values in previews:
        print(label)
        print(zip(values)[:5])

    mongo.close()
    n.assert_greater(n_items, 2)
def read_articles():
    '''
    read all articles that have body text as a dataframe from mongodb
        - INPUT: None
        - OUTPUT: df. columns:
            url        key of each cleaned article
            body_text  element [1] of each cleaned article value
            title      looked up by url, 'Unknown' when missing
            uri        host part of the url (via parse_url)
            dt         looked up by url, '' when missing
    A single MyMongo connection is used for every query and it is always
    closed before returning, even when one of the steps raises.
    '''
    my_mongo = MyMongo()
    try:
        t0 = time.time()
        cur_articles = my_mongo.get_article_body_text(testing=0)

        # clean_articles fills this dict in place (url -> cleaned record).
        articles_cleaned = {}
        clean_articles(cur_articles, articles_cleaned)
        print('%d unique articles with body_text' % len(articles_cleaned))

        t1 = time.time()  # time it
        print("finished in %4.4fmin for %s " %
              ((t1 - t0) / 60, 'read/clean articles'))

        df = pd.DataFrame([{'url': k, 'body_text': v[1]}
                           for k, v in articles_cleaned.items()])

        # Reuse the open connection instead of creating a second MyMongo()
        # that would never be closed.
        article_dict, article_dt = my_mongo.get_article_attri()

        df['title'] = df['url'].map(lambda x: article_dict.get(x, 'Unknown'))
        df['uri'] = df['url'].map(lambda x: parse_url(x).host)
        df['dt'] = df['url'].map(lambda x: article_dt.get(x, ''))
        return df
    finally:
        # Runs on both the success and the error path.
        my_mongo.close()
import sys # sys.path.append('../db') from configobj import ConfigObj config = ConfigObj('../allds.config') allds_home = config['allDS_home'] sys.path.append(allds_home + '/code/db') from my_mongo import MyMongo from httplib import BadStatusLine from boilerpipe.extract import Extractor if __name__ == '__main__': my_mongo = MyMongo() query = {'raw_html': {'$exists': 1}, 'body_text': {'$exists': 0}} cur_articles = my_mongo.get_articles(query=query) articles = list(cur_articles) print '%d articles to be processed. ' % (len(articles)) for a in articles: try: extractor = Extractor( extractor='ArticleExtractor', html=a['raw_html']) extracted_text = extractor.getText() l = extracted_text.split('\n') a_id = a['_id'] my_mongo.update_record( 'articles', a_id, 'body_text', extracted_text) # print(extracted_text)