def setUp(self):
    """Create a Storage bound to a dedicated test database.

    The test database is named "test" + <last path segment of
    settings.MONGODB_URI>; everything before that segment is reused
    unchanged to build the test URI.
    """
    # Split once on the last "/": left part is the URI prefix, right part
    # is what the configured URI uses as database name.
    uri_prefix, _, configured_db = settings.MONGODB_URI.rpartition("/")
    self.test_db = "test" + configured_db
    self.testing_storage = Storage(uri=uri_prefix + "/" + self.test_db)
class MrClean(Collector):
    """
    Delete the articles whose referenced dates are all older than 1 week.
    """

    def __init__(self):
        super(MrClean, self).__init__()
        self.storage = Storage()

    @staticmethod
    def _as_date(ref_date):
        # Turn ref_date['date'] (year, month, day) into a datetime.date.
        # A missing month falls back to December and a missing day to the
        # last day of that month.
        parts = ref_date['date']
        year = parts[0]
        month = parts[1] if parts[1] else 12
        day = parts[2]
        if not day:
            day = calendar.monthrange(year, month)[1]
        return datetime.date(year, month, day)

    def run(self, **kwargs):
        """
        Remove every stored article that has no referenced date within the
        last 7 days, then build and save a report listing what was removed.
        """
        articles = self.storage.get_articles()
        limit = datetime.date.today() - datetime.timedelta(days=7)
        purged = []
        for article in articles:
            # any() stops at the first recent date, like the original break.
            still_relevant = any(
                self._as_date(ref_date) >= limit
                for ref_date in article.ref_dates
            )
            if still_relevant:
                continue
            purged.append(article.__dict__)
            self.storage.remove_article(article._id)
        self.set_report(
            status="done",
            count=len(purged),
            removed_articles=purged)
        self.storage.save_report(self.get_report())
class MrClean(Collector):
    """
    Delete the articles whose referenced dates are all older than 1 week.
    """

    def __init__(self):
        super(MrClean, self).__init__()
        self.storage = Storage()

    @staticmethod
    def _as_date(ref_date):
        # Turn ref_date['date'] (year, month, day) into a datetime.date.
        # A missing month falls back to December and a missing day to the
        # last day of that month.
        parts = ref_date['date']
        year = parts[0]
        month = parts[1] if parts[1] else 12
        day = parts[2]
        if not day:
            day = calendar.monthrange(year, month)[1]
        return datetime.date(year, month, day)

    def run(self, **kwargs):
        """
        Remove every stored article that has no referenced date within the
        last 7 days, then build and save a report listing what was removed.
        """
        articles = self.storage.get_articles()
        limit = datetime.date.today() - datetime.timedelta(days=7)
        purged = []
        for article in articles:
            # any() stops at the first recent date, like the original break.
            still_relevant = any(
                self._as_date(ref_date) >= limit
                for ref_date in article.ref_dates
            )
            if still_relevant:
                continue
            purged.append(article.__dict__)
            self.storage.remove_article(article._id)
        self.set_report(
            status="done",
            count=len(purged),
            removed_articles=purged)
        self.storage.save_report(self.get_report())
def __init__(self, channels, year, month=None, day=None, report_extra=None,
             use_storage=False, force_collect=False):
    """
    Prepare a collect of articles for a searched date.

    channels      : passed to brokenpromises.channels.perform_channels_import;
                    each object it yields is instantiated without arguments
    year, month,
    day           : searched date; a truthy part is converted with int(),
                    an empty (or zero) part is stored as None
    report_extra  : stored as-is on self.report_extra; when omitted, a new
                    empty dict is created for this instance
    use_storage   : when truthy, a Storage() is created, otherwise
                    self.storage is None
    force_collect : if use_storage is enable, force the collect even if there
                    is already a report for this searched date
    """
    super(CollectArticles, self).__init__()
    self.use_storage = use_storage
    self.channels = [
        channel_class()
        for channel_class in brokenpromises.channels.perform_channels_import(channels)
    ]
    # Same result as the former `x and int(x) or None`: empty and zero
    # parts both collapse to None.
    self.date = tuple(
        (int(part) or None) if part else None
        for part in (year, month, day)
    )
    self.force_collect = force_collect
    self.storage = Storage() if self.use_storage else None
    # A fresh dict per instance: a `{}` default is created once and would be
    # shared (and mutated) across every collector built without report_extra.
    self.report_extra = {} if report_extra is None else report_extra
def tearDown(self):
    """Drop the database named self.test_db."""
    # The connection comes from a Storage built with its default arguments.
    connection = Storage().get_connection()
    connection.drop_database(self.test_db)
class TestOperations(unittest.TestCase):
    '''Tests for CollectArticles: collect, storage, queue and date extraction.'''

    def setUp(self):
        # Build a Storage on "test" + <configured db name>, reusing the rest
        # of settings.MONGODB_URI unchanged.
        uri_prefix, _, configured_db = settings.MONGODB_URI.rpartition("/")
        self.test_db = "test" + configured_db
        self.testing_storage = Storage(uri=uri_prefix + "/" + self.test_db)

    def tearDown(self):
        # Drop the test database through a default Storage connection.
        Storage().get_connection().drop_database(self.test_db)

    def _check_collect(self, collector, results):
        # Assertions shared by the collect tests: results are not empty, every
        # result has referenced dates and the report agrees with the results.
        # The two print calls emit the same text on Python 2 and Python 3.
        print("")
        print("results: %d" % len(results))
        assert len(results) > 0
        for result in results:
            assert result.ref_dates, "%s : %s" % (result, result.url)
        report = collector.get_report()
        assert report
        assert report.collector == "brokenpromises.operations.CollectArticles", report.collector
        assert report.meta['count'] == len(results)
        assert report.meta['related_articles'] <= len(results)
        assert len(report.meta['urls_found']) == len(results)

    def test_get_articles(self):
        """Collect on the guardian channel for 2014-01, without storage."""
        collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", "1")
        results = collector.run()
        self._check_collect(collector, results)

    def test_get_articles_with_storage(self):
        """Collect twice on the nytimes channel with the testing storage."""
        from brokenpromises import Article
        searched_date = (2014, 1, None)
        collector = CollectArticles(("brokenpromises.channels.nytimes",), *searched_date, use_storage=True)
        # replace storage with custom storage (testing db)
        collector.storage = self.testing_storage
        results = collector.run()
        self._check_collect(collector, results)
        get_reports = self.testing_storage.get_reports
        assert len(get_reports(name="collector", searched_date=searched_date, status="done")) == 1, \
            get_reports(searched_date)
        # The second run must still return Article objects and is expected to
        # add exactly one more report, with status "escaped".
        results = collector.run()
        assert len(results) > 0, results
        assert type(results[0]) is Article, type(results[0])
        assert len(get_reports(searched_date=searched_date)) == 2
        assert len(get_reports(name="collector", searched_date=searched_date)) == 2
        assert len(get_reports(name="collector", searched_date=searched_date, status="escaped")) == 1

    def test_get_articles_with_queue(self):
        """Run a collector through the worker."""
        # need to explicitly import the runnable object
        from brokenpromises.operations import CollectArticles
        from brokenpromises.worker import worker
        collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", 1, use_storage=False)
        worker.run(collector)

    def test_retrieve_referenced_dates(self):
        """Every sample date must be extracted once, with the expected value."""
        # (text to find in the document, expected (year, month, day))
        dates = (
            ("10 October 2013", (2013, 10, 10)),
            ("10 october, 2013", (2013, 10, 10)),
            ("4 by October 2013", (2013, 10, 4)),
            ("10 by October 2013", (2013, 10, 10)),
            ("10 by October, 2013", (2013, 10, 10)),
            ("Jan 2014", (2014, 1, None)),
            ("10 in October 2013", (2013, 10, 10)),
            ("10 in October, 2013", (2013, 10, 10)),
            ("10 of October 2013", (2013, 10, 10)),
            ("10 of October, 2013", (2013, 10, 10)),
            ("10th October 2013", (2013, 10, 10)),
            ("10th by October 2013", (2013, 10, 10)),
            ("10th by October, 2013", (2013, 10, 10)),
            ("10th in october 2013", (2013, 10, 10)),
            ("10th in October, 2013", (2013, 10, 10)),
            ("10th of October 2013", (2013, 10, 10)),
            ("10th of October, 2013", (2013, 10, 10)),
            ("2013-10-10", (2013, 10, 10)),
            ("2013/10/10", (2013, 10, 10)),
            ("August, 2013", (2013, 8, None)),
            ("2013", (2013, None, None)),
            ("November 04, 2013", (2013, 11, 4)),
            ("November 4, 2013", (2013, 11, 4)),
        )
        text = " bla bli 123. Bu \n pouet12 \n 12412 ".join([sample[0] for sample in dates])
        refs = CollectArticles.retrieve_referenced_dates(text)
        date_found = [ref['extracted_date'] for ref in refs]
        for expected_text, expected_date in dates:
            # First reference whose extracted text is exactly the sample.
            # Only "no match" is reported as not found; any other error
            # (e.g. a missing key) now surfaces as itself instead of being
            # swallowed by a bare except.
            ref = next(
                (candidate for candidate in refs
                 if candidate["extracted_date"] == expected_text),
                None)
            if ref is None:
                raise Exception("\"%s\" not found in document. Date found:\n%s" % (
                    expected_text, "\n".join(date_found)))
            assert ref['extracted_date'] in expected_text
            assert ref['date'] == expected_date, "%s != %s" % (ref['date'], expected_date)
            date_found.remove(ref['extracted_date'])
        # Whatever is left in date_found was extracted but never expected.
        assert len(refs) == len(dates), "%s != %s\nToo much : %s" % (
            len(refs), len(dates), date_found)
def __init__(self):
    """Initialise the parent class, then attach a default Storage."""
    super(MrClean, self).__init__()
    default_storage = Storage()
    self.storage = default_storage
def setUp(self):
    """Create a Storage bound to a dedicated test database.

    The test database is named "test" + <last path segment of
    settings.MONGODB_URI>; everything before that segment is reused
    unchanged to build the test URI.
    """
    # Split once on the last "/": left part is the URI prefix, right part
    # is what the configured URI uses as database name.
    uri_prefix, _, configured_db = settings.MONGODB_URI.rpartition("/")
    self.test_db = "test" + configured_db
    self.testing_storage = Storage(uri=uri_prefix + "/" + self.test_db)
class TestOperations(unittest.TestCase):
    '''Tests for CollectArticles: collect, storage, queue and date extraction.'''

    def setUp(self):
        # Build a Storage on "test" + <configured db name>, reusing the rest
        # of settings.MONGODB_URI unchanged.
        uri_prefix, _, configured_db = settings.MONGODB_URI.rpartition("/")
        self.test_db = "test" + configured_db
        self.testing_storage = Storage(uri=uri_prefix + "/" + self.test_db)

    def tearDown(self):
        # Drop the test database through a default Storage connection.
        Storage().get_connection().drop_database(self.test_db)

    def _check_collect(self, collector, results):
        # Assertions shared by the collect tests: results are not empty, every
        # result has referenced dates and the report agrees with the results.
        # The two print calls emit the same text on Python 2 and Python 3.
        print("")
        print("results: %d" % len(results))
        assert len(results) > 0
        for result in results:
            assert result.ref_dates, "%s : %s" % (result, result.url)
        report = collector.get_report()
        assert report
        assert report.collector == "brokenpromises.operations.CollectArticles", report.collector
        assert report.meta['count'] == len(results)
        assert report.meta['related_articles'] <= len(results)
        assert len(report.meta['urls_found']) == len(results)

    def test_get_articles(self):
        """Collect on the guardian channel for 2014-01, without storage."""
        collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", "1")
        results = collector.run()
        self._check_collect(collector, results)

    def test_get_articles_with_storage(self):
        """Collect twice on the nytimes channel with the testing storage."""
        from brokenpromises import Article
        searched_date = (2014, 1, None)
        collector = CollectArticles(("brokenpromises.channels.nytimes",), *searched_date, use_storage=True)
        # replace storage with custom storage (testing db)
        collector.storage = self.testing_storage
        results = collector.run()
        self._check_collect(collector, results)
        get_reports = self.testing_storage.get_reports
        assert len(get_reports(name="collector", searched_date=searched_date, status="done")) == 1, \
            get_reports(searched_date)
        # The second run must still return Article objects and is expected to
        # add exactly one more report, with status "escaped".
        results = collector.run()
        assert len(results) > 0, results
        assert type(results[0]) is Article, type(results[0])
        assert len(get_reports(searched_date=searched_date)) == 2
        assert len(get_reports(name="collector", searched_date=searched_date)) == 2
        assert len(get_reports(name="collector", searched_date=searched_date, status="escaped")) == 1

    def test_get_articles_with_queue(self):
        """Run a collector through the worker."""
        # need to explicitly import the runnable object
        from brokenpromises.operations import CollectArticles
        from brokenpromises.worker import worker
        collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", 1, use_storage=False)
        worker.run(collector)

    def test_retrieve_referenced_dates(self):
        """Every sample date must be extracted once, with the expected value."""
        # (text to find in the document, expected (year, month, day))
        dates = (
            ("10 October 2013", (2013, 10, 10)),
            ("10 october, 2013", (2013, 10, 10)),
            ("4 by October 2013", (2013, 10, 4)),
            ("10 by October 2013", (2013, 10, 10)),
            ("10 by October, 2013", (2013, 10, 10)),
            ("Jan 2014", (2014, 1, None)),
            ("10 in October 2013", (2013, 10, 10)),
            ("10 in October, 2013", (2013, 10, 10)),
            ("10 of October 2013", (2013, 10, 10)),
            ("10 of October, 2013", (2013, 10, 10)),
            ("10th October 2013", (2013, 10, 10)),
            ("10th by October 2013", (2013, 10, 10)),
            ("10th by October, 2013", (2013, 10, 10)),
            ("10th in october 2013", (2013, 10, 10)),
            ("10th in October, 2013", (2013, 10, 10)),
            ("10th of October 2013", (2013, 10, 10)),
            ("10th of October, 2013", (2013, 10, 10)),
            ("2013-10-10", (2013, 10, 10)),
            ("2013/10/10", (2013, 10, 10)),
            ("August, 2013", (2013, 8, None)),
            ("2013", (2013, None, None)),
            ("November 04, 2013", (2013, 11, 4)),
            ("November 4, 2013", (2013, 11, 4)),
        )
        text = " bla bli 123. Bu \n pouet12 \n 12412 ".join([sample[0] for sample in dates])
        refs = CollectArticles.retrieve_referenced_dates(text)
        date_found = [ref['extracted_date'] for ref in refs]
        for expected_text, expected_date in dates:
            # First reference whose extracted text is exactly the sample.
            # Only "no match" is reported as not found; any other error
            # (e.g. a missing key) now surfaces as itself instead of being
            # swallowed by a bare except.
            ref = next(
                (candidate for candidate in refs
                 if candidate["extracted_date"] == expected_text),
                None)
            if ref is None:
                raise Exception("\"%s\" not found in document. Date found:\n%s" % (
                    expected_text, "\n".join(date_found)))
            assert ref['extracted_date'] in expected_text
            assert ref['date'] == expected_date, "%s != %s" % (ref['date'], expected_date)
            date_found.remove(ref['extracted_date'])
        # Whatever is left in date_found was extracted but never expected.
        assert len(refs) == len(dates), "%s != %s\nToo much : %s" % (
            len(refs), len(dates), date_found)
from flask import (Flask, render_template, request, send_file,
                   send_from_directory, Response, abort, session, redirect,
                   url_for, make_response, json)
from flask.ext.assets import Environment
from flask.ext.login import LoginManager, login_user, login_required, UserMixin, logout_user
from flask.ext.cache import Cache
from rq_dashboard import RQDashboard
from brokenpromises.storage import Storage
from brokenpromises.channels import get_available_channels
from brokenpromises.operations import CollectArticlesAndSendEmail
from brokenpromises.worker import worker
import os
import datetime

# Storage instance created once, when this module is imported.
STORAGE = Storage()


class CustomFlask(Flask):
    """Flask application using [% %], [[ ]] and [# #] as Jinja delimiters."""

    # Start from Flask's own options and override only the six delimiters.
    jinja_options = dict(
        Flask.jinja_options,
        block_start_string='[%',
        block_end_string='%]',
        variable_start_string='[[',
        variable_end_string=']]',
        comment_start_string='[#',
        comment_end_string='#]',
    )


app = CustomFlask(__name__)
# The configuration file path is read from the WEBAPP_SETTINGS environment variable.
app.config.from_envvar("WEBAPP_SETTINGS")