Пример #1
0
 def setUp(self):
     """Create a Storage bound to a dedicated test database.

     The last "/"-separated segment of ``settings.MONGODB_URI`` is taken as
     the database name. It is prefixed with ``"test"``, stored on
     ``self.test_db``, and a ``Storage`` pointing at the rewritten URI is
     kept on ``self.testing_storage``.
     """
     # rpartition splits on the last "/" only: everything before it is the
     # base URI, everything after it is the configured database name.
     base_uri, _, db_name = settings.MONGODB_URI.rpartition("/")
     self.test_db = "test" + db_name
     self.testing_storage = Storage(uri=base_uri + "/" + self.test_db)
Пример #2
0
class MrClean(Collector):
	"""
	Remove stored articles whose referenced dates are all older than 1 week.

	An article is kept as soon as one of its referenced dates falls on or
	after the cutoff (today minus 7 days); an article without any
	referenced date is removed as well.
	"""

	def __init__(self):
		super(MrClean, self).__init__()
		self.storage = Storage()

	@staticmethod
	def _latest_possible_date(ref_date):
		"""Return the latest calendar date compatible with a partial reference.

		A missing month is read as December and a missing day as the last
		day of the resolved month.
		"""
		parts = ref_date['date']
		year  = parts[0]
		month = parts[1] or 12
		day   = parts[2] or calendar.monthrange(year, month)[1]
		return datetime.date(year, month, day)

	def run(self, **kwargs):
		"""Delete the outdated articles, then build and save the report."""
		articles = self.storage.get_articles()
		cutoff   = datetime.date.today() - datetime.timedelta(days=7)
		removed  = []
		for article in articles:
			# any() stops at the first date that is recent enough
			still_relevant = any(
				self._latest_possible_date(ref_date) >= cutoff
				for ref_date in article.ref_dates)
			if still_relevant:
				continue
			removed.append(article.__dict__)
			self.storage.remove_article(article._id)
		self.set_report(
			status           = "done",
			count            = len(removed),
			removed_articles = removed)

		self.storage.save_report(self.get_report())
Пример #3
0
class MrClean(Collector):
    """
    Purge articles that only reference dates older than 1 week.

    Partial dates are resolved to their latest possible day (missing month
    -> December, missing day -> last day of the month) before being compared
    with the cutoff.
    """
    def __init__(self):
        super(MrClean, self).__init__()
        self.storage = Storage()

    def run(self, **kwargs):
        """Remove every expired article and save a report listing them."""
        stored = self.storage.get_articles()
        one_week = datetime.timedelta(days=7)
        threshold = datetime.date.today() - one_week
        purged = []
        for article in stored:
            # An article is expired until one of its dates proves otherwise;
            # articles without any referenced date therefore stay expired.
            expired = True
            for ref_date in article.ref_dates:
                y = ref_date['date'][0]
                m = ref_date['date'][1] or 12
                d = ref_date['date'][2] or calendar.monthrange(y, m)[1]
                if datetime.date(y, m, d) >= threshold:
                    expired = False
                    break
            if expired:
                purged.append(article.__dict__)
                self.storage.remove_article(article._id)
        report_fields = dict(status="done",
                             count=len(purged),
                             removed_articles=purged)
        self.set_report(**report_fields)

        self.storage.save_report(self.get_report())
Пример #4
0
    def __init__(self,
                 channels,
                 year,
                 month=None,
                 day=None,
                 report_extra=None,
                 use_storage=False,
                 force_collect=False):
        """
        Prepare a collector for the given channels and searched date.

        channels      : handed to brokenpromises.channels.perform_channels_import;
                        every class it yields is instantiated and kept in self.channels
        year, month, day : parts of the searched date; each one is converted
                        with int(), and a missing (falsy) or zero part is stored as None
        report_extra  : extra data kept on self.report_extra; a fresh empty
                        dict is created when it is omitted (or None)
        use_storage   : when true, a Storage() is created and kept on
                        self.storage, otherwise self.storage is None
        force_collect : if use_storage is enabled, force the collect even if
                        there is already a report for this searched date
        """
        super(CollectArticles, self).__init__()
        self.use_storage = use_storage
        self.channels = [
            channel_class() for channel_class in
            brokenpromises.channels.perform_channels_import(channels)
        ]
        # "int(part) or None" keeps the historical contract: an empty part
        # and a part equal to zero both end up as None.
        self.date = tuple((int(part) or None) if part else None
                          for part in (year, month, day))
        self.force_collect = force_collect
        # A conditional expression, unlike "a and b or None", cannot fall
        # through to None if the Storage instance happens to be falsy.
        self.storage = Storage() if self.use_storage else None
        # A dict literal used as a default argument is created once and
        # shared by every instance; build a new one per instance instead.
        self.report_extra = {} if report_extra is None else report_extra
Пример #5
0
 def tearDown(self):
     """Drop the database named by ``self.test_db``.

     The connection is obtained from a newly created ``Storage`` built with
     its default arguments.
     """
     connection = Storage().get_connection()
     connection.drop_database(self.test_db)
Пример #6
0
class TestOperations(unittest.TestCase):
    '''Tests for the CollectArticles operation, using a dedicated test database.'''
    def setUp(self):
        '''Create a Storage bound to "test" + the configured database name.'''
        # Everything before the last "/" is the base URI, the rest is the
        # database name configured in settings.MONGODB_URI.
        base_uri, _, original_db = settings.MONGODB_URI.rpartition("/")
        self.test_db = "test" + original_db
        self.testing_storage = Storage(uri=base_uri + "/" + self.test_db)

    def tearDown(self):
        '''Drop the test database created for the test.'''
        Storage().get_connection().drop_database(self.test_db)

    def _assert_collect_report(self, collector, results):
        '''Checks shared by the collect tests on a run's results and report.'''
        # Single-argument print calls behave the same as the print statement
        # on Python 2 and also parse on Python 3.
        print("")
        print("results: %d" % len(results))
        assert len(results) > 0
        for result in results:
            assert result.ref_dates, "%s : %s" % (result, result.url)
        report = collector.get_report()
        assert report
        assert report.collector == "brokenpromises.operations.CollectArticles", report.collector
        assert report.meta['count'] == len(results)
        assert report.meta['related_articles'] <= len(results)
        assert len(report.meta['urls_found']) == len(results)

    def test_get_articles(self):
        '''Collect from the guardian channel without storage.'''
        collector = CollectArticles(("brokenpromises.channels.guardian", ),
                                    "2014", "1")
        results = collector.run()
        self._assert_collect_report(collector, results)

    def test_get_articles_with_storage(self):
        '''Collect twice with storage and check the reports that get saved.'''
        from brokenpromises import Article
        searched_date = (2014, 1, None)
        collector = CollectArticles(("brokenpromises.channels.nytimes", ),
                                    *searched_date,
                                    use_storage=True)
        # replace storage with custom storage (testing db)
        collector.storage = self.testing_storage
        results = collector.run()
        self._assert_collect_report(collector, results)
        done_reports = self.testing_storage.get_reports(
            name="collector", searched_date=searched_date, status="done")
        # The failure message passes searched_date by keyword, like every
        # other get_reports call in this test.
        assert len(done_reports) == 1, self.testing_storage.get_reports(
            searched_date=searched_date)
        # second run for the same searched date
        results = collector.run()
        assert len(results) > 0, results
        assert type(results[0]) is Article, type(results[0])
        assert len(
            self.testing_storage.get_reports(searched_date=searched_date)) == 2
        assert len(
            self.testing_storage.get_reports(name="collector",
                                             searched_date=searched_date)) == 2
        assert len(
            self.testing_storage.get_reports(name="collector",
                                             searched_date=searched_date,
                                             status="escaped")) == 1

    def test_get_articles_with_queue(self):
        '''Run a collector through the worker.'''
        # need to explicitly import the runnable object
        from brokenpromises.operations import CollectArticles
        from brokenpromises.worker import worker
        collector = CollectArticles(("brokenpromises.channels.guardian", ),
                                    "2014",
                                    1,
                                    use_storage=False)
        worker.run(collector)

    def test_retrieve_referenced_dates(self):
        '''Every date expression must be extracted exactly once, with the right value.'''
        # (expression as written in the text, expected (year, month, day))
        dates = (
            ("10 October 2013", (2013, 10, 10)),
            ("10 october, 2013", (2013, 10, 10)),
            ("4 by October 2013", (2013, 10, 4)),
            ("10 by October 2013", (2013, 10, 10)),
            ("10 by October, 2013", (2013, 10, 10)),
            ("Jan 2014", (2014, 1, None)),
            ("10 in October 2013", (2013, 10, 10)),
            ("10 in October, 2013", (2013, 10, 10)),
            ("10 of October 2013", (2013, 10, 10)),
            ("10 of October, 2013", (2013, 10, 10)),
            ("10th October 2013", (2013, 10, 10)),
            ("10th by October 2013", (2013, 10, 10)),
            ("10th by October, 2013", (2013, 10, 10)),
            ("10th in october 2013", (2013, 10, 10)),
            ("10th in October, 2013", (2013, 10, 10)),
            ("10th of October 2013", (2013, 10, 10)),
            ("10th of October, 2013", (2013, 10, 10)),
            ("2013-10-10", (2013, 10, 10)),
            ("2013/10/10", (2013, 10, 10)),
            ("August, 2013", (2013, 8, None)),
            ("2013", (2013, None, None)),
            ("November 04, 2013", (2013, 11, 4)),
            ("November 4, 2013", (2013, 11, 4)),
        )

        text = " bla bli 123. Bu \n pouet12 \n 12412 ".join(
            [expression for expression, _ in dates])
        refs = CollectArticles.retrieve_referenced_dates(text)
        date_found = [found['extracted_date'] for found in refs]
        for expression, expected in dates:
            # next() with a default replaces filter(...)[0] under a bare
            # except: only a missing match is reported as "not found",
            # any other error surfaces as itself.
            ref = next((candidate for candidate in refs
                        if candidate["extracted_date"] == expression), None)
            if ref is None:
                raise Exception(
                    "\"%s\" not found in document. Date found:\n%s" %
                    (expression, "\n".join(date_found)))
            assert ref['extracted_date'] in expression
            assert ref['date'] == expected, "%s != %s" % (ref['date'],
                                                          expected)
            date_found.remove(ref['extracted_date'])
        # whatever is left in date_found was extracted but never expected
        assert len(refs) == len(dates), "%s != %s\nToo much : %s" % (
            len(refs), len(dates), date_found)
Пример #7
0
 def __init__(self):
     """Run the parent initialiser, then attach a default ``Storage``."""
     super(MrClean, self).__init__()
     default_storage = Storage()
     self.storage = default_storage
Пример #8
0
	def setUp(self):
		"""Derive a test database from settings.MONGODB_URI and open a Storage on it.

		The test database name is "test" followed by the last "/"-separated
		segment of the configured URI; it is kept on self.test_db.
		"""
		segments     = settings.MONGODB_URI.split("/")
		uri_prefix   = "/".join(segments[:-1])
		self.test_db = "test" + segments[-1]
		# same prefix as the configured URI, with the database segment swapped
		self.testing_storage = Storage(uri="/".join((uri_prefix, self.test_db)))
Пример #9
0
class TestOperations(unittest.TestCase):
	'''Tests for the CollectArticles operation, using a dedicated test database.'''

	def setUp(self):
		'''Create a Storage bound to "test" + the configured database name.'''
		# Everything before the last "/" is the base URI, the rest is the
		# database name configured in settings.MONGODB_URI.
		base_uri, _, original_db = settings.MONGODB_URI.rpartition("/")
		self.test_db         = "test" + original_db
		self.testing_storage = Storage(uri=base_uri + "/" + self.test_db)

	def tearDown(self):
		'''Drop the test database created for the test.'''
		Storage().get_connection().drop_database(self.test_db)

	def _assert_collect_report(self, collector, results):
		'''Checks shared by the collect tests on a run's results and report.'''
		# Single-argument print calls behave the same as the print statement
		# on Python 2 and also parse on Python 3.
		print("")
		print("results: %d" % len(results))
		assert len(results) > 0
		for result in results:
			assert result.ref_dates, "%s : %s" % (result, result.url)
		report = collector.get_report()
		assert report
		assert report.collector                == "brokenpromises.operations.CollectArticles", report.collector
		assert report.meta['count']            == len(results)
		assert report.meta['related_articles'] <= len(results)
		assert len(report.meta['urls_found'])  == len(results)

	def test_get_articles(self):
		'''Collect from the guardian channel without storage.'''
		collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", "1")
		results   = collector.run()
		self._assert_collect_report(collector, results)

	def test_get_articles_with_storage(self):
		'''Collect twice with storage and check the reports that get saved.'''
		from brokenpromises import Article
		searched_date = (2014, 1, None)
		collector = CollectArticles(("brokenpromises.channels.nytimes",), *searched_date, use_storage=True)
		# replace storage with custom storage (testing db)
		collector.storage = self.testing_storage
		results           = collector.run()
		self._assert_collect_report(collector, results)
		done_reports = self.testing_storage.get_reports(name="collector", searched_date=searched_date, status="done")
		# The failure message passes searched_date by keyword, like every
		# other get_reports call in this test.
		assert len(done_reports) == 1, self.testing_storage.get_reports(searched_date=searched_date)
		# second run for the same searched date
		results = collector.run()
		assert len(results) > 0, results
		assert type(results[0]) is Article, type(results[0])
		assert len(self.testing_storage.get_reports(searched_date=searched_date)) == 2
		assert len(self.testing_storage.get_reports(name="collector", searched_date=searched_date)) == 2
		assert len(self.testing_storage.get_reports(name="collector", searched_date=searched_date, status="escaped")) == 1

	def test_get_articles_with_queue(self):
		'''Run a collector through the worker.'''
		# need to explicitly import the runnable object
		from brokenpromises.operations import CollectArticles
		from brokenpromises.worker     import worker
		collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", 1, use_storage=False)
		worker.run(collector)

	def test_retrieve_referenced_dates(self):
		'''Every date expression must be extracted exactly once, with the right value.'''
		# (expression as written in the text, expected (year, month, day))
		dates = (
			("10 October 2013"       , (2013, 10, 10)),
			("10 october, 2013"      , (2013, 10, 10)),
			("4 by October 2013"     , (2013, 10, 4)),
			("10 by October 2013"    , (2013, 10, 10)),
			("10 by October, 2013"   , (2013, 10, 10)),
			("Jan 2014"              , (2014, 1, None)),
			("10 in October 2013"    , (2013, 10, 10)),
			("10 in October, 2013"   , (2013, 10, 10)),
			("10 of October 2013"    , (2013, 10, 10)),
			("10 of October, 2013"   , (2013, 10, 10)),
			("10th October 2013"     , (2013, 10, 10)),
			("10th by October 2013"  , (2013, 10, 10)),
			("10th by October, 2013" , (2013, 10, 10)),
			("10th in october 2013"  , (2013, 10, 10)),
			("10th in October, 2013" , (2013, 10, 10)),
			("10th of October 2013"  , (2013, 10, 10)),
			("10th of October, 2013" , (2013, 10, 10)),
			("2013-10-10"            , (2013, 10, 10)),
			("2013/10/10"            , (2013, 10, 10)),
			("August, 2013"          , (2013, 8, None)),
			("2013"                  , (2013, None, None)),
			("November 04, 2013"     , (2013, 11, 4)),
			("November 4, 2013"      , (2013, 11, 4)),
		)

		text  = " bla bli 123. Bu \n pouet12 \n 12412 ".join([expression for expression, _ in dates])
		refs  = CollectArticles.retrieve_referenced_dates(text)
		date_found = [found['extracted_date'] for found in refs]
		for expression, expected in dates:
			# next() with a default replaces filter(...)[0] under a bare
			# except: only a missing match is reported as "not found",
			# any other error surfaces as itself.
			ref = next((candidate for candidate in refs if candidate["extracted_date"] == expression), None)
			if ref is None:
				raise Exception("\"%s\" not found in document. Date found:\n%s" % (expression, "\n".join(date_found)))
			assert ref['extracted_date'] in expression
			assert ref['date']           == expected, "%s != %s" % (ref['date'], expected)
			date_found.remove(ref['extracted_date'])
		# whatever is left in date_found was extracted but never expected
		assert len(refs) == len(dates), "%s != %s\nToo much : %s" % (len(refs), len(dates), date_found)
Пример #10
0
	def __init__(self):
		"""Initialise the parent class, then create the Storage kept on self.storage."""
		super(MrClean, self).__init__()
		storage_handle = Storage()
		self.storage   = storage_handle
Пример #11
0
from flask            import Flask, render_template, request, send_file, \
 send_from_directory, Response, abort, session, redirect, url_for, make_response, json
from flask.ext.assets import Environment
from flask.ext.login import LoginManager, login_user, login_required, UserMixin, logout_user
from flask.ext.cache import Cache
from rq_dashboard import RQDashboard

from brokenpromises.storage import Storage
from brokenpromises.channels import get_available_channels
from brokenpromises.operations import CollectArticlesAndSendEmail
from brokenpromises.worker import worker
import os
import datetime

# Module-level Storage instance, created once at import time with Storage's
# default arguments.
STORAGE = Storage()


class CustomFlask(Flask):
    """Flask application whose Jinja delimiters use square brackets.

    Blocks are written ``[% ... %]``, variables ``[[ ... ]]`` and comments
    ``[# ... #]``; every other Jinja option is inherited from ``Flask``.
    """
    # NOTE(review): presumably chosen so server-side templates do not clash
    # with a client-side template syntax based on curly braces - confirm.
    jinja_options = dict(
        Flask.jinja_options,
        block_start_string='[%',
        block_end_string='%]',
        variable_start_string='[[',
        variable_end_string=']]',
        comment_start_string='[#',
        comment_end_string='#]',
    )


# Application object, built from CustomFlask so templates use the
# square-bracket Jinja delimiters.
app = CustomFlask(__name__)
# Load the configuration from the file whose path is stored in the
# WEBAPP_SETTINGS environment variable.
app.config.from_envvar("WEBAPP_SETTINGS")