# Imports assumed from the surrounding module; AWS_ACCESS_KEY, AWS_SECRET_KEY
# and download_queue are module-level settings defined elsewhere in the example.
from boto.s3.bucket import Bucket
from boto.s3.connection import S3Connection

from mrq.context import setup_context
from mrq.job import queue_job


def initialize_jobs(bucket_name):
    setup_context()
    jobs_count = 0

    # List every key in the S3 bucket and queue one Download job per key
    conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    bucket = Bucket(connection=conn, name=bucket_name)

    for key in bucket.list():
        queue_job("tasks.Download", {
            "bucket_name": bucket_name,
            "key_name": key.key
        }, queue=download_queue)
        jobs_count += 1

    return jobs_count
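# Usage sketch: "my-bucket" is a hypothetical bucket name, and the module-level
# AWS credentials and download_queue above are assumed to be configured.
if __name__ == "__main__":
    queued = initialize_jobs("my-bucket")
    print("Queued %d download jobs" % queued)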
import unittest

from mrq.context import setup_context, queue_raw_jobs

setup_context()


class TestSetTask(unittest.TestCase):

    def setUp(self):
        """List of URLs as a payload for the set task.

        Duplicate values will not be added due to the Redis Set datatype.
        """
        self.urls = [
            ['https://contentstudio.io'],
            ['https://d4interactive.io'],
            ['https://techcrunch.com'],
            ['https://mashable.com'],
            ['https://mashable.com']
        ]
        self.urls_key_value = [
            {'url': 'https://techcrunch.com'},
            {'url': 'https://mashable.com'},
            {'url': 'https://techcrunch.com'},
        ]
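    # A possible test method for the fixtures above; a sketch only, assuming a
    # raw "set" queue named "urls_set" is declared in the project's MRQ queue
    # config so that duplicate URLs collapse inside the Redis set.
    def test_queue_set_urls(self):
        for batch in self.urls:
            queue_raw_jobs("urls_set", batch)
        # A real test would go on to assert that only 4 distinct URLs ended up
        # in the underlying set.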
Bootstrap(app)

app.config.update({
    "DEBUG": DEBUG
})

db = mongoengine.MongoEngine(app)

app.register_blueprint(social_auth)
init_social(app, db)
app.context_processor(backends)

login_manager = login.LoginManager()
login_manager.init_app(app)

if not get_current_config():
    setup_context()


@app.route("/data/facebook/albums")
@login.login_required
def data_facebook_albums():
    return json.dumps(g.user.get_facebook_albums())


@app.route("/create_job", methods=["POST"])
# @login.login_required
def create_job():
    taskpath = request.form['path']
    taskparams = json.loads(request.form['params'])

    if taskpath.startswith("admin"):
from mrq import context
from mrq.job import queue_job

context.setup_context()

result = queue_job("crawler.Fetch", {
    "url": "http://docs.python-requests.org",
    "from": "whatever.com"
}, queue="crawl")

print(result)
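# A bulk variant, queue_jobs (also used by the stock example below), takes a
# list of parameter dicts; a minimal sketch assuming the same "crawl" queue,
# with a purely illustrative second URL:
from mrq.job import queue_jobs

job_ids = queue_jobs("crawler.Fetch", [
    {"url": "http://docs.python-requests.org", "from": "whatever.com"},
    {"url": "http://www.python.org", "from": "whatever.com"},
], queue="crawl")

print(job_ids)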
stock_parser = subparsers.add_parser('stock', help='Stock Kasirga Job')
stock_parser.add_argument('--goods', dest='goods', action='store', required=True,
                          help='JSON type parameters')
stock_parser.add_argument('--action', dest='action', action='store', required=False,
                          default="increase", help='string increase/decrease')


def order(arguments):
    prm = {
        "product": arguments.product,
        "quantity": arguments.quantity,
    }
    # queue_jobs expects a list of parameter dicts (`job` is presumably the
    # mrq.job module, imported earlier in the script)
    all_ids = job.queue_jobs("tasks.stock.Stock", [prm], queue="low")
    return all_ids


def stock(arguments):
    goods = json.loads(arguments.goods)
    goods.update({"action": arguments.action})
    all_ids = job.queue_jobs("tasks.stock.Stock", [goods], queue="low")
    return all_ids


if __name__ == '__main__':
    args = parser.parse_args()
    if args.operation == 'order':
        setup_context(file_path=BASEPATH + '/config/config.py', config_type='run')
        order(args)
    elif args.operation == 'stock':
        setup_context(file_path=BASEPATH + '/config/config.py', config_type='run')
        stock(args)
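# Direct-call sketch, bypassing the command line: argparse.Namespace stands in
# for the parsed arguments, and the JSON payload is illustrative only.
import argparse

demo_args = argparse.Namespace(goods='{"product": "apple", "quantity": 3}',
                               action="decrease")
setup_context(file_path=BASEPATH + '/config/config.py', config_type='run')
print(stock(demo_args))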
# Imports and class header assumed from the surrounding crawler module
# (Python 2-style urlparse); the "crawler.Fetch" task path queued above
# implies the class is named Fetch.
import datetime
import re
import urlparse

import lxml.html
import requests

from mrq import context
from mrq.context import connections, log
from mrq.job import queue_job
from mrq.task import Task


class Fetch(Task):

    def run(self, params):

        context.setup_context()

        collection = connections.mongodb_jobs.simple_crawler_urls

        response = requests.get(params["url"])

        if response.status_code != 200:
            log.warning("Got status %s on page %s (Queued from %s)" % (
                response.status_code, response.url, params.get("from")))
            return False

        # Store redirects
        if response.url != params["url"]:
            collection.update({"_id": params["url"]}, {"$set": {
                "redirected_to": response.url,
                "fetched_date": datetime.datetime.now()
            }})

        document = lxml.html.fromstring(response.content)
        document.make_links_absolute(response.url)

        queued_count = 0
        document_domain = urlparse.urlparse(response.url).netloc

        for (element, attribute, link, pos) in document.iterlinks():

            # Strip fragments and skip empty links
            link = re.sub("#.*", "", link or "")
            if not link:
                continue

            # Don't follow external links for this example
            domain = urlparse.urlparse(link).netloc
            if domain != document_domain:
                continue

            # We don't want to queue URLs twice. If we try to insert a
            # duplicate _id, pymongo will throw an error
            try:
                collection.insert({"_id": link})
            except:
                continue

            queue_job("crawler.Fetch", {
                "url": link,
                "from": params["url"]
            }, queue="crawl")
            queued_count += 1

        stored_data = {
            "_id": response.url,
            "queued_urls": queued_count,
            "html_length": len(response.content),
            "fetched_date": datetime.datetime.now()
        }

        collection.update({"_id": response.url}, stored_data, upsert=True)

        return True
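# Once a few Fetch jobs have been processed (for instance by an MRQ worker
# consuming the "crawl" queue), the stored pages can be inspected from the same
# collection; a sketch using the fields written by the task above:
from mrq.context import connections, setup_context

setup_context()

for doc in connections.mongodb_jobs.simple_crawler_urls.find().limit(5):
    print("%s queued=%s length=%s" % (
        doc["_id"], doc.get("queued_urls"), doc.get("html_length")))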