"""Entry-point script: enqueue one demo Celery task and log its result."""
from celery_base import task
from random import random
from docker_logs import get_logger

logging = get_logger("runner")

# Fire the task with a random float payload, then block on the async
# result for at most 10 seconds.
pending = task.delay(random())
result = pending.get(timeout=10)
logging.info(f"Task returned: {result}")
"""Shared Celery application: queue routing and serialization config.

NOTE(review): this view appears truncated -- other modules import
``reddit`` and ``influxdb_client`` from ``celery_base``, so the file
presumably continues past what is visible here.
"""
import os
import praw
from pymagnitude import Magnitude
from celery import Celery
from influxdb import InfluxDBClient
from docker_logs import get_logger

logging = get_logger("celery-base")

app = Celery()
app.conf.update({
    # Route each task to the queue owned by the worker type that runs it.
    'task_routes': {
        'get_subreddit': {
            'queue': 'scraper'
        },
        'get_submission': {
            'queue': 'scraper'
        },
        'put_embeddings': {
            'queue': 'embedder'
        },
        'send_to_mongo': {
            'queue': 'mongo'
        }
    },
    # SECURITY: pickle deserialization executes arbitrary code on load;
    # this is only safe while the message broker is fully trusted.
    'task_serializer': 'pickle',
    'result_serializer': 'pickle',
    'accept_content': ['pickle']
})
"""Embedding worker.""" from celery import Celery from docker_logs import get_logger from mongodb_worker import save_submission import fasttext from nltk.tokenize import WordPunctTokenizer import numpy as np logging = get_logger("embedder") app = Celery('celery_base', broker='amqp://localhost//', backend='amqp') tokenizer = WordPunctTokenizer() model_name = 'dbpedia.bin' ft_model = fasttext.load_model(model_name) @app.task(bind=True) def embedding(self, work, submissions): """Embedds given submissions texts.""" total = sum([len(s["comments"]) for s in submissions]) logging.info(f'{work}: Submissions {len(submissions)} embedded' f' with total {total} comments') for submission in submissions: subm_vectors = [ ft_model[token] for token in tokenizer.tokenize(submission['text']) ] for comment in submission['comments']: subm_vectors.extend( [ft_model[token] for token in tokenizer.tokenize(comment)]) if len(subm_vectors) > 0:
"""Spark driver: builds a local session and connects to MongoDB."""
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator, \
    MulticlassClassificationEvaluator
from docker_logs import get_logger
from utils import get_class_distribution
from pipelines import get_linear_regression_pipeline, \
    get_binary_classification_pipeline, get_multi_classification_pipeline

logger = get_logger("app-spark")

# Local-mode Spark; ERROR level cuts the driver's console noise.
conf = SparkConf().setAppName('app-spark').setMaster('local')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

spark = SparkSession \
    .builder \
    .appName("app-spark") \
    .getOrCreate()

# MongoDB connection URI assembled from container environment variables.
mongo_uri = ''.join([
    'mongodb://',
    os.environ['MONGO_INITDB_ROOT_USERNAME'],
    ':',
    os.environ['MONGO_INITDB_ROOT_PASSWORD'],
    # NOTE(review): source truncated here -- the URI continues past this view.
"""DB worker task.""" from worker import app from docker_logs import get_logger from pymongo import MongoClient logging = get_logger("mongo_db_task") logging.propagate = False client = MongoClient('mongodb:27017') db = client.tweetmldb @app.task(bind=True, name='mongo_task', queue='mongo') def mongo_task(self, collection): """Saves new tweets in mongo db.""" logging.info(f"DB ISERTION FIRED ") if len(collection) > 0: db.posts.insert_many(convert_objects_to_dicts(collection)) logging.info(f"DB SIZE: {db.posts.count()} ") # client.close() def convert_objects_to_dicts(collection): """Converts objects to dictionaries.""" results = [] for tweet in collection: results.append(tweet.__dict__) return results
"""Embedding worker.""" from worker import app from docker_logs import get_logger from pymagnitude import Magnitude import gensim import numpy as np from mongo_task import mongo_task logging = get_logger("embedding_task") logging.propagate = False def preproces_text(text): """Text splitted to tokens.""" return gensim.utils.simple_preprocess(text) def get_sentences_representation(vectors, splitted_sentence): """Counts average embedding.""" length = 0 av_sum = np.zeros(shape=(100, )) for i in range(len(splitted_sentence)): if splitted_sentence[i] in vectors: av_sum = av_sum + vectors.query(splitted_sentence[i]) length += 1 if length > 0: av_sum = av_sum / length return av_sum def get_text_embedding(vectors, text):
"""Demo Celery worker exposing a single echo-style task."""
from celery import Celery
from docker_logs import get_logger

logging = get_logger("task")
app = Celery()


@app.task(bind=True, name='task')
def task(self, param):
    """Log the received parameter and return a result string echoing it."""
    message = f"Celery task executed with param: {param}"
    logging.info(message)
    return f"Result of task for param {param}"
"""Sample pySpark app.""" from docker_logs import get_logger from pyspark.sql import SparkSession from pyspark.ml.linalg import Vectors from pyspark.sql import Row from pyspark.sql import functions as F from pyspark.ml.feature import QuantileDiscretizer from logistic_regression import logistic_regression from binary_classification import binary_classification from multi_class_classification import multi_class_classification logging = get_logger('spark_worker') spark = SparkSession.builder.appName('MyModels').\ config('spark.mongodb.input.uri', 'mongodb://mongodb:27017/reddits.submissions').\ getOrCreate() def load(dev): """Loads the submissions from MongoDB database.""" logging.info('Loading submissions...') df = dev.read.format('com.mongodb.spark.sql.DefaultSource').load() df.createOrReplaceTempView('submissions') df.printSchema() query = 'select score, upvote_ratio, is_nfsw, text_embedded from \
"""Mongodb management worker.""" import os from celery import Celery from docker_logs import get_logger from pymongo import MongoClient logging = get_logger("mongodb_worker") app = Celery('celery_base', broker='amqp://localhost//', backend='amqp') mongo_client = MongoClient(host=os.environ['MONGODB_HOST'], port=int(os.environ['MONGODB_PORT'])) @app.task(bind=True) def save_submission(self, work, submission): """Saves submission do Mongo database.""" try: db = mongo_client.reddits col = db.submissions s_id = col.insert(submission) logging.info(f'{work}: Submission saved into MongoDB: {s_id}') except Exception as e: logging.error(f'{work}: MongoDB saving error: {e}')
"""Periodic Reddit scraper worker (view truncated below)."""
import time
from datetime import datetime, timedelta
import os
import influxdb
import prawcore
from celery import signature
from celery_base import influxdb_client, reddit, app
from docker_logs import get_logger
from data_models import RedditSubmission

logging = get_logger("worker-scraper")


@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    # Re-scrape the configured subreddit every `frequency_s` seconds.
    sender.add_periodic_task(int(os.environ['frequency_s']),
                             get_subreddit.s(),
                             name='subreddits')


@app.task(bind=True, name='get_subreddit')
def get_subreddit(self):
    """Collect submissions from the last polling window.

    NOTE(review): source truncated here -- the rest of the body is
    outside this view.
    """
    subreddit_name = os.environ['subreddit']
    time_diff = int(os.environ['frequency_s'])
    # Window: [now - frequency_s, now] in naive UTC.
    current_time = datetime.utcnow()
    time_lower_bound = current_time - timedelta(seconds=time_diff)
    new_submissions = []
    json_metrics = []
"""Test worker task. Will be removed.""" import requests from worker import app from docker_logs import get_logger logging = get_logger("time_log_task") logging.propagate = False @app.task(bind=True, name='time_task', queue='time') def scrap_tweets_from_location(self): """Logs time.""" r = requests.get('http://webapp:5000/since') logging.info(f"CURRENT TIME {r.text} ")
"""Celery worker that attaches word2vec embeddings to Reddit submissions."""
from pymagnitude import Magnitude
import numpy as np
from celery_base import app
from data_models import RedditSubmission
from docker_logs import get_logger

logger = get_logger("worker-embedder")


@app.task(bind=True, name='put_embeddings')
def put_embeddings(self, rSubmission: RedditSubmission):
    """Set the title (and, when present, body) mean word2vec embedding.

    The submission is mutated in place and returned so the task result
    can be forwarded to the next worker in the chain.
    """
    # NOTE(review): the vector store is opened on every invocation;
    # presumably acceptable for the "light" vectors -- worth confirming.
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')

    def _mean_embedding(text):
        # Average the per-token vectors of a whitespace-split text.
        return np.mean(vecs.query(text.split()), axis=0)

    rSubmission.post_title_embedding = _mean_embedding(rSubmission.post_title)
    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = _mean_embedding(
            rSubmission.post_text)
    return rSubmission
"""Celery worker that persists Reddit submissions to MongoDB."""
import os
from dataclasses import asdict
import pymongo
from celery_base import app
from data_models import RedditSubmission
from docker_logs import get_logger

logging = get_logger("worker-mongo")

# MongoDB URI assembled from container environment variables.
mongo_uri = ''.join([
    os.environ['mongodb_protocol'],
    '://',
    os.environ['MONGO_INITDB_ROOT_USERNAME'],
    ':',
    os.environ['MONGO_INITDB_ROOT_PASSWORD'],
    '@',
    os.environ['mongodb_host'],
    ':',
    os.environ['mongodb_port']
])

# https://pymongo.readthedocs.io/en/stable/faq.html#is-pymongo-fork-safe
# myclient = pymongo.MongoClient(mongo_uri)
# mydb = myclient["reddit"]
# mycol = mydb["submissions"]


@app.task(bind=True, name='send_to_mongo')
def send_to_mongo(self, rSubmission: RedditSubmission):
    # Client is created per invocation: pymongo clients are not fork-safe
    # across Celery's prefork workers (see FAQ link above).
    myclient = pymongo.MongoClient(mongo_uri)
    mydb = myclient["reddit"]
    mycol = mydb["submissions"]
    rSubmission.post_title_embedding = list(
    # NOTE(review): source truncated here -- remainder is outside this view.
"""Scheduling worker.""" import os from celery_base import app from docker_logs import get_logger logging = get_logger("worker") app.conf.beat_schedule = { 'work_every_n_minutes_new': { 'task': 'celery_base.submissions', 'schedule': float(os.environ['CELERYBEAT_MINUTES_INTERVAL']) * 60.0, 'args': ('worker', 'AskReddit', 50, 'new', float(os.environ['CELERYBEAT_MINUTES_INTERVAL']) * 60.0) } } app.conf.timezone = 'UTC'