def create_collection(self, collection_name):
    """Create a Milvus collection with an auto-id INT64 primary key and a
    float-vector field, or attach to it if it already exists.

    Returns "OK" on success; exits the process on any failure.
    """
    try:
        if not self.has_collection(collection_name):
            # Fix: the keyword was misspelled 'descrition', so the intended
            # field descriptions were silently dropped.
            field1 = FieldSchema(name="id", dtype=DataType.INT64,
                                 description="int64", is_primary=True,
                                 auto_id=True)
            field2 = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR,
                                 description="float vector",
                                 dim=VECTOR_DIMENSION, is_primary=False)
            schema = CollectionSchema(fields=[field1, field2],
                                      description="collection description")
            self.collection = Collection(name=collection_name, schema=schema)
            LOGGER.debug("Create Milvus collection: {}".format(self.collection))
        else:
            # Collection exists already; just point self.collection at it.
            self.set_collection(collection_name)
        return "OK"
    except Exception as e:
        # Fix: message previously said "load data", which is misleading here.
        LOGGER.error("Failed to create collection in Milvus: {}".format(e))
        sys.exit(1)
def parse_json_response(json_response: Dict) -> List[Dict]:
    """
    Condense a JSON response down to just essential information
    about posts and remove any posts that have already been
    parsed in past requests
    """
    posts = json_response["data"]["children"]
    # Fix: existing_ids() was called once per post inside the loop (one DB
    # query each). Build a set once for O(1) membership tests.
    seen_ids = set(existing_ids())
    new_posts = []
    subreddit = None  # only read when new_posts is non-empty
    for post in posts:
        post = post["data"]
        post_id = post["id"]
        subreddit = post["subreddit"]
        # We haven't seen this post before, parse it and queue it to add to database
        if post_id not in seen_ids:
            new_posts.append({
                "id": post_id,
                "title": post["title"],
                "url": post["url"],
                "subreddit": post["subreddit"],
                "username": post["author"],
                # NOTE(review): fromtimestamp() uses the local timezone —
                # confirm that is intended vs. utcfromtimestamp.
                "created_utc": datetime.fromtimestamp(post["created_utc"]),
            })
    if new_posts:
        inserted = add_new_posts(new_posts)
        LOGGER.debug(f"Added {inserted} new posts to {subreddit}")
    return new_posts
def filter_results(posts: List[Dict], sub: str) -> List[Dict]:
    """Keep posts whose title contains a search term for `sub` and contains
    no reject term; annotate each kept post with its matching terms.

    An empty search list accepts everything; an empty reject list
    rejects nothing.
    """
    search_terms = SEARCH_TERMS[sub]
    reject_terms = REJECT_TERMS[sub]
    filtered = []
    for post in posts:
        # Hoist the lowercased title out of both term loops.
        title = post["title"].lower()
        # Handle if user doesn't specify search or reject keywords
        matches = [] if search_terms else [True]
        rejections = [] if reject_terms else [False]
        # Check for terms that match what we're looking for
        for search_term in search_terms:
            match = search_term.lower() in title
            matches.append(match)
            if match:
                # Idiom: setdefault replaces the get()/else-assign dance.
                post.setdefault("matches", []).append(search_term)
        # Check for terms that we will reject
        for reject_term in reject_terms:
            rejections.append(reject_term.lower() in title)
        accepted = any(matches) and not any(rejections)
        if accepted:
            filtered.append(post)
    LOGGER.debug(
        f"{sub}: Filtered {len(posts) - len(filtered)}/{len(posts)} posts, kept {len(filtered)}"
    )
    return filtered
def __init__(self):
    """Connect to the Milvus server configured by MILVUS_HOST/MILVUS_PORT.

    Exits the process if the connection cannot be established.
    """
    self.collection = None
    try:
        connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
        LOGGER.debug(
            f"Successfully connect to Milvus with IP:{MILVUS_HOST} and PORT:{MILVUS_PORT}"
        )
    except Exception as e:
        LOGGER.error(f"Failed to connect Milvus: {e}")
        sys.exit(1)
def delete_table(self, table_name):
    """Drop `table_name` if it exists; exit the process on failure."""
    # Consistency fix: the sibling helpers (create_mysql_table,
    # load_data_to_mysql, delete_all_data) verify the connection first;
    # this method previously skipped that step.
    self.test_connection()
    # NOTE(review): table_name is interpolated into the SQL (identifiers
    # cannot be parameterized) — ensure it only comes from trusted config.
    sql = "drop table if exists " + table_name + ";"
    try:
        self.cursor.execute(sql)
        LOGGER.debug("MYSQL delete table:{}".format(table_name))
    except Exception as e:
        LOGGER.error("MYSQL ERROR: {} with sql: {}".format(e, sql))
        sys.exit(1)
def create_mysql_table(self, table_name):
    """Create the (milvus_id, image_path) mapping table if it is absent."""
    self.test_connection()
    statement = (
        "create table if not exists " + table_name
        + "(milvus_id TEXT, image_path TEXT);"
    )
    try:
        self.cursor.execute(statement)
        LOGGER.debug(f"MYSQL create table: {table_name} with sql: {statement}")
    except Exception as e:
        LOGGER.error(f"MYSQL ERROR: {e} with sql: {statement}")
        sys.exit(1)
def count(self, collection_name):
    """Return the number of entities stored in `collection_name`."""
    try:
        self.set_collection(collection_name)
        total = self.collection.num_entities
        LOGGER.debug(
            f"Successfully get the num:{total} of the collection:{collection_name}"
        )
        return total
    except Exception as e:
        LOGGER.error(f"Failed to count vectors in Milvus: {e}")
        sys.exit(1)
def delete_collection(self, collection_name):
    """Drop the named collection; returns "ok" or exits on failure."""
    try:
        self.set_collection(collection_name)
        self.collection.drop()
    except Exception as e:
        LOGGER.error(f"Failed to drop collection: {e}")
        sys.exit(1)
    else:
        LOGGER.debug("Successfully drop collection!")
        return "ok"
def create_class_table(self, table_name):
    """Create the (seq_class, gene_family) table if it does not exist."""
    self.test_connection()
    statement = (
        "create table if not exists " + table_name
        + "(seq_class TEXT, gene_family TEXT);"
    )
    try:
        self.cursor.execute(statement)
        LOGGER.debug(f"MYSQL create table: {table_name} with sql: {statement}")
    except Exception as e:
        LOGGER.error(f"MYSQL ERROR: {e} with sql: {statement}")
        sys.exit(1)
def load_data_to_mysql(self, table_name, data):
    """Bulk-insert (milvus_id, image_path) rows into `table_name` and commit."""
    self.test_connection()
    statement = (
        "insert into " + table_name + " (milvus_id,image_path) values (%s,%s);"
    )
    try:
        self.cursor.executemany(statement, data)
        self.conn.commit()
        LOGGER.debug(f"MYSQL loads data to table: {table_name} successfully")
    except Exception as e:
        LOGGER.error(f"MYSQL ERROR: {e} with sql: {statement}")
        sys.exit(1)
def search_by_question(self, question, table_name):
    """Return the stored answer for an exact `question` match.

    Exits the process on any DB error. Note: raises IndexError via the
    generic handler if no row matches (original behavior preserved).
    """
    # Security fix: `question` is user-supplied text and was concatenated
    # straight into the SQL string (injection + quoting bugs). Use a
    # parameterized query; the table identifier cannot be parameterized
    # and must come from trusted config.
    sql = "select answer from " + table_name + " where question = %s;"
    try:
        self.cursor.execute(sql, (question,))
        results = self.cursor.fetchall()
        LOGGER.debug("MYSQL search by question.")
        return results[0][0]
    except Exception as e:
        LOGGER.error("MYSQL ERROR: {} with sql: {}".format(e, sql))
        sys.exit(1)
def delete_all_data(self, table_name):
    """Remove every row from `table_name` and commit the change."""
    self.test_connection()
    statement = 'delete from ' + table_name + ';'
    try:
        self.cursor.execute(statement)
        self.conn.commit()
        LOGGER.debug(f"MYSQL delete all data in table:{table_name}")
    except Exception as e:
        LOGGER.error(f"MYSQL ERROR: {e} with sql: {statement}")
        sys.exit(1)
def count_table(self, table_name):
    """Return the number of rows (non-null milvus_id) in `table_name`."""
    # Consistency fix: sibling helpers verify the connection before
    # executing; this method previously skipped test_connection().
    self.test_connection()
    sql = "select count(milvus_id) from " + table_name + ";"
    try:
        self.cursor.execute(sql)
        results = self.cursor.fetchall()
        LOGGER.debug("MYSQL count table:{}".format(table_name))
        return results[0][0]
    except Exception as e:
        LOGGER.error("MYSQL ERROR: {} with sql: {}".format(e, sql))
        sys.exit(1)
def search_vectors(self, collection_name, vectors, top_k):
    """Run an ANN search over the embedding field and return the raw results.

    Exits the process on failure.
    """
    try:
        self.set_collection(collection_name)
        search_params = {"metric_type": METRIC_TYPE, "params": {"nprobe": 16}}
        res = self.collection.search(vectors, anns_field="embedding",
                                     param=search_params, limit=top_k)
        # Fix: removed a leftover debug print(res[0]) and dead commented-out
        # code; the log message claimed to name the collection but actually
        # dumped the result object — log the collection name as stated.
        LOGGER.debug("Successfully search in collection: {}".format(collection_name))
        return res
    except Exception as e:
        LOGGER.error("Failed to search vectors in Milvus: {}".format(e))
        sys.exit(1)
def search_by_milvus_ids(self, ids, table_name):
    """Return image paths for the given Milvus ids, preserving the order
    of `ids` via ORDER BY FIELD."""
    id_list = str(ids).replace('[', '').replace(']', '')
    statement = (
        "select image_path from " + table_name
        + " where milvus_id in (" + id_list
        + ") order by field (milvus_id," + id_list + ");"
    )
    try:
        self.cursor.execute(statement)
        rows = self.cursor.fetchall()
        paths = [row[0] for row in rows]
        LOGGER.debug("MYSQL search by milvus id.")
        return paths
    except Exception as e:
        LOGGER.error(f"MYSQL ERROR: {e} with sql: {statement}")
        sys.exit(1)
def delete_collection(self, collection_name):
    """Drop a collection through the legacy Milvus client.

    Returns the client status on success; exits the process on failure.
    """
    try:
        status = self.client.drop_collection(collection_name=collection_name)
        # A non-zero status code signals a server-side failure.
        if status.code:
            raise Exception(status.message)
        LOGGER.debug(f"Successfully drop collection: {collection_name}")
        return status
    except Exception as e:
        LOGGER.error(f"Failed to drop collection: {e}")
        sys.exit(1)
def insert(self, collection_name, vectors):
    """Insert `vectors` into the collection (creating it if needed), load it
    into memory, and return the auto-generated primary keys."""
    try:
        self.create_collection(collection_name)
        result = self.collection.insert([vectors])
        primary_keys = result.primary_keys
        self.collection.load()
        LOGGER.debug(
            f"Insert vectors to Milvus in collection: {collection_name} with {len(vectors)} rows"
        )
        return primary_keys
    except Exception as e:
        LOGGER.error(f"Failed to load data to Milvus: {e}")
        sys.exit(1)
def do_count(table_name, milvus_cli, mysql_cli):
    """Return the Milvus entity count for `table_name` (default table when
    falsy), logging both the Milvus and MySQL counts.

    Returns None if the collection does not exist; exits on error.
    """
    table_name = table_name or DEFAULT_TABLE
    try:
        if not milvus_cli.has_collection(table_name):
            return None
        milvus_num = milvus_cli.count(table_name)
        mysql_num = mysql_cli.count_table(table_name)
        LOGGER.debug(f"The num of Milvus: {milvus_num} and Mysql: {mysql_num}")
        return milvus_num
    except Exception as e:
        LOGGER.error(f" Error with count table {e}")
        sys.exit(1)
def count(self, collection_name):
    """Return the entity count via the legacy client; exit on failure."""
    try:
        status, num = self.client.count_entities(collection_name=collection_name)
        # Non-zero status code means the server rejected the request.
        if status.code:
            raise Exception(status.message)
        LOGGER.debug(
            f"Successfully get the num:{num} of the collection:{collection_name}"
        )
        return num
    except Exception as e:
        LOGGER.error(f"Failed to count vectors in Milvus: {e}")
        sys.exit(1)
def create_index(self, collection_name):
    """Build an IVF_SQ8 index on the embedding field of `collection_name`."""
    try:
        self.set_collection(collection_name)
        default_index = {
            "index_type": "IVF_SQ8",
            "metric_type": METRIC_TYPE,
            "params": {"nlist": 16384},
        }
        status = self.collection.create_index(field_name="embedding",
                                              index_params=default_index)
        if status.code:
            raise Exception(status.message)
        LOGGER.debug(
            f"Successfully create index in collection:{collection_name} with param:{default_index}"
        )
        return status
    except Exception as e:
        LOGGER.error(f"Failed to create index: {e}")
        sys.exit(1)
def insert(self, collection_name, vectors):
    """Insert records via the legacy client and return their ids.

    Exits the process on failure.
    """
    try:
        # The helper really is named 'create_colllection' (sic) — see its
        # definition; do not "fix" the spelling here alone.
        self.create_colllection(collection_name)
        status, ids = self.client.insert(collection_name=collection_name,
                                         records=vectors)
        if status.code:
            raise Exception(status.message)
        LOGGER.debug(
            f"Insert vectors to Milvus in collection: {collection_name} with {len(vectors)} rows"
        )
        return ids
    except Exception as e:
        LOGGER.error(f"Failed to load data to Milvus: {e}")
        sys.exit(1)
def create_index(self, collection_name):
    """Create an IVF_FLAT index through the legacy client; exit on failure."""
    try:
        index_param = {'nlist': 16384}
        status = self.client.create_index(collection_name,
                                          IndexType.IVF_FLAT,
                                          index_param)
        if status.code:
            raise Exception(status.message)
        LOGGER.debug(
            f"Successfully create index in collection:{collection_name} with param:{index_param}"
        )
        return status
    except Exception as e:
        LOGGER.error(f"Failed to create index: {e}")
        sys.exit(1)
def create_index(self, collection_name, index_params):
    """Build an index on the embedding field using caller-supplied
    `index_params`, then load the collection into memory."""
    try:
        self.set_collection(collection_name)
        status = self.collection.create_index(field_name="embedding",
                                              index_params=index_params)
        if status.code:
            raise Exception(status.message)
        # Load only after the index was accepted.
        self.collection.load()
        LOGGER.debug(
            f"Successfully create index in collection:{collection_name} with param:{index_params}"
        )
        return status
    except Exception as e:
        LOGGER.error(f"Failed to create index: {e}")
        sys.exit(1)
def create_colllection(self, collection_name):
    """Create the collection with default parameters if it is missing.

    NOTE: the method name keeps the historical 'colllection' spelling
    because callers (e.g. insert) reference it by this exact name.
    """
    try:
        if self.has_collection(collection_name):
            return
        collection_param = {
            'collection_name': collection_name,
            'dimension': VECTOR_DIMENSION,
            'index_file_size': INDEX_FILE_SIZE,
            'metric_type': METRIC_TYPE,
        }
        status = self.client.create_collection(collection_param)
        if status.code != 0:
            raise Exception(status.message)
        LOGGER.debug(f"Create Milvus collection: {collection_name}")
    except Exception as e:
        LOGGER.error(f"Failed to load data to Milvus: {e}")
        sys.exit(1)
def search_vectors(self, collection_name, vectors, top_k):
    """Run a top-k vector search through the legacy client and return the
    result set; exits the process on failure."""
    try:
        search_param = {'nprobe': 16}
        status, result = self.client.search(
            collection_name=collection_name,
            query_records=vectors,
            top_k=top_k,
            params=search_param,
        )
        if status.code:
            raise Exception(status.message)
        LOGGER.debug(f"Successfully search in collection: {collection_name}")
        return result
    except Exception as e:
        LOGGER.error(f"Failed to search vectors in Milvus: {e}")
        sys.exit(1)
def send_email(subject: str, message: str) -> None:
    """Send `message` with `subject` from SENDER_EMAIL to RECEIVER_EMAIL
    over SMTPS (gmail, port 465).

    Authentication failures are logged; other SMTP errors propagate to the
    caller (original behavior preserved).
    """
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = SENDER_EMAIL
    msg["To"] = RECEIVER_EMAIL
    msg.set_content(message)
    try:
        # Fix: the connection was only closed on the success path; if
        # login() or send_message() raised, the socket leaked. The context
        # manager guarantees cleanup on every path. Also dropped the unused
        # `authorization` / `send_message` locals.
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.ehlo()
            server.login(SENDER_EMAIL, PASSWORD)
            server.send_message(msg)
        LOGGER.debug(
            f"Successfully sent email notification to {RECEIVER_EMAIL} from {SENDER_EMAIL}"
        )
    except smtplib.SMTPAuthenticationError as auth_error:
        LOGGER.exception(auth_error.smtp_error)
def grab_latest() -> List:
    """
    Grab the latest [LIMIT] posts from subreddits inside [SUBREDDITS] and
    filter them down to just a handful of relevant fields and remove posts
    that have already been parsed in the past.
    """
    # TODO: async requests, cuz fuggit
    posts = {}
    for sub in SUBREDDITS:
        url = _get_subreddit_url(sub)
        LOGGER.debug(f"Querying: {url}")
        response = get(url, headers={"User-Agent": USER_AGENT})
        try:
            new_posts = parse_json_response(response.json())
        except Exception:
            LOGGER.exception(
                f"Request to {url} failed with code {response.status_code}: {response.reason}"
            )
            # Fix: without this `continue`, a failed first request left
            # new_posts unbound (NameError below), and later failures
            # silently reused the previous subreddit's posts.
            continue
        # We got some posts we haven't seen before. Let's filter through them
        if new_posts:
            posts[sub] = filter_results(new_posts, sub)
    subject = format_subject(posts)
    message = format_response(posts)
    if not message:
        LOGGER_RESULTS.info("No new posts.")
        return
    LOGGER_RESULTS.info(f"\n{message}")
    if EMAIL_NOTIFICATIONS:
        send_email(subject, message)
from sqlite3 import connect
from config import DB_NAME, ROOT_DIR, IN_MEMORY
from logs import LOGGER
from typing import List, Tuple, Dict

# check_same_thread=False: the connection is shared across threads
# (presumably the scheduler thread — verify against the caller).
if IN_MEMORY:
    CONNECTION = connect(":memory:", check_same_thread=False)
    LOGGER.debug("Running with in-memory database")
else:
    CONNECTION = connect(f"{ROOT_DIR}/{DB_NAME}", check_same_thread=False)

CURSOR = CONNECTION.cursor()

# Initialize tables and clear posts older than ten days
CURSOR.execute(
    """
    CREATE TABLE IF NOT EXISTS posts (
        id text PRIMARY KEY,
        title text,
        url text,
        subreddit text,
        username text,
        create_date text);
    """
)
CURSOR.execute(
    """
    DELETE FROM posts
    WHERE create_date <= date('now', '-10 day')
    """
)
# Fix: sqlite3 wraps DML in an implicit transaction; without an explicit
# commit the ten-day purge could be rolled back if the process exits
# before some later commit happens elsewhere.
CONNECTION.commit()
# We got some posts we haven't seen before. Let's filter through them if new_posts: posts[sub] = filter_results(new_posts, sub) subject = format_subject(posts) message = format_response(posts) if not message: LOGGER_RESULTS.info("No new posts.") return LOGGER_RESULTS.info(f"\n{message}") if EMAIL_NOTIFICATIONS: send_email(subject, message) def _get_subreddit_url(subreddit: str) -> str: return f"https://www.reddit.com/r/{subreddit}/new/.json?limit={LIMIT}" if __name__ == "__main__": LOGGER.debug( f"App started, configured to run every {FREQUENCY * 60} minutes.\n" f"Running for subreddits: {SUBREDDITS} with following matching terms:\n" f"\tsearch: {SEARCH_TERMS}\n" f"\treject: {REJECT_TERMS}\n" f"Notifications being sent to {RECEIVER_EMAIL} and stored to 'results.log'." ) grab_latest() SCHED.start()