Пример #1
0
 def create_collection(self, collection_name):
     """Create the Milvus collection ``collection_name`` unless it exists.

     A new collection is built from two fields -- an auto-generated INT64
     primary key ``id`` and a ``FLOAT_VECTOR`` field ``embedding`` with
     ``VECTOR_DIMENSION`` dimensions -- and stored on ``self.collection``.
     When the collection already exists, ``self.set_collection`` is called
     for it instead.

     Args:
         collection_name: Name of the collection to create or select.

     Returns:
         The string ``"OK"``.

     Any exception is logged and the process exits with status 1.
     """
     try:
         if not self.has_collection(collection_name):
             field1 = FieldSchema(name="id",
                                  dtype=DataType.INT64,
                                  description="int64",
                                  is_primary=True,
                                  auto_id=True)
             field2 = FieldSchema(name="embedding",
                                  dtype=DataType.FLOAT_VECTOR,
                                  description="float vector",
                                  dim=VECTOR_DIMENSION,
                                  is_primary=False)
             schema = CollectionSchema(fields=[field1, field2],
                                       description="collection description")
             self.collection = Collection(name=collection_name,
                                          schema=schema)
             LOGGER.debug("Create Milvus collection: {}".format(
                 self.collection))
         else:
             self.set_collection(collection_name)
         return "OK"
     except Exception as e:
         LOGGER.error("Failed to create collection in Milvus: {}".format(e))
         sys.exit(1)
Пример #2
0
def parse_json_response(json_response: Dict) -> List[Dict]:
    """
    Condense a JSON response down to just essential information about
    posts and remove any posts that have already been parsed in past requests.

    Posts whose id is not reported by ``existing_ids()`` are reduced to the
    keys ``id``, ``title``, ``url``, ``subreddit``, ``username`` and
    ``created_utc`` and handed to ``add_new_posts()``.

    Args:
        json_response: Decoded listing; posts are read from
            ``json_response["data"]["children"]``.

    Returns:
        The list of condensed posts that were not seen before (possibly empty).
    """
    posts = json_response["data"]["children"]
    # Fetch the known ids once per response instead of once per post, and skip
    # the lookup entirely for an empty listing.
    # NOTE(review): assumes existing_ids() yields hashable ids -- confirm.
    known_ids = set(existing_ids()) if posts else set()
    new_posts = []

    for post in posts:
        post = post["data"]
        post_id = post["id"]
        # Kept for the log line below; holds the subreddit of the last post read.
        subreddit = post["subreddit"]

        # We have seen this post before, nothing to do
        if post_id in known_ids:
            continue

        # New post: parse it and queue it to add to database
        new_posts.append({
            "id": post_id,
            "title": post["title"],
            "url": post["url"],
            "subreddit": post["subreddit"],
            "username": post["author"],
            "created_utc": datetime.fromtimestamp(post["created_utc"]),
        })

    if new_posts:
        inserted = add_new_posts(new_posts)
        LOGGER.debug(f"Added {inserted} new posts to {subreddit}")

    return new_posts
Пример #3
0
def filter_results(posts: List[Dict], sub: str) -> List[Dict]:
    """
    Keep the posts whose title contains at least one search term and no
    reject term configured for ``sub`` (case-insensitive substring checks).

    An empty search list accepts every title; an empty reject list rejects
    none. Every search term found in a title is recorded on the post itself
    under the ``"matches"`` key.

    Args:
        posts: Posts to examine; each must provide a ``"title"``.
        sub: Key into ``SEARCH_TERMS`` and ``REJECT_TERMS``.

    Returns:
        The accepted posts, in their original order.
    """
    wanted_terms = SEARCH_TERMS[sub]
    unwanted_terms = REJECT_TERMS[sub]

    filtered = []

    for post in posts:
        # No configured search terms means "everything matches"
        hit_flags = [True] if not wanted_terms else []
        for term in wanted_terms:
            found = term.lower() in post["title"].lower()
            hit_flags.append(found)
            if not found:
                continue
            # Remember which terms made this post interesting
            if post.get("matches"):
                post["matches"].append(term)
            else:
                post["matches"] = [term]

        # No configured reject terms means "nothing is rejected"
        veto_flags = [False] if not unwanted_terms else []
        veto_flags.extend(
            [term.lower() in post["title"].lower() for term in unwanted_terms]
        )

        if any(hit_flags) and not any(veto_flags):
            filtered.append(post)

    LOGGER.debug(
        f"{sub}: Filtered {len(posts) - len(filtered)}/{len(posts)} posts, kept {len(filtered)}"
    )
    return filtered
Пример #4
0
 def __init__(self):
     """Connect to Milvus at ``MILVUS_HOST``:``MILVUS_PORT``.

     ``self.collection`` starts out as ``None``. A failure is logged and the
     process exits with status 1.
     """
     try:
         self.collection = None
         connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
         connected_msg = "Successfully connect to Milvus with IP:{} and PORT:{}".format(
             MILVUS_HOST, MILVUS_PORT)
         LOGGER.debug(connected_msg)
     except Exception as err:
         LOGGER.error("Failed to connect Milvus: {}".format(err))
         sys.exit(1)
Пример #5
0
 def delete_table(self, table_name):
     """Drop the MySQL table ``table_name`` if it exists.

     ``table_name`` is concatenated into the statement as-is, so it must come
     from a trusted source. A failure is logged together with the statement
     and the process exits with status 1.
     """
     sql = "".join(("drop table if exists ", table_name, ";"))
     try:
         self.cursor.execute(sql)
         dropped_msg = "MYSQL delete table:{}".format(table_name)
         LOGGER.debug(dropped_msg)
     except Exception as err:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(err, sql))
         sys.exit(1)
Пример #6
0
 def create_mysql_table(self, table_name):
     """Create ``table_name`` with TEXT columns ``milvus_id`` and ``image_path``.

     ``self.test_connection()`` is called first and the statement uses
     ``if not exists``. ``table_name`` is concatenated into the statement
     as-is, so it must come from a trusted source. A failure is logged and
     the process exits with status 1.
     """
     self.test_connection()
     sql = "".join((
         "create table if not exists ",
         table_name,
         "(milvus_id TEXT, image_path TEXT);",
     ))
     try:
         self.cursor.execute(sql)
         created_msg = "MYSQL create table: {} with sql: {}".format(table_name, sql)
         LOGGER.debug(created_msg)
     except Exception as err:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(err, sql))
         sys.exit(1)
Пример #7
0
 def count(self, collection_name):
     """Return ``num_entities`` of the collection ``collection_name``.

     ``self.set_collection(collection_name)`` is called first and the value
     is read from ``self.collection``. A failure is logged and the process
     exits with status 1.
     """
     try:
         self.set_collection(collection_name)
         entity_total = self.collection.num_entities
         counted_msg = "Successfully get the num:{} of the collection:{}".format(
             entity_total, collection_name)
         LOGGER.debug(counted_msg)
         return entity_total
     except Exception as err:
         LOGGER.error("Failed to count vectors in Milvus: {}".format(err))
         sys.exit(1)
Пример #8
0
 def delete_collection(self, collection_name):
     """Drop the collection ``collection_name``.

     ``self.set_collection(collection_name)`` is called first, then
     ``self.collection.drop()``.

     Returns:
         The string ``"ok"``.

     A failure is logged and the process exits with status 1.
     """
     try:
         self.set_collection(collection_name)
         self.collection.drop()
     except Exception as err:
         LOGGER.error("Failed to drop collection: {}".format(err))
         sys.exit(1)
     else:
         try:
             LOGGER.debug("Successfully drop collection!")
             return "ok"
         except Exception as err:
             LOGGER.error("Failed to drop collection: {}".format(err))
             sys.exit(1)
Пример #9
0
 def create_class_table(self, table_name):
     """Create ``table_name`` with TEXT columns ``seq_class`` and ``gene_family``.

     ``self.test_connection()`` is called first and the statement uses
     ``if not exists``. ``table_name`` is concatenated into the statement
     as-is, so it must come from a trusted source. A failure is logged and
     the process exits with status 1.
     """
     self.test_connection()
     sql = "".join((
         "create table if not exists ",
         table_name,
         "(seq_class TEXT, gene_family TEXT);",
     ))
     try:
         self.cursor.execute(sql)
         created_msg = "MYSQL create table: {} with sql: {}".format(table_name, sql)
         LOGGER.debug(created_msg)
     except Exception as err:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(err, sql))
         sys.exit(1)
Пример #10
0
 def load_data_to_mysql(self, table_name, data):
     """Bulk-insert ``(milvus_id, image_path)`` rows into ``table_name``.

     ``self.test_connection()`` is called first; ``data`` is passed to
     ``cursor.executemany`` and the insert is committed on ``self.conn``.
     ``table_name`` is concatenated into the statement as-is. A failure is
     logged and the process exits with status 1.
     """
     self.test_connection()
     sql = "".join((
         "insert into ",
         table_name,
         " (milvus_id,image_path) values (%s,%s);",
     ))
     try:
         self.cursor.executemany(sql, data)
         self.conn.commit()
         loaded_msg = "MYSQL loads data to table: {} successfully".format(table_name)
         LOGGER.debug(loaded_msg)
     except Exception as err:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(err, sql))
         sys.exit(1)
Пример #11
0
 def search_by_question(self, question, table_name):
     """Return the ``answer`` stored for an exact match of ``question``.

     Args:
         question: Question text; bound as a query parameter so quotes or
             SQL fragments inside it cannot alter the statement.
         table_name: Table to query. Identifiers cannot be bound as
             parameters, so this value is concatenated as-is and must come
             from a trusted source.

     Returns:
         The first column of the first matching row.

     Any exception -- including the ``IndexError`` raised when no row
     matches -- is logged and the process exits with status 1.
     """
     sql = "select answer from " + table_name + " where question = %s;"
     try:
         self.cursor.execute(sql, (question,))
         results = self.cursor.fetchall()
         LOGGER.debug("MYSQL search by question.")
         return results[0][0]
     except Exception as e:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(e, sql))
         sys.exit(1)
Пример #12
0
 def delete_all_data(self, table_name):
     """Delete every row of ``table_name`` and commit.

     ``self.test_connection()`` is called first. ``table_name`` is
     concatenated into the statement as-is, so it must come from a trusted
     source. A failure is logged and the process exits with status 1.
     """
     self.test_connection()
     sql = "".join(('delete from ', table_name, ';'))
     try:
         self.cursor.execute(sql)
         self.conn.commit()
         cleared_msg = "MYSQL delete all data in table:{}".format(table_name)
         LOGGER.debug(cleared_msg)
     except Exception as err:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(err, sql))
         sys.exit(1)
Пример #13
0
 def count_table(self, table_name):
     """Return ``count(milvus_id)`` for ``table_name``.

     ``table_name`` is concatenated into the statement as-is, so it must come
     from a trusted source. A failure is logged and the process exits with
     status 1.
     """
     sql = "".join(("select count(milvus_id) from ", table_name, ";"))
     try:
         self.cursor.execute(sql)
         rows = self.cursor.fetchall()
         LOGGER.debug("MYSQL count table:{}".format(table_name))
         first_row = rows[0]
         return first_row[0]
     except Exception as err:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(err, sql))
         sys.exit(1)
Пример #14
0
 def search_vectors(self, collection_name, vectors, top_k):
     """Search ``collection_name`` for the nearest neighbours of ``vectors``.

     ``self.set_collection(collection_name)`` is called first; the search
     runs on the ``embedding`` field with ``METRIC_TYPE`` and ``nprobe`` 16.

     Args:
         collection_name: Collection to search.
         vectors: Query data handed to ``Collection.search`` unchanged.
         top_k: Passed as ``limit``.

     Returns:
         Whatever ``self.collection.search`` returns.

     Any exception is logged and the process exits with status 1.
     """
     try:
         self.set_collection(collection_name)
         search_params = {"metric_type": METRIC_TYPE, "params": {"nprobe": 16}}
         res = self.collection.search(vectors, anns_field="embedding", param=search_params, limit=top_k)
         LOGGER.debug("Successfully search in collection: {}".format(res))
         return res
     except Exception as e:
         LOGGER.error("Failed to search vectors in Milvus: {}".format(e))
         sys.exit(1)
Пример #15
0
 def search_by_milvus_ids(self, ids, table_name):
     """Return the ``image_path`` values for ``ids``, in the order of ``ids``.

     Args:
         ids: Sequence of Milvus ids. The ids are bound as query parameters
             rather than spliced into the statement.
         table_name: Table to query. Identifiers cannot be bound as
             parameters, so this value is concatenated as-is and must come
             from a trusted source.

     Returns:
         A list with the first column of every matching row; an empty list
         when ``ids`` is empty.

     Any database error is logged and the process exits with status 1.
     """
     ids = list(ids)
     if not ids:
         # "in ()" is not valid SQL and there is nothing to look up anyway.
         return []
     placeholders = ",".join(["%s"] * len(ids))
     sql = ("select image_path from " + table_name + " where milvus_id in (" + placeholders
            + ") order by field (milvus_id," + placeholders + ");")
     try:
         # The id list appears twice in the statement: once for IN, once for FIELD.
         self.cursor.execute(sql, ids + ids)
         results = self.cursor.fetchall()
         results = [res[0] for res in results]
         LOGGER.debug("MYSQL search by milvus id.")
         return results
     except Exception as e:
         LOGGER.error("MYSQL ERROR: {} with sql: {}".format(e, sql))
         sys.exit(1)
Пример #16
0
 def delete_collection(self, collection_name):
     """Drop ``collection_name`` through ``self.client``.

     Returns:
         The status object returned by ``drop_collection`` when its ``code``
         is falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         status = self.client.drop_collection(collection_name=collection_name)
         if status.code:
             raise Exception(status.message)
         dropped_msg = "Successfully drop collection: {}".format(collection_name)
         LOGGER.debug(dropped_msg)
         return status
     except Exception as err:
         LOGGER.error("Failed to drop collection: {}".format(err))
         sys.exit(1)
Пример #17
0
 def insert(self, collection_name, vectors):
     """Insert ``vectors`` into ``collection_name`` and load the collection.

     ``self.create_collection(collection_name)`` is called first; the vectors
     are then inserted as the single column ``[vectors]``.

     Returns:
         ``primary_keys`` of the insert result.

     Any exception is logged and the process exits with status 1.
     """
     try:
         self.create_collection(collection_name)
         insert_result = self.collection.insert([vectors])
         new_ids = insert_result.primary_keys
         self.collection.load()
         inserted_msg = "Insert vectors to Milvus in collection: {} with {} rows".format(
             collection_name, len(vectors))
         LOGGER.debug(inserted_msg)
         return new_ids
     except Exception as err:
         LOGGER.error("Failed to load data to Milvus: {}".format(err))
         sys.exit(1)
Пример #18
0
def do_count(table_name, milvus_cli, mysql_cli):
    """Return the Milvus entity count for ``table_name``.

    A falsy ``table_name`` is replaced by ``DEFAULT_TABLE``. The MySQL row
    count is fetched as well, but only written to the debug log.

    Returns:
        ``None`` when Milvus has no such collection, otherwise the value of
        ``milvus_cli.count``.

    Any exception is logged and the process exits with status 1.
    """
    target = table_name or DEFAULT_TABLE
    try:
        if milvus_cli.has_collection(target):
            milvus_num = milvus_cli.count(target)
            mysql_num = mysql_cli.count_table(target)
            summary = "The num of Milvus: {} and Mysql: {}".format(
                milvus_num, mysql_num)
            LOGGER.debug(summary)
            return milvus_num
        return None
    except Exception as err:
        LOGGER.error(" Error with count table {}".format(err))
        sys.exit(1)
Пример #19
0
 def count(self, collection_name):
     """Return the entity count of ``collection_name`` via ``self.client``.

     Returns:
         The number reported by ``count_entities`` when the status code is
         falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         status, entity_total = self.client.count_entities(
             collection_name=collection_name)
         if status.code:
             raise Exception(status.message)
         counted_msg = "Successfully get the num:{} of the collection:{}".format(
             entity_total, collection_name)
         LOGGER.debug(counted_msg)
         return entity_total
     except Exception as err:
         LOGGER.error("Failed to count vectors in Milvus: {}".format(err))
         sys.exit(1)
Пример #20
0
 def create_index(self, collection_name):
     """Build an ``IVF_SQ8`` index (``nlist`` 16384) on the ``embedding`` field.

     ``self.set_collection(collection_name)`` is called first; the index uses
     ``METRIC_TYPE`` as its metric.

     Returns:
         The status returned by ``create_index`` when its ``code`` is falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         self.set_collection(collection_name)
         default_index = {
             "index_type": "IVF_SQ8",
             "metric_type": METRIC_TYPE,
             "params": {"nlist": 16384},
         }
         status = self.collection.create_index(field_name="embedding",
                                               index_params=default_index)
         if status.code:
             raise Exception(status.message)
         indexed_msg = "Successfully create index in collection:{} with param:{}".format(
             collection_name, default_index)
         LOGGER.debug(indexed_msg)
         return status
     except Exception as err:
         LOGGER.error("Failed to create index: {}".format(err))
         sys.exit(1)
Пример #21
0
 def insert(self, collection_name, vectors):
     """Insert ``vectors`` into ``collection_name`` through ``self.client``.

     ``self.create_colllection(collection_name)`` is called first.

     Returns:
         The ids reported by ``client.insert`` when the status code is falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         self.create_colllection(collection_name)
         status, new_ids = self.client.insert(collection_name=collection_name,
                                              records=vectors)
         if status.code:
             raise Exception(status.message)
         inserted_msg = "Insert vectors to Milvus in collection: {} with {} rows".format(
             collection_name, len(vectors))
         LOGGER.debug(inserted_msg)
         return new_ids
     except Exception as err:
         LOGGER.error("Failed to load data to Milvus: {}".format(err))
         sys.exit(1)
Пример #22
0
 def create_index(self, collection_name):
     """Build an ``IVF_FLAT`` index with ``nlist`` 16384 on ``collection_name``.

     Returns:
         The status returned by ``client.create_index`` when its ``code`` is
         falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         index_param = {'nlist': 16384}
         status = self.client.create_index(
             collection_name, IndexType.IVF_FLAT, index_param)
         if status.code:
             raise Exception(status.message)
         indexed_msg = "Successfully create index in collection:{} with param:{}".format(
             collection_name, index_param)
         LOGGER.debug(indexed_msg)
         return status
     except Exception as err:
         LOGGER.error("Failed to create index: {}".format(err))
         sys.exit(1)
Пример #23
0
 def create_index(self, collection_name, index_params):
     """Build an index on the ``embedding`` field, then load the collection.

     ``self.set_collection(collection_name)`` is called first and
     ``index_params`` is forwarded to ``Collection.create_index`` unchanged.

     Returns:
         The status returned by ``create_index`` when its ``code`` is falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         self.set_collection(collection_name)
         status = self.collection.create_index(
             field_name="embedding", index_params=index_params)
         if status.code:
             raise Exception(status.message)
         self.collection.load()
         indexed_msg = "Successfully create index in collection:{} with param:{}".format(
             collection_name, index_params)
         LOGGER.debug(indexed_msg)
         return status
     except Exception as err:
         LOGGER.error("Failed to create index: {}".format(err))
         sys.exit(1)
Пример #24
0
 def create_colllection(self, collection_name):
     """Create ``collection_name`` through ``self.client`` unless it exists.

     The collection is created with ``VECTOR_DIMENSION``,
     ``INDEX_FILE_SIZE`` and ``METRIC_TYPE``. Nothing happens when
     ``self.has_collection`` already reports the collection.

     Returns:
         ``None``.

     A non-zero status code or any exception is logged and the process
     exits with status 1.
     """
     try:
         if not self.has_collection(collection_name):
             collection_param = {
                 'collection_name': collection_name,
                 'dimension': VECTOR_DIMENSION,
                 'index_file_size': INDEX_FILE_SIZE,
                 'metric_type': METRIC_TYPE
             }
             status = self.client.create_collection(collection_param)
             if status.code != 0:
                 raise Exception(status.message)
             LOGGER.debug(
                 "Create Milvus collection: {}".format(collection_name))
     except Exception as e:
         LOGGER.error("Failed to create collection in Milvus: {}".format(e))
         sys.exit(1)
Пример #25
0
 def search_vectors(self, collection_name, vectors, top_k):
     """Search ``collection_name`` for ``vectors`` with ``nprobe`` 16.

     Returns:
         The result reported by ``client.search`` when the status code is
         falsy.

     A truthy status code or any exception is logged and the process exits
     with status 1.
     """
     try:
         search_param = {'nprobe': 16}
         status, hits = self.client.search(collection_name=collection_name,
                                           query_records=vectors,
                                           top_k=top_k,
                                           params=search_param)
         if status.code:
             raise Exception(status.message)
         searched_msg = "Successfully search in collection: {}".format(collection_name)
         LOGGER.debug(searched_msg)
         return hits
     except Exception as err:
         LOGGER.error("Failed to search vectors in Milvus: {}".format(err))
         sys.exit(1)
Пример #26
0
def send_email(subject: str, message: str) -> None:
    """
    Send a plain-text email from SENDER_EMAIL to RECEIVER_EMAIL through
    Gmail's SMTP-over-SSL endpoint (smtp.gmail.com:465).

    Args:
        subject: Value of the Subject header.
        message: Body of the email.

    An authentication failure is logged and swallowed; any other error
    propagates to the caller.
    """
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = SENDER_EMAIL
    msg["To"] = RECEIVER_EMAIL
    msg.set_content(message)

    try:
        # The context manager closes the connection even when login or
        # sending raises, so a failed attempt does not leak the socket.
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.ehlo()
            server.login(SENDER_EMAIL, PASSWORD)
            server.send_message(msg)
        LOGGER.debug(
            f"Successfully sent email notification to {RECEIVER_EMAIL} from {SENDER_EMAIL}"
        )

    except smtplib.SMTPAuthenticationError as auth_error:
        LOGGER.exception(auth_error.smtp_error)
Пример #27
0
def grab_latest() -> None:
    """
    Grab the latest [LIMIT] posts from subreddits inside [SUBREDDITS]
    and filter them down to just a handful of relevant fields and remove
    posts that have already been parsed in the past.

    The filtered posts are formatted, written to LOGGER_RESULTS and, when
    EMAIL_NOTIFICATIONS is set, sent by email. A subreddit whose response
    cannot be parsed is logged and skipped.
    """
    # TODO: fetch the subreddits concurrently (async requests)
    posts = {}

    for sub in SUBREDDITS:
        url = _get_subreddit_url(sub)
        LOGGER.debug(f"Querying: {url}")
        response = get(url, headers={"User-Agent": USER_AGENT})

        try:
            new_posts = parse_json_response(response.json())

        except Exception:
            LOGGER.exception(
                f"Request to {url} failed with code {response.status_code}: {response.reason}"
            )
            # Move on to the next subreddit; without this, new_posts would be
            # unbound or still hold the previous subreddit's posts.
            continue

        # We got some posts we haven't seen before. Let's filter through them
        if new_posts:
            posts[sub] = filter_results(new_posts, sub)

    subject = format_subject(posts)
    message = format_response(posts)
    if not message:
        LOGGER_RESULTS.info("No new posts.")
        return

    LOGGER_RESULTS.info(f"\n{message}")

    if EMAIL_NOTIFICATIONS:
        send_email(subject, message)
Пример #28
0
from sqlite3 import connect
from config import DB_NAME, ROOT_DIR, IN_MEMORY
from logs import LOGGER
from typing import List, Tuple, Dict

# Open the database: in memory when IN_MEMORY is set, otherwise the file
# ROOT_DIR/DB_NAME. check_same_thread=False allows the connection to be used
# from threads other than the one that created it.
if IN_MEMORY:
    CONNECTION = connect(":memory:", check_same_thread=False)
    LOGGER.debug("Running with in-memory database")
else:
    CONNECTION = connect(f"{ROOT_DIR}/{DB_NAME}", check_same_thread=False)
CURSOR = CONNECTION.cursor()

# Initialize tables and clear posts older than ten days
CURSOR.execute(
    """
    CREATE TABLE IF NOT EXISTS posts (
    id text PRIMARY KEY,
    title text,
    url text,
    subreddit text,
    username text,
    create_date text);
"""
)

CURSOR.execute(
    """
    DELETE FROM posts WHERE create_date <= date('now', '-10 day')
"""
)
# sqlite3 opens an implicit transaction for the DELETE; commit it so the
# cleanup is persisted and no transaction is left open after import.
CONNECTION.commit()
Пример #29
0
        # We got some posts we haven't seen before. Let's filter through them
        if new_posts:
            posts[sub] = filter_results(new_posts, sub)

    subject = format_subject(posts)
    message = format_response(posts)
    if not message:
        LOGGER_RESULTS.info("No new posts.")
        return

    LOGGER_RESULTS.info(f"\n{message}")

    if EMAIL_NOTIFICATIONS:
        send_email(subject, message)


def _get_subreddit_url(subreddit: str) -> str:
    """Return the URL of the ``/new/`` JSON listing of ``subreddit``, limited to ``LIMIT`` entries."""
    listing_url = f"https://www.reddit.com/r/{subreddit}/new/.json?limit={LIMIT}"
    return listing_url


if __name__ == "__main__":
    # Log the effective configuration once, run a first pass right away, then
    # start SCHED (presumably the scheduler for the recurring runs -- confirm).
    startup_banner = (
        f"App started, configured to run every {FREQUENCY * 60} minutes.\n"
        f"Running for subreddits: {SUBREDDITS} with following matching terms:\n"
        f"\tsearch: {SEARCH_TERMS}\n"
        f"\treject: {REJECT_TERMS}\n"
        f"Notifications being sent to {RECEIVER_EMAIL} and stored to 'results.log'."
    )
    LOGGER.debug(startup_banner)
    grab_latest()
    SCHED.start()